koichi12 committed on
Commit
eda6db7
·
verified ·
1 Parent(s): b891f5b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__init__.py +51 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_util.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_windows.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_unix.py +65 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_util.py +47 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_windows.py +65 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc +3 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc +0 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h +100 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h +1083 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc +0 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h +112 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h +690 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h +825 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h +71 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py +0 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h +65 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h +659 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h +642 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp +1546 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h +373 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h +148 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h +81 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h +65 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h +60 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.h +123 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h +281 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.h +175 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h +93 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h +697 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h +0 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h +127 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h +194 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h +0 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h +164 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h +214 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h +220 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/WHEEL +5 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/top_level.txt +1 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__init__.py +4 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport2.cpython-311.pyc +0 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyxbuild.cpython-311.pyc +0 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/_pyximport3.py +478 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.py +30 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_classes.py +55 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_deploy.py +105 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_linalg_utils.py +164 -0
.gitattributes CHANGED
@@ -62,3 +62,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__py
62
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
63
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
64
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
 
62
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
63
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
64
  tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
65
+ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
66
+ tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__init__.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A platform independent file lock that supports the with-statement.
3
+
4
+ .. autodata:: filelock.__version__
5
+ :no-value:
6
+
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import sys
11
+ import warnings
12
+ from typing import TYPE_CHECKING
13
+
14
+ from ._api import AcquireReturnProxy, BaseFileLock
15
+ from ._error import Timeout
16
+ from ._soft import SoftFileLock
17
+ from ._unix import UnixFileLock, has_fcntl
18
+ from ._windows import WindowsFileLock
19
+ from .version import version
20
+
21
+ #: version of the project as a string
22
+ __version__: str = version
23
+
24
+
25
+ if sys.platform == "win32": # pragma: win32 cover
26
+ _FileLock: type[BaseFileLock] = WindowsFileLock
27
+ else: # pragma: win32 no cover # noqa: PLR5501
28
+ if has_fcntl:
29
+ _FileLock: type[BaseFileLock] = UnixFileLock
30
+ else:
31
+ _FileLock = SoftFileLock
32
+ if warnings is not None:
33
+ warnings.warn("only soft file lock is available", stacklevel=2)
34
+
35
+ if TYPE_CHECKING:
36
+ FileLock = SoftFileLock
37
+ else:
38
+ #: Alias for the lock, which should be used for the current platform.
39
+ FileLock = _FileLock
40
+
41
+
42
+ __all__ = [
43
+ "__version__",
44
+ "FileLock",
45
+ "SoftFileLock",
46
+ "Timeout",
47
+ "UnixFileLock",
48
+ "WindowsFileLock",
49
+ "BaseFileLock",
50
+ "AcquireReturnProxy",
51
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_util.cpython-311.pyc ADDED
Binary file (2.2 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_windows.cpython-311.pyc ADDED
Binary file (3.68 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_unix.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import suppress
6
+ from errno import ENOSYS
7
+ from typing import cast
8
+
9
+ from ._api import BaseFileLock
10
+ from ._util import ensure_directory_exists
11
+
12
+ #: a flag to indicate if the fcntl API is available
13
+ has_fcntl = False
14
+ if sys.platform == "win32": # pragma: win32 cover
15
+
16
+ class UnixFileLock(BaseFileLock):
17
+ """Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
18
+
19
+ def _acquire(self) -> None:
20
+ raise NotImplementedError
21
+
22
+ def _release(self) -> None:
23
+ raise NotImplementedError
24
+
25
+ else: # pragma: win32 no cover
26
+ try:
27
+ import fcntl
28
+ except ImportError:
29
+ pass
30
+ else:
31
+ has_fcntl = True
32
+
33
+ class UnixFileLock(BaseFileLock):
34
+ """Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
35
+
36
+ def _acquire(self) -> None:
37
+ ensure_directory_exists(self.lock_file)
38
+ open_flags = os.O_RDWR | os.O_CREAT | os.O_TRUNC
39
+ fd = os.open(self.lock_file, open_flags, self._context.mode)
40
+ with suppress(PermissionError): # This lock is not owned by this UID
41
+ os.fchmod(fd, self._context.mode)
42
+ try:
43
+ fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
44
+ except OSError as exception:
45
+ os.close(fd)
46
+ if exception.errno == ENOSYS: # NotImplemented error
47
+ msg = "FileSystem does not appear to support flock; user SoftFileLock instead"
48
+ raise NotImplementedError(msg) from exception
49
+ else:
50
+ self._context.lock_file_fd = fd
51
+
52
+ def _release(self) -> None:
53
+ # Do not remove the lockfile:
54
+ # https://github.com/tox-dev/py-filelock/issues/31
55
+ # https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
56
+ fd = cast(int, self._context.lock_file_fd)
57
+ self._context.lock_file_fd = None
58
+ fcntl.flock(fd, fcntl.LOCK_UN)
59
+ os.close(fd)
60
+
61
+
62
+ __all__ = [
63
+ "has_fcntl",
64
+ "UnixFileLock",
65
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_util.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import stat
5
+ import sys
6
+ from errno import EACCES, EISDIR
7
+ from pathlib import Path
8
+
9
+
10
+ def raise_on_not_writable_file(filename: str) -> None:
11
+ """
12
+ Raise an exception if attempting to open the file for writing would fail.
13
+ This is done so files that will never be writable can be separated from
14
+ files that are writable but currently locked
15
+ :param filename: file to check
16
+ :raises OSError: as if the file was opened for writing.
17
+ """
18
+ try: # use stat to do exists + can write to check without race condition
19
+ file_stat = os.stat(filename) # noqa: PTH116
20
+ except OSError:
21
+ return # swallow does not exist or other errors
22
+
23
+ if file_stat.st_mtime != 0: # if os.stat returns but modification is zero that's an invalid os.stat - ignore it
24
+ if not (file_stat.st_mode & stat.S_IWUSR):
25
+ raise PermissionError(EACCES, "Permission denied", filename)
26
+
27
+ if stat.S_ISDIR(file_stat.st_mode):
28
+ if sys.platform == "win32": # pragma: win32 cover
29
+ # On Windows, this is PermissionError
30
+ raise PermissionError(EACCES, "Permission denied", filename)
31
+ else: # pragma: win32 no cover # noqa: RET506
32
+ # On linux / macOS, this is IsADirectoryError
33
+ raise IsADirectoryError(EISDIR, "Is a directory", filename)
34
+
35
+
36
+ def ensure_directory_exists(filename: Path | str) -> None:
37
+ """
38
+ Ensure the directory containing the file exists (create it if necessary)
39
+ :param filename: file.
40
+ """
41
+ Path(filename).parent.mkdir(parents=True, exist_ok=True)
42
+
43
+
44
+ __all__ = [
45
+ "raise_on_not_writable_file",
46
+ "ensure_directory_exists",
47
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_windows.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import suppress
6
+ from errno import EACCES
7
+ from pathlib import Path
8
+ from typing import cast
9
+
10
+ from ._api import BaseFileLock
11
+ from ._util import ensure_directory_exists, raise_on_not_writable_file
12
+
13
+ if sys.platform == "win32": # pragma: win32 cover
14
+ import msvcrt
15
+
16
+ class WindowsFileLock(BaseFileLock):
17
+ """Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
18
+
19
+ def _acquire(self) -> None:
20
+ raise_on_not_writable_file(self.lock_file)
21
+ ensure_directory_exists(self.lock_file)
22
+ flags = (
23
+ os.O_RDWR # open for read and write
24
+ | os.O_CREAT # create file if not exists
25
+ | os.O_TRUNC # truncate file if not empty
26
+ )
27
+ try:
28
+ fd = os.open(self.lock_file, flags, self._context.mode)
29
+ except OSError as exception:
30
+ if exception.errno != EACCES: # has no access to this lock
31
+ raise
32
+ else:
33
+ try:
34
+ msvcrt.locking(fd, msvcrt.LK_NBLCK, 1)
35
+ except OSError as exception:
36
+ os.close(fd) # close file first
37
+ if exception.errno != EACCES: # file is already locked
38
+ raise
39
+ else:
40
+ self._context.lock_file_fd = fd
41
+
42
+ def _release(self) -> None:
43
+ fd = cast(int, self._context.lock_file_fd)
44
+ self._context.lock_file_fd = None
45
+ msvcrt.locking(fd, msvcrt.LK_UNLCK, 1)
46
+ os.close(fd)
47
+
48
+ with suppress(OSError): # Probably another instance of the application had acquired the file lock.
49
+ Path(self.lock_file).unlink()
50
+
51
+ else: # pragma: win32 no cover
52
+
53
+ class WindowsFileLock(BaseFileLock):
54
+ """Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
55
+
56
+ def _acquire(self) -> None:
57
+ raise NotImplementedError
58
+
59
+ def _release(self) -> None:
60
+ raise NotImplementedError
61
+
62
+
63
+ __all__ = [
64
+ "WindowsFileLock",
65
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fac5cd5bfbd06bb4a9b6ca2c30c684bea761aa5b6dbe0c019ed92f1f4a7d8143
3
+ size 142559
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #include <cuda_stdint.h>
51
+ #include "Openmp/omp-tools.h"
52
+
53
+ #if !defined(_CUPTI_OPENMP_H_)
54
+ #define _CUPTI_OPENMP_H_
55
+
56
+ #ifndef CUPTIAPI
57
+ #ifdef _WIN32
58
+ #define CUPTIAPI __stdcall
59
+ #else
60
+ #define CUPTIAPI
61
+ #endif
62
+ #endif
63
+
64
+ #if defined(__LP64__)
65
+ #define CUPTILP64 1
66
+ #elif defined(_WIN64)
67
+ #define CUPTILP64 1
68
+ #else
69
+ #undef CUPTILP64
70
+ #endif
71
+
72
+ #if defined(__cplusplus)
73
+ extern "C" {
74
+ #endif
75
+
76
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
77
+ #pragma GCC visibility push(default)
78
+ #endif
79
+
80
+ /**
81
+ * \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0)
82
+ *
83
+ */
84
+ int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
85
+
86
+ /**
87
+ * \brief Initialize OPENMP support
88
+ *
89
+ */
90
+ int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
91
+
92
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
93
+ #pragma GCC visibility pop
94
+ #endif
95
+
96
+ #if defined(__cplusplus)
97
+ }
98
+ #endif
99
+
100
+ #endif /*_CUPTI_OPENMP_H_*/
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h ADDED
@@ -0,0 +1,1083 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * include/50/omp-tools.h.var
3
+ */
4
+
5
+ //===----------------------------------------------------------------------===//
6
+ //
7
+ // The LLVM Compiler Infrastructure
8
+ //
9
+ // This file is dual licensed under the MIT and the University of Illinois Open
10
+ // Source Licenses. See LICENSE.txt for details.
11
+ //
12
+ //===----------------------------------------------------------------------===//
13
+
14
+ #ifndef __OMPT__
15
+ #define __OMPT__
16
+
17
+ /*****************************************************************************
18
+ * system include files
19
+ *****************************************************************************/
20
+
21
+ #include <stdint.h>
22
+ #include <stddef.h>
23
+
24
+ /*****************************************************************************
25
+ * iteration macros
26
+ *****************************************************************************/
27
+
28
+ #define FOREACH_OMPT_INQUIRY_FN(macro) \
29
+ macro (ompt_enumerate_states) \
30
+ macro (ompt_enumerate_mutex_impls) \
31
+ \
32
+ macro (ompt_set_callback) \
33
+ macro (ompt_get_callback) \
34
+ \
35
+ macro (ompt_get_state) \
36
+ \
37
+ macro (ompt_get_parallel_info) \
38
+ macro (ompt_get_task_info) \
39
+ macro (ompt_get_task_memory) \
40
+ macro (ompt_get_thread_data) \
41
+ macro (ompt_get_unique_id) \
42
+ macro (ompt_finalize_tool) \
43
+ \
44
+ macro(ompt_get_num_procs) \
45
+ macro(ompt_get_num_places) \
46
+ macro(ompt_get_place_proc_ids) \
47
+ macro(ompt_get_place_num) \
48
+ macro(ompt_get_partition_place_nums) \
49
+ macro(ompt_get_proc_id) \
50
+ \
51
+ macro(ompt_get_target_info) \
52
+ macro(ompt_get_num_devices)
53
+
54
+ #define FOREACH_OMPT_STATE(macro) \
55
+ \
56
+ /* first available state */ \
57
+ macro (ompt_state_undefined, 0x102) /* undefined thread state */ \
58
+ \
59
+ /* work states (0..15) */ \
60
+ macro (ompt_state_work_serial, 0x000) /* working outside parallel */ \
61
+ macro (ompt_state_work_parallel, 0x001) /* working within parallel */ \
62
+ macro (ompt_state_work_reduction, 0x002) /* performing a reduction */ \
63
+ \
64
+ /* barrier wait states (16..31) */ \
65
+ macro (ompt_state_wait_barrier, 0x010) /* waiting at a barrier */ \
66
+ macro (ompt_state_wait_barrier_implicit_parallel, 0x011) \
67
+ /* implicit barrier at the end of parallel region */\
68
+ macro (ompt_state_wait_barrier_implicit_workshare, 0x012) \
69
+ /* implicit barrier at the end of worksharing */ \
70
+ macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \
71
+ macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \
72
+ \
73
+ /* task wait states (32..63) */ \
74
+ macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \
75
+ macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \
76
+ \
77
+ /* mutex wait states (64..127) */ \
78
+ macro (ompt_state_wait_mutex, 0x040) \
79
+ macro (ompt_state_wait_lock, 0x041) /* waiting for lock */ \
80
+ macro (ompt_state_wait_critical, 0x042) /* waiting for critical */ \
81
+ macro (ompt_state_wait_atomic, 0x043) /* waiting for atomic */ \
82
+ macro (ompt_state_wait_ordered, 0x044) /* waiting for ordered */ \
83
+ \
84
+ /* target wait states (128..255) */ \
85
+ macro (ompt_state_wait_target, 0x080) /* waiting for target region */ \
86
+ macro (ompt_state_wait_target_map, 0x081) /* waiting for target data mapping operation */ \
87
+ macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */ \
88
+ \
89
+ /* misc (256..511) */ \
90
+ macro (ompt_state_idle, 0x100) /* waiting for work */ \
91
+ macro (ompt_state_overhead, 0x101) /* overhead excluding wait states */ \
92
+ \
93
+ /* implementation-specific states (512..) */
94
+
95
+
96
+ #define FOREACH_KMP_MUTEX_IMPL(macro) \
97
+ macro (kmp_mutex_impl_none, 0) /* unknown implementation */ \
98
+ macro (kmp_mutex_impl_spin, 1) /* based on spin */ \
99
+ macro (kmp_mutex_impl_queuing, 2) /* based on some fair policy */ \
100
+ macro (kmp_mutex_impl_speculative, 3) /* based on HW-supported speculation */
101
+
102
+ #define FOREACH_OMPT_EVENT(macro) \
103
+ \
104
+ /*--- Mandatory Events ---*/ \
105
+ macro (ompt_callback_thread_begin, ompt_callback_thread_begin_t, 1) /* thread begin */ \
106
+ macro (ompt_callback_thread_end, ompt_callback_thread_end_t, 2) /* thread end */ \
107
+ \
108
+ macro (ompt_callback_parallel_begin, ompt_callback_parallel_begin_t, 3) /* parallel begin */ \
109
+ macro (ompt_callback_parallel_end, ompt_callback_parallel_end_t, 4) /* parallel end */ \
110
+ \
111
+ macro (ompt_callback_task_create, ompt_callback_task_create_t, 5) /* task begin */ \
112
+ macro (ompt_callback_task_schedule, ompt_callback_task_schedule_t, 6) /* task schedule */ \
113
+ macro (ompt_callback_implicit_task, ompt_callback_implicit_task_t, 7) /* implicit task */ \
114
+ \
115
+ macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \
116
+ macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \
117
+ macro (ompt_callback_target_submit, ompt_callback_target_submit_t, 10) /* target submit */ \
118
+ \
119
+ macro (ompt_callback_control_tool, ompt_callback_control_tool_t, 11) /* control tool */ \
120
+ \
121
+ macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \
122
+ macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \
123
+ \
124
+ macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \
125
+ macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */ \
126
+ \
127
+ /* Optional Events */ \
128
+ macro (ompt_callback_sync_region_wait, ompt_callback_sync_region_t, 16) /* sync region wait begin or end */ \
129
+ \
130
+ macro (ompt_callback_mutex_released, ompt_callback_mutex_t, 17) /* mutex released */ \
131
+ \
132
+ macro (ompt_callback_dependences, ompt_callback_dependences_t, 18) /* report task dependences */ \
133
+ macro (ompt_callback_task_dependence, ompt_callback_task_dependence_t, 19) /* report task dependence */ \
134
+ \
135
+ macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \
136
+ \
137
+ macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \
138
+ \
139
+ macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \
140
+ \
141
+ macro (ompt_callback_sync_region, ompt_callback_sync_region_t, 23) /* sync region begin or end */ \
142
+ \
143
+ macro (ompt_callback_lock_init, ompt_callback_mutex_acquire_t, 24) /* lock init */ \
144
+ macro (ompt_callback_lock_destroy, ompt_callback_mutex_t, 25) /* lock destroy */ \
145
+ \
146
+ macro (ompt_callback_mutex_acquire, ompt_callback_mutex_acquire_t, 26) /* mutex acquire */ \
147
+ macro (ompt_callback_mutex_acquired, ompt_callback_mutex_t, 27) /* mutex acquired */ \
148
+ \
149
+ macro (ompt_callback_nest_lock, ompt_callback_nest_lock_t, 28) /* nest lock */ \
150
+ \
151
+ macro (ompt_callback_flush, ompt_callback_flush_t, 29) /* after executing flush */ \
152
+ \
153
+ macro (ompt_callback_cancel, ompt_callback_cancel_t, 30) /* cancel innermost binding region */ \
154
+ \
155
+ macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \
156
+ \
157
+ macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */
158
+
159
+ /*****************************************************************************
160
+ * implementation specific types
161
+ *****************************************************************************/
162
+
163
+ typedef enum kmp_mutex_impl_t {
164
+ #define kmp_mutex_impl_macro(impl, code) impl = code,
165
+ FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
166
+ #undef kmp_mutex_impl_macro
167
+ } kmp_mutex_impl_t;
168
+
169
+ /*****************************************************************************
170
+ * definitions generated from spec
171
+ *****************************************************************************/
172
+
173
+ typedef enum ompt_callbacks_t {
174
+ ompt_callback_thread_begin = 1,
175
+ ompt_callback_thread_end = 2,
176
+ ompt_callback_parallel_begin = 3,
177
+ ompt_callback_parallel_end = 4,
178
+ ompt_callback_task_create = 5,
179
+ ompt_callback_task_schedule = 6,
180
+ ompt_callback_implicit_task = 7,
181
+ ompt_callback_target = 8,
182
+ ompt_callback_target_data_op = 9,
183
+ ompt_callback_target_submit = 10,
184
+ ompt_callback_control_tool = 11,
185
+ ompt_callback_device_initialize = 12,
186
+ ompt_callback_device_finalize = 13,
187
+ ompt_callback_device_load = 14,
188
+ ompt_callback_device_unload = 15,
189
+ ompt_callback_sync_region_wait = 16,
190
+ ompt_callback_mutex_released = 17,
191
+ ompt_callback_dependences = 18,
192
+ ompt_callback_task_dependence = 19,
193
+ ompt_callback_work = 20,
194
+ ompt_callback_master = 21,
195
+ ompt_callback_target_map = 22,
196
+ ompt_callback_sync_region = 23,
197
+ ompt_callback_lock_init = 24,
198
+ ompt_callback_lock_destroy = 25,
199
+ ompt_callback_mutex_acquire = 26,
200
+ ompt_callback_mutex_acquired = 27,
201
+ ompt_callback_nest_lock = 28,
202
+ ompt_callback_flush = 29,
203
+ ompt_callback_cancel = 30,
204
+ ompt_callback_reduction = 31,
205
+ ompt_callback_dispatch = 32
206
+ } ompt_callbacks_t;
207
+
208
+ typedef enum ompt_record_t {
209
+ ompt_record_ompt = 1,
210
+ ompt_record_native = 2,
211
+ ompt_record_invalid = 3
212
+ } ompt_record_t;
213
+
214
+ typedef enum ompt_record_native_t {
215
+ ompt_record_native_info = 1,
216
+ ompt_record_native_event = 2
217
+ } ompt_record_native_t;
218
+
219
+ typedef enum ompt_set_result_t {
220
+ ompt_set_error = 0,
221
+ ompt_set_never = 1,
222
+ ompt_set_impossible = 2,
223
+ ompt_set_sometimes = 3,
224
+ ompt_set_sometimes_paired = 4,
225
+ ompt_set_always = 5
226
+ } ompt_set_result_t;
227
+
228
+ typedef uint64_t ompt_id_t;
229
+
230
+ typedef uint64_t ompt_device_time_t;
231
+
232
+ typedef uint64_t ompt_buffer_cursor_t;
233
+
234
+ typedef enum ompt_thread_t {
235
+ ompt_thread_initial = 1,
236
+ ompt_thread_worker = 2,
237
+ ompt_thread_other = 3,
238
+ ompt_thread_unknown = 4
239
+ } ompt_thread_t;
240
+
241
+ typedef enum ompt_scope_endpoint_t {
242
+ ompt_scope_begin = 1,
243
+ ompt_scope_end = 2
244
+ } ompt_scope_endpoint_t;
245
+
246
+ typedef enum ompt_dispatch_t {
247
+ ompt_dispatch_iteration = 1,
248
+ ompt_dispatch_section = 2
249
+ } ompt_dispatch_t;
250
+
251
+ typedef enum ompt_sync_region_t {
252
+ ompt_sync_region_barrier = 1,
253
+ ompt_sync_region_barrier_implicit = 2,
254
+ ompt_sync_region_barrier_explicit = 3,
255
+ ompt_sync_region_barrier_implementation = 4,
256
+ ompt_sync_region_taskwait = 5,
257
+ ompt_sync_region_taskgroup = 6,
258
+ ompt_sync_region_reduction = 7
259
+ } ompt_sync_region_t;
260
+
261
+ typedef enum ompt_target_data_op_t {
262
+ ompt_target_data_alloc = 1,
263
+ ompt_target_data_transfer_to_device = 2,
264
+ ompt_target_data_transfer_from_device = 3,
265
+ ompt_target_data_delete = 4,
266
+ ompt_target_data_associate = 5,
267
+ ompt_target_data_disassociate = 6
268
+ } ompt_target_data_op_t;
269
+
270
+ typedef enum ompt_work_t {
271
+ ompt_work_loop = 1,
272
+ ompt_work_sections = 2,
273
+ ompt_work_single_executor = 3,
274
+ ompt_work_single_other = 4,
275
+ ompt_work_workshare = 5,
276
+ ompt_work_distribute = 6,
277
+ ompt_work_taskloop = 7
278
+ } ompt_work_t;
279
+
280
+ typedef enum ompt_mutex_t {
281
+ ompt_mutex_lock = 1,
282
+ ompt_mutex_test_lock = 2,
283
+ ompt_mutex_nest_lock = 3,
284
+ ompt_mutex_test_nest_lock = 4,
285
+ ompt_mutex_critical = 5,
286
+ ompt_mutex_atomic = 6,
287
+ ompt_mutex_ordered = 7
288
+ } ompt_mutex_t;
289
+
290
+ typedef enum ompt_native_mon_flag_t {
291
+ ompt_native_data_motion_explicit = 0x01,
292
+ ompt_native_data_motion_implicit = 0x02,
293
+ ompt_native_kernel_invocation = 0x04,
294
+ ompt_native_kernel_execution = 0x08,
295
+ ompt_native_driver = 0x10,
296
+ ompt_native_runtime = 0x20,
297
+ ompt_native_overhead = 0x40,
298
+ ompt_native_idleness = 0x80
299
+ } ompt_native_mon_flag_t;
300
+
301
+ typedef enum ompt_task_flag_t {
302
+ ompt_task_initial = 0x00000001,
303
+ ompt_task_implicit = 0x00000002,
304
+ ompt_task_explicit = 0x00000004,
305
+ ompt_task_target = 0x00000008,
306
+ ompt_task_undeferred = 0x08000000,
307
+ ompt_task_untied = 0x10000000,
308
+ ompt_task_final = 0x20000000,
309
+ ompt_task_mergeable = 0x40000000,
310
+ ompt_task_merged = 0x80000000
311
+ } ompt_task_flag_t;
312
+
313
+ typedef enum ompt_task_status_t {
314
+ ompt_task_complete = 1,
315
+ ompt_task_yield = 2,
316
+ ompt_task_cancel = 3,
317
+ ompt_task_detach = 4,
318
+ ompt_task_early_fulfill = 5,
319
+ ompt_task_late_fulfill = 6,
320
+ ompt_task_switch = 7
321
+ } ompt_task_status_t;
322
+
323
+ typedef enum ompt_target_t {
324
+ ompt_target = 1,
325
+ ompt_target_enter_data = 2,
326
+ ompt_target_exit_data = 3,
327
+ ompt_target_update = 4
328
+ } ompt_target_t;
329
+
330
+ typedef enum ompt_parallel_flag_t {
331
+ ompt_parallel_invoker_program = 0x00000001,
332
+ ompt_parallel_invoker_runtime = 0x00000002,
333
+ ompt_parallel_league = 0x40000000,
334
+ ompt_parallel_team = 0x80000000
335
+ } ompt_parallel_flag_t;
336
+
337
+ typedef enum ompt_target_map_flag_t {
338
+ ompt_target_map_flag_to = 0x01,
339
+ ompt_target_map_flag_from = 0x02,
340
+ ompt_target_map_flag_alloc = 0x04,
341
+ ompt_target_map_flag_release = 0x08,
342
+ ompt_target_map_flag_delete = 0x10,
343
+ ompt_target_map_flag_implicit = 0x20
344
+ } ompt_target_map_flag_t;
345
+
346
+ typedef enum ompt_dependence_type_t {
347
+ ompt_dependence_type_in = 1,
348
+ ompt_dependence_type_out = 2,
349
+ ompt_dependence_type_inout = 3,
350
+ ompt_dependence_type_mutexinoutset = 4,
351
+ ompt_dependence_type_source = 5,
352
+ ompt_dependence_type_sink = 6
353
+ } ompt_dependence_type_t;
354
+
355
+ typedef enum ompt_cancel_flag_t {
356
+ ompt_cancel_parallel = 0x01,
357
+ ompt_cancel_sections = 0x02,
358
+ ompt_cancel_loop = 0x04,
359
+ ompt_cancel_taskgroup = 0x08,
360
+ ompt_cancel_activated = 0x10,
361
+ ompt_cancel_detected = 0x20,
362
+ ompt_cancel_discarded_task = 0x40
363
+ } ompt_cancel_flag_t;
364
+
365
+ typedef uint64_t ompt_hwid_t;
366
+
367
+ typedef uint64_t ompt_wait_id_t;
368
+
369
+ typedef enum ompt_frame_flag_t {
370
+ ompt_frame_runtime = 0x00,
371
+ ompt_frame_application = 0x01,
372
+ ompt_frame_cfa = 0x10,
373
+ ompt_frame_framepointer = 0x20,
374
+ ompt_frame_stackaddress = 0x30
375
+ } ompt_frame_flag_t;
376
+
377
+ typedef enum ompt_state_t {
378
+ ompt_state_work_serial = 0x000,
379
+ ompt_state_work_parallel = 0x001,
380
+ ompt_state_work_reduction = 0x002,
381
+
382
+ ompt_state_wait_barrier = 0x010,
383
+ ompt_state_wait_barrier_implicit_parallel = 0x011,
384
+ ompt_state_wait_barrier_implicit_workshare = 0x012,
385
+ ompt_state_wait_barrier_implicit = 0x013,
386
+ ompt_state_wait_barrier_explicit = 0x014,
387
+
388
+ ompt_state_wait_taskwait = 0x020,
389
+ ompt_state_wait_taskgroup = 0x021,
390
+
391
+ ompt_state_wait_mutex = 0x040,
392
+ ompt_state_wait_lock = 0x041,
393
+ ompt_state_wait_critical = 0x042,
394
+ ompt_state_wait_atomic = 0x043,
395
+ ompt_state_wait_ordered = 0x044,
396
+
397
+ ompt_state_wait_target = 0x080,
398
+ ompt_state_wait_target_map = 0x081,
399
+ ompt_state_wait_target_update = 0x082,
400
+
401
+ ompt_state_idle = 0x100,
402
+ ompt_state_overhead = 0x101,
403
+ ompt_state_undefined = 0x102
404
+ } ompt_state_t;
405
+
406
+ typedef uint64_t (*ompt_get_unique_id_t) (void);
407
+
408
+ typedef uint64_t ompd_size_t;
409
+
410
+ typedef uint64_t ompd_wait_id_t;
411
+
412
+ typedef uint64_t ompd_addr_t;
413
+ typedef int64_t ompd_word_t;
414
+ typedef uint64_t ompd_seg_t;
415
+
416
+ typedef uint64_t ompd_device_t;
417
+
418
+ typedef uint64_t ompd_thread_id_t;
419
+
420
+ typedef enum ompd_scope_t {
421
+ ompd_scope_global = 1,
422
+ ompd_scope_address_space = 2,
423
+ ompd_scope_thread = 3,
424
+ ompd_scope_parallel = 4,
425
+ ompd_scope_implicit_task = 5,
426
+ ompd_scope_task = 6
427
+ } ompd_scope_t;
428
+
429
+ typedef uint64_t ompd_icv_id_t;
430
+
431
+ typedef enum ompd_rc_t {
432
+ ompd_rc_ok = 0,
433
+ ompd_rc_unavailable = 1,
434
+ ompd_rc_stale_handle = 2,
435
+ ompd_rc_bad_input = 3,
436
+ ompd_rc_error = 4,
437
+ ompd_rc_unsupported = 5,
438
+ ompd_rc_needs_state_tracking = 6,
439
+ ompd_rc_incompatible = 7,
440
+ ompd_rc_device_read_error = 8,
441
+ ompd_rc_device_write_error = 9,
442
+ ompd_rc_nomem = 10,
443
+ } ompd_rc_t;
444
+
445
+ typedef void (*ompt_interface_fn_t) (void);
446
+
447
+ typedef ompt_interface_fn_t (*ompt_function_lookup_t) (
448
+ const char *interface_function_name
449
+ );
450
+
451
+ typedef union ompt_data_t {
452
+ uint64_t value;
453
+ void *ptr;
454
+ } ompt_data_t;
455
+
456
+ typedef struct ompt_frame_t {
457
+ ompt_data_t exit_frame;
458
+ ompt_data_t enter_frame;
459
+ int exit_frame_flags;
460
+ int enter_frame_flags;
461
+ } ompt_frame_t;
462
+
463
+ typedef void (*ompt_callback_t) (void);
464
+
465
+ typedef void ompt_device_t;
466
+
467
+ typedef void ompt_buffer_t;
468
+
469
+ typedef void (*ompt_callback_buffer_request_t) (
470
+ int device_num,
471
+ ompt_buffer_t **buffer,
472
+ size_t *bytes
473
+ );
474
+
475
+ typedef void (*ompt_callback_buffer_complete_t) (
476
+ int device_num,
477
+ ompt_buffer_t *buffer,
478
+ size_t bytes,
479
+ ompt_buffer_cursor_t begin,
480
+ int buffer_owned
481
+ );
482
+
483
+ typedef void (*ompt_finalize_t) (
484
+ ompt_data_t *tool_data
485
+ );
486
+
487
+ typedef int (*ompt_initialize_t) (
488
+ ompt_function_lookup_t lookup,
489
+ int initial_device_num,
490
+ ompt_data_t *tool_data
491
+ );
492
+
493
+ typedef struct ompt_start_tool_result_t {
494
+ ompt_initialize_t initialize;
495
+ ompt_finalize_t finalize;
496
+ ompt_data_t tool_data;
497
+ } ompt_start_tool_result_t;
498
+
499
+ typedef struct ompt_record_abstract_t {
500
+ ompt_record_native_t rclass;
501
+ const char *type;
502
+ ompt_device_time_t start_time;
503
+ ompt_device_time_t end_time;
504
+ ompt_hwid_t hwid;
505
+ } ompt_record_abstract_t;
506
+
507
+ typedef struct ompt_dependence_t {
508
+ ompt_data_t variable;
509
+ ompt_dependence_type_t dependence_type;
510
+ } ompt_dependence_t;
511
+
512
+ typedef int (*ompt_enumerate_states_t) (
513
+ int current_state,
514
+ int *next_state,
515
+ const char **next_state_name
516
+ );
517
+
518
+ typedef int (*ompt_enumerate_mutex_impls_t) (
519
+ int current_impl,
520
+ int *next_impl,
521
+ const char **next_impl_name
522
+ );
523
+
524
+ typedef ompt_set_result_t (*ompt_set_callback_t) (
525
+ ompt_callbacks_t event,
526
+ ompt_callback_t callback
527
+ );
528
+
529
+ typedef int (*ompt_get_callback_t) (
530
+ ompt_callbacks_t event,
531
+ ompt_callback_t *callback
532
+ );
533
+
534
+ typedef ompt_data_t *(*ompt_get_thread_data_t) (void);
535
+
536
+ typedef int (*ompt_get_num_procs_t) (void);
537
+
538
+ typedef int (*ompt_get_num_places_t) (void);
539
+
540
+ typedef int (*ompt_get_place_proc_ids_t) (
541
+ int place_num,
542
+ int ids_size,
543
+ int *ids
544
+ );
545
+
546
+ typedef int (*ompt_get_place_num_t) (void);
547
+
548
+ typedef int (*ompt_get_partition_place_nums_t) (
549
+ int place_nums_size,
550
+ int *place_nums
551
+ );
552
+
553
+ typedef int (*ompt_get_proc_id_t) (void);
554
+
555
+ typedef int (*ompt_get_state_t) (
556
+ ompt_wait_id_t *wait_id
557
+ );
558
+
559
+ typedef int (*ompt_get_parallel_info_t) (
560
+ int ancestor_level,
561
+ ompt_data_t **parallel_data,
562
+ int *team_size
563
+ );
564
+
565
+ typedef int (*ompt_get_task_info_t) (
566
+ int ancestor_level,
567
+ int *flags,
568
+ ompt_data_t **task_data,
569
+ ompt_frame_t **task_frame,
570
+ ompt_data_t **parallel_data,
571
+ int *thread_num
572
+ );
573
+
574
+ typedef int (*ompt_get_task_memory_t)(
575
+ void **addr,
576
+ size_t *size,
577
+ int block
578
+ );
579
+
580
+ typedef int (*ompt_get_target_info_t) (
581
+ uint64_t *device_num,
582
+ ompt_id_t *target_id,
583
+ ompt_id_t *host_op_id
584
+ );
585
+
586
+ typedef int (*ompt_get_num_devices_t) (void);
587
+
588
+ typedef void (*ompt_finalize_tool_t) (void);
589
+
590
+ typedef int (*ompt_get_device_num_procs_t) (
591
+ ompt_device_t *device
592
+ );
593
+
594
+ typedef ompt_device_time_t (*ompt_get_device_time_t) (
595
+ ompt_device_t *device
596
+ );
597
+
598
+ typedef double (*ompt_translate_time_t) (
599
+ ompt_device_t *device,
600
+ ompt_device_time_t time
601
+ );
602
+
603
+ typedef ompt_set_result_t (*ompt_set_trace_ompt_t) (
604
+ ompt_device_t *device,
605
+ unsigned int enable,
606
+ unsigned int etype
607
+ );
608
+
609
+ typedef ompt_set_result_t (*ompt_set_trace_native_t) (
610
+ ompt_device_t *device,
611
+ int enable,
612
+ int flags
613
+ );
614
+
615
+ typedef int (*ompt_start_trace_t) (
616
+ ompt_device_t *device,
617
+ ompt_callback_buffer_request_t request,
618
+ ompt_callback_buffer_complete_t complete
619
+ );
620
+
621
+ typedef int (*ompt_pause_trace_t) (
622
+ ompt_device_t *device,
623
+ int begin_pause
624
+ );
625
+
626
+ typedef int (*ompt_flush_trace_t) (
627
+ ompt_device_t *device
628
+ );
629
+
630
+ typedef int (*ompt_stop_trace_t) (
631
+ ompt_device_t *device
632
+ );
633
+
634
+ typedef int (*ompt_advance_buffer_cursor_t) (
635
+ ompt_device_t *device,
636
+ ompt_buffer_t *buffer,
637
+ size_t size,
638
+ ompt_buffer_cursor_t current,
639
+ ompt_buffer_cursor_t *next
640
+ );
641
+
642
+ typedef ompt_record_t (*ompt_get_record_type_t) (
643
+ ompt_buffer_t *buffer,
644
+ ompt_buffer_cursor_t current
645
+ );
646
+
647
+ typedef void *(*ompt_get_record_native_t) (
648
+ ompt_buffer_t *buffer,
649
+ ompt_buffer_cursor_t current,
650
+ ompt_id_t *host_op_id
651
+ );
652
+
653
+ typedef ompt_record_abstract_t *
654
+ (*ompt_get_record_abstract_t) (
655
+ void *native_record
656
+ );
657
+
658
+ typedef void (*ompt_callback_thread_begin_t) (
659
+ ompt_thread_t thread_type,
660
+ ompt_data_t *thread_data
661
+ );
662
+
663
+ typedef struct ompt_record_thread_begin_t {
664
+ ompt_thread_t thread_type;
665
+ } ompt_record_thread_begin_t;
666
+
667
+ typedef void (*ompt_callback_thread_end_t) (
668
+ ompt_data_t *thread_data
669
+ );
670
+
671
+ typedef void (*ompt_callback_parallel_begin_t) (
672
+ ompt_data_t *encountering_task_data,
673
+ const ompt_frame_t *encountering_task_frame,
674
+ ompt_data_t *parallel_data,
675
+ unsigned int requested_parallelism,
676
+ int flags,
677
+ const void *codeptr_ra
678
+ );
679
+
680
+ typedef struct ompt_record_parallel_begin_t {
681
+ ompt_id_t encountering_task_id;
682
+ ompt_id_t parallel_id;
683
+ unsigned int requested_parallelism;
684
+ int flags;
685
+ const void *codeptr_ra;
686
+ } ompt_record_parallel_begin_t;
687
+
688
+ typedef void (*ompt_callback_parallel_end_t) (
689
+ ompt_data_t *parallel_data,
690
+ ompt_data_t *encountering_task_data,
691
+ int flags,
692
+ const void *codeptr_ra
693
+ );
694
+
695
+ typedef struct ompt_record_parallel_end_t {
696
+ ompt_id_t parallel_id;
697
+ ompt_id_t encountering_task_id;
698
+ int flags;
699
+ const void *codeptr_ra;
700
+ } ompt_record_parallel_end_t;
701
+
702
+ typedef void (*ompt_callback_work_t) (
703
+ ompt_work_t wstype,
704
+ ompt_scope_endpoint_t endpoint,
705
+ ompt_data_t *parallel_data,
706
+ ompt_data_t *task_data,
707
+ uint64_t count,
708
+ const void *codeptr_ra
709
+ );
710
+
711
+ typedef struct ompt_record_work_t {
712
+ ompt_work_t wstype;
713
+ ompt_scope_endpoint_t endpoint;
714
+ ompt_id_t parallel_id;
715
+ ompt_id_t task_id;
716
+ uint64_t count;
717
+ const void *codeptr_ra;
718
+ } ompt_record_work_t;
719
+
720
+ typedef void (*ompt_callback_dispatch_t) (
721
+ ompt_data_t *parallel_data,
722
+ ompt_data_t *task_data,
723
+ ompt_dispatch_t kind,
724
+ ompt_data_t instance
725
+ );
726
+
727
+ typedef struct ompt_record_dispatch_t {
728
+ ompt_id_t parallel_id;
729
+ ompt_id_t task_id;
730
+ ompt_dispatch_t kind;
731
+ ompt_data_t instance;
732
+ } ompt_record_dispatch_t;
733
+
734
+ typedef void (*ompt_callback_task_create_t) (
735
+ ompt_data_t *encountering_task_data,
736
+ const ompt_frame_t *encountering_task_frame,
737
+ ompt_data_t *new_task_data,
738
+ int flags,
739
+ int has_dependences,
740
+ const void *codeptr_ra
741
+ );
742
+
743
+ typedef struct ompt_record_task_create_t {
744
+ ompt_id_t encountering_task_id;
745
+ ompt_id_t new_task_id;
746
+ int flags;
747
+ int has_dependences;
748
+ const void *codeptr_ra;
749
+ } ompt_record_task_create_t;
750
+
751
+ typedef void (*ompt_callback_dependences_t) (
752
+ ompt_data_t *task_data,
753
+ const ompt_dependence_t *deps,
754
+ int ndeps
755
+ );
756
+
757
+ typedef struct ompt_record_dependences_t {
758
+ ompt_id_t task_id;
759
+ ompt_dependence_t dep;
760
+ int ndeps;
761
+ } ompt_record_dependences_t;
762
+
763
+ typedef void (*ompt_callback_task_dependence_t) (
764
+ ompt_data_t *src_task_data,
765
+ ompt_data_t *sink_task_data
766
+ );
767
+
768
+ typedef struct ompt_record_task_dependence_t {
769
+ ompt_id_t src_task_id;
770
+ ompt_id_t sink_task_id;
771
+ } ompt_record_task_dependence_t;
772
+
773
+ typedef void (*ompt_callback_task_schedule_t) (
774
+ ompt_data_t *prior_task_data,
775
+ ompt_task_status_t prior_task_status,
776
+ ompt_data_t *next_task_data
777
+ );
778
+
779
+ typedef struct ompt_record_task_schedule_t {
780
+ ompt_id_t prior_task_id;
781
+ ompt_task_status_t prior_task_status;
782
+ ompt_id_t next_task_id;
783
+ } ompt_record_task_schedule_t;
784
+
785
+ typedef void (*ompt_callback_implicit_task_t) (
786
+ ompt_scope_endpoint_t endpoint,
787
+ ompt_data_t *parallel_data,
788
+ ompt_data_t *task_data,
789
+ unsigned int actual_parallelism,
790
+ unsigned int index,
791
+ int flags
792
+ );
793
+
794
+ typedef struct ompt_record_implicit_task_t {
795
+ ompt_scope_endpoint_t endpoint;
796
+ ompt_id_t parallel_id;
797
+ ompt_id_t task_id;
798
+ unsigned int actual_parallelism;
799
+ unsigned int index;
800
+ int flags;
801
+ } ompt_record_implicit_task_t;
802
+
803
+ typedef void (*ompt_callback_master_t) (
804
+ ompt_scope_endpoint_t endpoint,
805
+ ompt_data_t *parallel_data,
806
+ ompt_data_t *task_data,
807
+ const void *codeptr_ra
808
+ );
809
+
810
+ typedef struct ompt_record_master_t {
811
+ ompt_scope_endpoint_t endpoint;
812
+ ompt_id_t parallel_id;
813
+ ompt_id_t task_id;
814
+ const void *codeptr_ra;
815
+ } ompt_record_master_t;
816
+
817
+ typedef void (*ompt_callback_sync_region_t) (
818
+ ompt_sync_region_t kind,
819
+ ompt_scope_endpoint_t endpoint,
820
+ ompt_data_t *parallel_data,
821
+ ompt_data_t *task_data,
822
+ const void *codeptr_ra
823
+ );
824
+
825
+ typedef struct ompt_record_sync_region_t {
826
+ ompt_sync_region_t kind;
827
+ ompt_scope_endpoint_t endpoint;
828
+ ompt_id_t parallel_id;
829
+ ompt_id_t task_id;
830
+ const void *codeptr_ra;
831
+ } ompt_record_sync_region_t;
832
+
833
+ typedef void (*ompt_callback_mutex_acquire_t) (
834
+ ompt_mutex_t kind,
835
+ unsigned int hint,
836
+ unsigned int impl,
837
+ ompt_wait_id_t wait_id,
838
+ const void *codeptr_ra
839
+ );
840
+
841
+ typedef struct ompt_record_mutex_acquire_t {
842
+ ompt_mutex_t kind;
843
+ unsigned int hint;
844
+ unsigned int impl;
845
+ ompt_wait_id_t wait_id;
846
+ const void *codeptr_ra;
847
+ } ompt_record_mutex_acquire_t;
848
+
849
+ typedef void (*ompt_callback_mutex_t) (
850
+ ompt_mutex_t kind,
851
+ ompt_wait_id_t wait_id,
852
+ const void *codeptr_ra
853
+ );
854
+
855
+ typedef struct ompt_record_mutex_t {
856
+ ompt_mutex_t kind;
857
+ ompt_wait_id_t wait_id;
858
+ const void *codeptr_ra;
859
+ } ompt_record_mutex_t;
860
+
861
+ typedef void (*ompt_callback_nest_lock_t) (
862
+ ompt_scope_endpoint_t endpoint,
863
+ ompt_wait_id_t wait_id,
864
+ const void *codeptr_ra
865
+ );
866
+
867
+ typedef struct ompt_record_nest_lock_t {
868
+ ompt_scope_endpoint_t endpoint;
869
+ ompt_wait_id_t wait_id;
870
+ const void *codeptr_ra;
871
+ } ompt_record_nest_lock_t;
872
+
873
+ typedef void (*ompt_callback_flush_t) (
874
+ ompt_data_t *thread_data,
875
+ const void *codeptr_ra
876
+ );
877
+
878
+ typedef struct ompt_record_flush_t {
879
+ const void *codeptr_ra;
880
+ } ompt_record_flush_t;
881
+
882
+ typedef void (*ompt_callback_cancel_t) (
883
+ ompt_data_t *task_data,
884
+ int flags,
885
+ const void *codeptr_ra
886
+ );
887
+
888
+ typedef struct ompt_record_cancel_t {
889
+ ompt_id_t task_id;
890
+ int flags;
891
+ const void *codeptr_ra;
892
+ } ompt_record_cancel_t;
893
+
894
+ typedef void (*ompt_callback_device_initialize_t) (
895
+ int device_num,
896
+ const char *type,
897
+ ompt_device_t *device,
898
+ ompt_function_lookup_t lookup,
899
+ const char *documentation
900
+ );
901
+
902
+ typedef void (*ompt_callback_device_finalize_t) (
903
+ int device_num
904
+ );
905
+
906
+ typedef void (*ompt_callback_device_load_t) (
907
+ int device_num,
908
+ const char *filename,
909
+ int64_t offset_in_file,
910
+ void *vma_in_file,
911
+ size_t bytes,
912
+ void *host_addr,
913
+ void *device_addr,
914
+ uint64_t module_id
915
+ );
916
+
917
+ typedef void (*ompt_callback_device_unload_t) (
918
+ int device_num,
919
+ uint64_t module_id
920
+ );
921
+
922
+ typedef void (*ompt_callback_target_data_op_t) (
923
+ ompt_id_t target_id,
924
+ ompt_id_t host_op_id,
925
+ ompt_target_data_op_t optype,
926
+ void *src_addr,
927
+ int src_device_num,
928
+ void *dest_addr,
929
+ int dest_device_num,
930
+ size_t bytes,
931
+ const void *codeptr_ra
932
+ );
933
+
934
+ typedef struct ompt_record_target_data_op_t {
935
+ ompt_id_t host_op_id;
936
+ ompt_target_data_op_t optype;
937
+ void *src_addr;
938
+ int src_device_num;
939
+ void *dest_addr;
940
+ int dest_device_num;
941
+ size_t bytes;
942
+ ompt_device_time_t end_time;
943
+ const void *codeptr_ra;
944
+ } ompt_record_target_data_op_t;
945
+
946
+ typedef void (*ompt_callback_target_t) (
947
+ ompt_target_t kind,
948
+ ompt_scope_endpoint_t endpoint,
949
+ int device_num,
950
+ ompt_data_t *task_data,
951
+ ompt_id_t target_id,
952
+ const void *codeptr_ra
953
+ );
954
+
955
+ typedef struct ompt_record_target_t {
956
+ ompt_target_t kind;
957
+ ompt_scope_endpoint_t endpoint;
958
+ int device_num;
959
+ ompt_id_t task_id;
960
+ ompt_id_t target_id;
961
+ const void *codeptr_ra;
962
+ } ompt_record_target_t;
963
+
964
+ typedef void (*ompt_callback_target_map_t) (
965
+ ompt_id_t target_id,
966
+ unsigned int nitems,
967
+ void **host_addr,
968
+ void **device_addr,
969
+ size_t *bytes,
970
+ unsigned int *mapping_flags,
971
+ const void *codeptr_ra
972
+ );
973
+
974
+ typedef struct ompt_record_target_map_t {
975
+ ompt_id_t target_id;
976
+ unsigned int nitems;
977
+ void **host_addr;
978
+ void **device_addr;
979
+ size_t *bytes;
980
+ unsigned int *mapping_flags;
981
+ const void *codeptr_ra;
982
+ } ompt_record_target_map_t;
983
+
984
+ typedef void (*ompt_callback_target_submit_t) (
985
+ ompt_id_t target_id,
986
+ ompt_id_t host_op_id,
987
+ unsigned int requested_num_teams
988
+ );
989
+
990
+ typedef struct ompt_record_target_kernel_t {
991
+ ompt_id_t host_op_id;
992
+ unsigned int requested_num_teams;
993
+ unsigned int granted_num_teams;
994
+ ompt_device_time_t end_time;
995
+ } ompt_record_target_kernel_t;
996
+
997
+ typedef int (*ompt_callback_control_tool_t) (
998
+ uint64_t command,
999
+ uint64_t modifier,
1000
+ void *arg,
1001
+ const void *codeptr_ra
1002
+ );
1003
+
1004
+ typedef struct ompt_record_control_tool_t {
1005
+ uint64_t command;
1006
+ uint64_t modifier;
1007
+ const void *codeptr_ra;
1008
+ } ompt_record_control_tool_t;
1009
+
1010
+ typedef struct ompd_address_t {
1011
+ ompd_seg_t segment;
1012
+ ompd_addr_t address;
1013
+ } ompd_address_t;
1014
+
1015
+ typedef struct ompd_frame_info_t {
1016
+ ompd_address_t frame_address;
1017
+ ompd_word_t frame_flag;
1018
+ } ompd_frame_info_t;
1019
+
1020
+ typedef struct _ompd_aspace_handle ompd_address_space_handle_t;
1021
+ typedef struct _ompd_thread_handle ompd_thread_handle_t;
1022
+ typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
1023
+ typedef struct _ompd_task_handle ompd_task_handle_t;
1024
+
1025
+ typedef struct _ompd_aspace_cont ompd_address_space_context_t;
1026
+ typedef struct _ompd_thread_cont ompd_thread_context_t;
1027
+
1028
+ typedef struct ompd_device_type_sizes_t {
1029
+ uint8_t sizeof_char;
1030
+ uint8_t sizeof_short;
1031
+ uint8_t sizeof_int;
1032
+ uint8_t sizeof_long;
1033
+ uint8_t sizeof_long_long;
1034
+ uint8_t sizeof_pointer;
1035
+ } ompd_device_type_sizes_t;
1036
+
1037
+ typedef struct ompt_record_ompt_t {
1038
+ ompt_callbacks_t type;
1039
+ ompt_device_time_t time;
1040
+ ompt_id_t thread_id;
1041
+ ompt_id_t target_id;
1042
+ union {
1043
+ ompt_record_thread_begin_t thread_begin;
1044
+ ompt_record_parallel_begin_t parallel_begin;
1045
+ ompt_record_parallel_end_t parallel_end;
1046
+ ompt_record_work_t work;
1047
+ ompt_record_dispatch_t dispatch;
1048
+ ompt_record_task_create_t task_create;
1049
+ ompt_record_dependences_t dependences;
1050
+ ompt_record_task_dependence_t task_dependence;
1051
+ ompt_record_task_schedule_t task_schedule;
1052
+ ompt_record_implicit_task_t implicit_task;
1053
+ ompt_record_master_t master;
1054
+ ompt_record_sync_region_t sync_region;
1055
+ ompt_record_mutex_acquire_t mutex_acquire;
1056
+ ompt_record_mutex_t mutex;
1057
+ ompt_record_nest_lock_t nest_lock;
1058
+ ompt_record_flush_t flush;
1059
+ ompt_record_cancel_t cancel;
1060
+ ompt_record_target_t target;
1061
+ ompt_record_target_data_op_t target_data_op;
1062
+ ompt_record_target_map_t target_map;
1063
+ ompt_record_target_kernel_t target_kernel;
1064
+ ompt_record_control_tool_t control_tool;
1065
+ } record;
1066
+ } ompt_record_ompt_t;
1067
+
1068
+ typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
1069
+ ompt_buffer_t *buffer,
1070
+ ompt_buffer_cursor_t current
1071
+ );
1072
+
1073
+ #define ompt_id_none 0
1074
+ #define ompt_data_none {0}
1075
+ #define ompt_time_none 0
1076
+ #define ompt_hwid_none 0
1077
+ #define ompt_addr_none ~0
1078
+ #define ompt_mutex_impl_none 0
1079
+ #define ompt_wait_id_none 0
1080
+
1081
+ #define ompd_segment_none 0
1082
+
1083
+ #endif /* __OMPT__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (226 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions
6
+ * are met:
7
+ * * Redistributions of source code must retain the above copyright
8
+ * notice, this list of conditions and the following disclaimer.
9
+ * * Redistributions in binary form must reproduce the above copyright
10
+ * notice, this list of conditions and the following disclaimer in the
11
+ * documentation and/or other materials provided with the distribution.
12
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
13
+ * contributors may be used to endorse or promote products derived
14
+ * from this software without specific prior written permission.
15
+ *
16
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ */
28
+
29
+ #ifndef __cuda_stdint_h__
30
+ #define __cuda_stdint_h__
31
+
32
+ // Compiler-specific treatment for C99's stdint.h
33
+ //
34
+ // By default, this header will use the standard headers (so it
35
+ // is your responsibility to make sure they are available), except
36
+ // on MSVC before Visual Studio 2010, when they were not provided.
37
+ // To support old MSVC, a few of the commonly-used definitions are
38
+ // provided here. If more definitions are needed, add them here,
39
+ // or replace these definitions with a complete implementation,
40
+ // such as the ones available from Google, Boost, or MSVC10. You
41
+ // can prevent the definition of any of these types (in order to
42
+ // use your own) by #defining CU_STDINT_TYPES_ALREADY_DEFINED.
43
+
44
+ #if !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
45
+
46
+ // In VS, including stdint.h forces the C++ runtime dep - provide an opt-out
47
+ // (CU_STDINT_VS_FORCE_NO_STDINT_H) for users that care (notably static
48
+ // cudart).
49
+ #if defined(_MSC_VER) && ((_MSC_VER < 1600) || defined(CU_STDINT_VS_FORCE_NO_STDINT_H))
50
+
51
+ // These definitions can be used with MSVC 8 and 9,
52
+ // which don't ship with stdint.h:
53
+
54
+ typedef unsigned char uint8_t;
55
+
56
+ typedef short int16_t;
57
+ typedef unsigned short uint16_t;
58
+
59
+ // To keep it consistent with all MSVC builds, define those types
60
+ // in the exact same way they are defined with the MSVC headers
61
+ #if defined(_MSC_VER)
62
+ typedef signed char int8_t;
63
+
64
+ typedef int int32_t;
65
+ typedef unsigned int uint32_t;
66
+
67
+ typedef long long int64_t;
68
+ typedef unsigned long long uint64_t;
69
+ #else
70
+ typedef char int8_t;
71
+
72
+ typedef long int32_t;
73
+ typedef unsigned long uint32_t;
74
+
75
+ typedef __int64 int64_t;
76
+ typedef unsigned __int64 uint64_t;
77
+ #endif
78
+
79
+ #elif defined(__DJGPP__)
80
+
81
+ // These definitions can be used when compiling
82
+ // C code with DJGPP, which only provides stdint.h
83
+ // when compiling C++ code with TR1 enabled.
84
+
85
+ typedef char int8_t;
86
+ typedef unsigned char uint8_t;
87
+
88
+ typedef short int16_t;
89
+ typedef unsigned short uint16_t;
90
+
91
+ typedef long int32_t;
92
+ typedef unsigned long uint32_t;
93
+
94
+ typedef long long int64_t;
95
+ typedef unsigned long long uint64_t;
96
+
97
+ #else
98
+
99
+ // Use standard headers, as specified by C99 and C++ TR1.
100
+ // Known to be provided by:
101
+ // - gcc/glibc, supported by all versions of glibc
102
+ // - djgpp, supported since 2001
103
+ // - MSVC, supported by Visual Studio 2010 and later
104
+
105
+ #include <stdint.h>
106
+
107
+ #endif
108
+
109
+ #endif // !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
110
+
111
+
112
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h ADDED
@@ -0,0 +1,690 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ // *************************************************************************
3
+ // Definitions of indices for API functions, unique across entire API
4
+ // *************************************************************************
5
+
6
+ // This file is generated. Any changes you make will be lost during the next clean build.
7
+ // CUDA public interface, for type definitions and cu* function prototypes
8
+
9
+ typedef enum CUpti_driver_api_trace_cbid_enum {
10
+ CUPTI_DRIVER_TRACE_CBID_INVALID = 0,
11
+ CUPTI_DRIVER_TRACE_CBID_cuInit = 1,
12
+ CUPTI_DRIVER_TRACE_CBID_cuDriverGetVersion = 2,
13
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGet = 3,
14
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetCount = 4,
15
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetName = 5,
16
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceComputeCapability = 6,
17
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem = 7,
18
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetProperties = 8,
19
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetAttribute = 9,
20
+ CUPTI_DRIVER_TRACE_CBID_cuCtxCreate = 10,
21
+ CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy = 11,
22
+ CUPTI_DRIVER_TRACE_CBID_cuCtxAttach = 12,
23
+ CUPTI_DRIVER_TRACE_CBID_cuCtxDetach = 13,
24
+ CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent = 14,
25
+ CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent = 15,
26
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevice = 16,
27
+ CUPTI_DRIVER_TRACE_CBID_cuCtxSynchronize = 17,
28
+ CUPTI_DRIVER_TRACE_CBID_cuModuleLoad = 18,
29
+ CUPTI_DRIVER_TRACE_CBID_cuModuleLoadData = 19,
30
+ CUPTI_DRIVER_TRACE_CBID_cuModuleLoadDataEx = 20,
31
+ CUPTI_DRIVER_TRACE_CBID_cuModuleLoadFatBinary = 21,
32
+ CUPTI_DRIVER_TRACE_CBID_cuModuleUnload = 22,
33
+ CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunction = 23,
34
+ CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal = 24,
35
+ CUPTI_DRIVER_TRACE_CBID_cu64ModuleGetGlobal = 25,
36
+ CUPTI_DRIVER_TRACE_CBID_cuModuleGetTexRef = 26,
37
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo = 27,
38
+ CUPTI_DRIVER_TRACE_CBID_cu64MemGetInfo = 28,
39
+ CUPTI_DRIVER_TRACE_CBID_cuMemAlloc = 29,
40
+ CUPTI_DRIVER_TRACE_CBID_cu64MemAlloc = 30,
41
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch = 31,
42
+ CUPTI_DRIVER_TRACE_CBID_cu64MemAllocPitch = 32,
43
+ CUPTI_DRIVER_TRACE_CBID_cuMemFree = 33,
44
+ CUPTI_DRIVER_TRACE_CBID_cu64MemFree = 34,
45
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange = 35,
46
+ CUPTI_DRIVER_TRACE_CBID_cu64MemGetAddressRange = 36,
47
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost = 37,
48
+ CUPTI_DRIVER_TRACE_CBID_cuMemFreeHost = 38,
49
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc = 39,
50
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer = 40,
51
+ CUPTI_DRIVER_TRACE_CBID_cu64MemHostGetDevicePointer = 41,
52
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostGetFlags = 42,
53
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD = 43,
54
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoD = 44,
55
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH = 45,
56
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoH = 46,
57
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD = 47,
58
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoD = 48,
59
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA = 49,
60
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoA = 50,
61
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD = 51,
62
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyAtoD = 52,
63
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA = 53,
64
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH = 54,
65
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA = 55,
66
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D = 56,
67
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned = 57,
68
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D = 58,
69
+ CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3D = 59,
70
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync = 60,
71
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoDAsync = 61,
72
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync = 62,
73
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoHAsync = 63,
74
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync = 64,
75
+ CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoDAsync = 65,
76
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync = 66,
77
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync = 67,
78
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync = 68,
79
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync = 69,
80
+ CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3DAsync = 70,
81
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD8 = 71,
82
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8 = 72,
83
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD16 = 73,
84
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16 = 74,
85
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD32 = 75,
86
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32 = 76,
87
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8 = 77,
88
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8 = 78,
89
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16 = 79,
90
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16 = 80,
91
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32 = 81,
92
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32 = 82,
93
+ CUPTI_DRIVER_TRACE_CBID_cuFuncSetBlockShape = 83,
94
+ CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedSize = 84,
95
+ CUPTI_DRIVER_TRACE_CBID_cuFuncGetAttribute = 85,
96
+ CUPTI_DRIVER_TRACE_CBID_cuFuncSetCacheConfig = 86,
97
+ CUPTI_DRIVER_TRACE_CBID_cuArrayCreate = 87,
98
+ CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor = 88,
99
+ CUPTI_DRIVER_TRACE_CBID_cuArrayDestroy = 89,
100
+ CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate = 90,
101
+ CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor = 91,
102
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefCreate = 92,
103
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefDestroy = 93,
104
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetArray = 94,
105
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress = 95,
106
+ CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress = 96,
107
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D = 97,
108
+ CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress2D = 98,
109
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFormat = 99,
110
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddressMode = 100,
111
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFilterMode = 101,
112
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFlags = 102,
113
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress = 103,
114
+ CUPTI_DRIVER_TRACE_CBID_cu64TexRefGetAddress = 104,
115
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetArray = 105,
116
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddressMode = 106,
117
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFilterMode = 107,
118
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFormat = 108,
119
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFlags = 109,
120
+ CUPTI_DRIVER_TRACE_CBID_cuParamSetSize = 110,
121
+ CUPTI_DRIVER_TRACE_CBID_cuParamSeti = 111,
122
+ CUPTI_DRIVER_TRACE_CBID_cuParamSetf = 112,
123
+ CUPTI_DRIVER_TRACE_CBID_cuParamSetv = 113,
124
+ CUPTI_DRIVER_TRACE_CBID_cuParamSetTexRef = 114,
125
+ CUPTI_DRIVER_TRACE_CBID_cuLaunch = 115,
126
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid = 116,
127
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync = 117,
128
+ CUPTI_DRIVER_TRACE_CBID_cuEventCreate = 118,
129
+ CUPTI_DRIVER_TRACE_CBID_cuEventRecord = 119,
130
+ CUPTI_DRIVER_TRACE_CBID_cuEventQuery = 120,
131
+ CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize = 121,
132
+ CUPTI_DRIVER_TRACE_CBID_cuEventDestroy = 122,
133
+ CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime = 123,
134
+ CUPTI_DRIVER_TRACE_CBID_cuStreamCreate = 124,
135
+ CUPTI_DRIVER_TRACE_CBID_cuStreamQuery = 125,
136
+ CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize = 126,
137
+ CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy = 127,
138
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnregisterResource = 128,
139
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsSubResourceGetMappedArray = 129,
140
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer = 130,
141
+ CUPTI_DRIVER_TRACE_CBID_cu64GraphicsResourceGetMappedPointer = 131,
142
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags = 132,
143
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources = 133,
144
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources = 134,
145
+ CUPTI_DRIVER_TRACE_CBID_cuGetExportTable = 135,
146
+ CUPTI_DRIVER_TRACE_CBID_cuCtxSetLimit = 136,
147
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetLimit = 137,
148
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevice = 138,
149
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate = 139,
150
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D10RegisterResource = 140,
151
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10RegisterResource = 141,
152
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10UnregisterResource = 142,
153
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10MapResources = 143,
154
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10UnmapResources = 144,
155
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceSetMapFlags = 145,
156
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedArray = 146,
157
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer = 147,
158
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize = 148,
159
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch = 149,
160
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions = 150,
161
+ CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevice = 151,
162
+ CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate = 152,
163
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D11RegisterResource = 153,
164
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevice = 154,
165
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate = 155,
166
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D9RegisterResource = 156,
167
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDirect3DDevice = 157,
168
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterResource = 158,
169
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterResource = 159,
170
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9MapResources = 160,
171
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapResources = 161,
172
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceSetMapFlags = 162,
173
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions = 163,
174
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedArray = 164,
175
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer = 165,
176
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize = 166,
177
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch = 167,
178
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9Begin = 168,
179
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9End = 169,
180
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterVertexBuffer = 170,
181
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer = 171,
182
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapVertexBuffer = 172,
183
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterVertexBuffer = 173,
184
+ CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate = 174,
185
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterBuffer = 175,
186
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterImage = 176,
187
+ CUPTI_DRIVER_TRACE_CBID_cuWGLGetDevice = 177,
188
+ CUPTI_DRIVER_TRACE_CBID_cuGLInit = 178,
189
+ CUPTI_DRIVER_TRACE_CBID_cuGLRegisterBufferObject = 179,
190
+ CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject = 180,
191
+ CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObject = 181,
192
+ CUPTI_DRIVER_TRACE_CBID_cuGLUnregisterBufferObject = 182,
193
+ CUPTI_DRIVER_TRACE_CBID_cuGLSetBufferObjectMapFlags = 183,
194
+ CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync = 184,
195
+ CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObjectAsync = 185,
196
+ CUPTI_DRIVER_TRACE_CBID_cuVDPAUGetDevice = 186,
197
+ CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate = 187,
198
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterVideoSurface = 188,
199
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterOutputSurface = 189,
200
+ CUPTI_DRIVER_TRACE_CBID_cuModuleGetSurfRef = 190,
201
+ CUPTI_DRIVER_TRACE_CBID_cuSurfRefCreate = 191,
202
+ CUPTI_DRIVER_TRACE_CBID_cuSurfRefDestroy = 192,
203
+ CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetFormat = 193,
204
+ CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetArray = 194,
205
+ CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetFormat = 195,
206
+ CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetArray = 196,
207
+ CUPTI_DRIVER_TRACE_CBID_cu64DeviceTotalMem = 197,
208
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPointer = 198,
209
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedSize = 199,
210
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPitch = 200,
211
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetSurfaceDimensions = 201,
212
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetSurfaceDimensions = 202,
213
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPointer = 203,
214
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedSize = 204,
215
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPitch = 205,
216
+ CUPTI_DRIVER_TRACE_CBID_cu64D3D9MapVertexBuffer = 206,
217
+ CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObject = 207,
218
+ CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObjectAsync = 208,
219
+ CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevices = 209,
220
+ CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreateOnDevice = 210,
221
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevices = 211,
222
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreateOnDevice = 212,
223
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevices = 213,
224
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreateOnDevice = 214,
225
+ CUPTI_DRIVER_TRACE_CBID_cu64MemHostAlloc = 215,
226
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async = 216,
227
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8Async = 217,
228
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async = 218,
229
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16Async = 219,
230
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async = 220,
231
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32Async = 221,
232
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async = 222,
233
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8Async = 223,
234
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async = 224,
235
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16Async = 225,
236
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async = 226,
237
+ CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32Async = 227,
238
+ CUPTI_DRIVER_TRACE_CBID_cu64ArrayCreate = 228,
239
+ CUPTI_DRIVER_TRACE_CBID_cu64ArrayGetDescriptor = 229,
240
+ CUPTI_DRIVER_TRACE_CBID_cu64Array3DCreate = 230,
241
+ CUPTI_DRIVER_TRACE_CBID_cu64Array3DGetDescriptor = 231,
242
+ CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2D = 232,
243
+ CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DUnaligned = 233,
244
+ CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DAsync = 234,
245
+ CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v2 = 235,
246
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate_v2 = 236,
247
+ CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate_v2 = 237,
248
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate_v2 = 238,
249
+ CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate_v2 = 239,
250
+ CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate_v2 = 240,
251
+ CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal_v2 = 241,
252
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo_v2 = 242,
253
+ CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2 = 243,
254
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2 = 244,
255
+ CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2 = 245,
256
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange_v2 = 246,
257
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer_v2 = 247,
258
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy_v2 = 248,
259
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2 = 249,
260
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2 = 250,
261
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2 = 251,
262
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2 = 252,
263
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2 = 253,
264
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2 = 254,
265
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress_v2 = 255,
266
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v2 = 256,
267
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress_v2 = 257,
268
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer_v2 = 258,
269
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem_v2 = 259,
270
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer_v2 = 260,
271
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize_v2 = 261,
272
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch_v2 = 262,
273
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions_v2 = 263,
274
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions_v2 = 264,
275
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer_v2 = 265,
276
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize_v2 = 266,
277
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch_v2 = 267,
278
+ CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer_v2 = 268,
279
+ CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2 = 269,
280
+ CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2 = 270,
281
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc_v2 = 271,
282
+ CUPTI_DRIVER_TRACE_CBID_cuArrayCreate_v2 = 272,
283
+ CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor_v2 = 273,
284
+ CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate_v2 = 274,
285
+ CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor_v2 = 275,
286
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2 = 276,
287
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2 = 277,
288
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2 = 278,
289
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2 = 279,
290
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2 = 280,
291
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2 = 281,
292
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2 = 282,
293
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2 = 283,
294
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2 = 284,
295
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2 = 285,
296
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2 = 286,
297
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2 = 287,
298
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2 = 288,
299
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2 = 289,
300
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2 = 290,
301
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2 = 291,
302
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2 = 292,
303
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2 = 293,
304
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost_v2 = 294,
305
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent = 295,
306
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetApiVersion = 296,
307
+ CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDirect3DDevice = 297,
308
+ CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDirect3DDevice = 298,
309
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetCacheConfig = 299,
310
+ CUPTI_DRIVER_TRACE_CBID_cuCtxSetCacheConfig = 300,
311
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister = 301,
312
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostUnregister = 302,
313
+ CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent = 303,
314
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetCurrent = 304,
315
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy = 305,
316
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync = 306,
317
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel = 307,
318
+ CUPTI_DRIVER_TRACE_CBID_cuProfilerStart = 308,
319
+ CUPTI_DRIVER_TRACE_CBID_cuProfilerStop = 309,
320
+ CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttribute = 310,
321
+ CUPTI_DRIVER_TRACE_CBID_cuProfilerInitialize = 311,
322
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceCanAccessPeer = 312,
323
+ CUPTI_DRIVER_TRACE_CBID_cuCtxEnablePeerAccess = 313,
324
+ CUPTI_DRIVER_TRACE_CBID_cuCtxDisablePeerAccess = 314,
325
+ CUPTI_DRIVER_TRACE_CBID_cuMemPeerRegister = 315,
326
+ CUPTI_DRIVER_TRACE_CBID_cuMemPeerUnregister = 316,
327
+ CUPTI_DRIVER_TRACE_CBID_cuMemPeerGetDevicePointer = 317,
328
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer = 318,
329
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync = 319,
330
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer = 320,
331
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync = 321,
332
+ CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 = 322,
333
+ CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent_v2 = 323,
334
+ CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent_v2 = 324,
335
+ CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2 = 325,
336
+ CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy_v2 = 326,
337
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v3 = 327,
338
+ CUPTI_DRIVER_TRACE_CBID_cuIpcGetMemHandle = 328,
339
+ CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle = 329,
340
+ CUPTI_DRIVER_TRACE_CBID_cuIpcCloseMemHandle = 330,
341
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetByPCIBusId = 331,
342
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetPCIBusId = 332,
343
+ CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices = 333,
344
+ CUPTI_DRIVER_TRACE_CBID_cuIpcGetEventHandle = 334,
345
+ CUPTI_DRIVER_TRACE_CBID_cuIpcOpenEventHandle = 335,
346
+ CUPTI_DRIVER_TRACE_CBID_cuCtxSetSharedMemConfig = 336,
347
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetSharedMemConfig = 337,
348
+ CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedMemConfig = 338,
349
+ CUPTI_DRIVER_TRACE_CBID_cuTexObjectCreate = 339,
350
+ CUPTI_DRIVER_TRACE_CBID_cuTexObjectDestroy = 340,
351
+ CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceDesc = 341,
352
+ CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetTextureDesc = 342,
353
+ CUPTI_DRIVER_TRACE_CBID_cuSurfObjectCreate = 343,
354
+ CUPTI_DRIVER_TRACE_CBID_cuSurfObjectDestroy = 344,
355
+ CUPTI_DRIVER_TRACE_CBID_cuSurfObjectGetResourceDesc = 345,
356
+ CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback = 346,
357
+ CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayCreate = 347,
358
+ CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetLevel = 348,
359
+ CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayDestroy = 349,
360
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmappedArray = 350,
361
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapFilterMode = 351,
362
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelBias = 352,
363
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelClamp = 353,
364
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMaxAnisotropy = 354,
365
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmappedArray = 355,
366
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapFilterMode = 356,
367
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelBias = 357,
368
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelClamp = 358,
369
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMaxAnisotropy = 359,
370
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedMipmappedArray = 360,
371
+ CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceViewDesc = 361,
372
+ CUPTI_DRIVER_TRACE_CBID_cuLinkCreate = 362,
373
+ CUPTI_DRIVER_TRACE_CBID_cuLinkAddData = 363,
374
+ CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile = 364,
375
+ CUPTI_DRIVER_TRACE_CBID_cuLinkComplete = 365,
376
+ CUPTI_DRIVER_TRACE_CBID_cuLinkDestroy = 366,
377
+ CUPTI_DRIVER_TRACE_CBID_cuStreamCreateWithPriority = 367,
378
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority = 368,
379
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags = 369,
380
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetStreamPriorityRange = 370,
381
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocManaged = 371,
382
+ CUPTI_DRIVER_TRACE_CBID_cuGetErrorString = 372,
383
+ CUPTI_DRIVER_TRACE_CBID_cuGetErrorName = 373,
384
+ CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessor = 374,
385
+ CUPTI_DRIVER_TRACE_CBID_cuCompilePtx = 375,
386
+ CUPTI_DRIVER_TRACE_CBID_cuBinaryFree = 376,
387
+ CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync = 377,
388
+ CUPTI_DRIVER_TRACE_CBID_cuPointerSetAttribute = 378,
389
+ CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister_v2 = 379,
390
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags_v2 = 380,
391
+ CUPTI_DRIVER_TRACE_CBID_cuLinkCreate_v2 = 381,
392
+ CUPTI_DRIVER_TRACE_CBID_cuLinkAddData_v2 = 382,
393
+ CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile_v2 = 383,
394
+ CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSize = 384,
395
+ CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices_v2 = 385,
396
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRetain = 386,
397
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease = 387,
398
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags = 388,
399
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset = 389,
400
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsEGLRegisterImage = 390,
401
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetFlags = 391,
402
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxGetState = 392,
403
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnect = 393,
404
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerDisconnect = 394,
405
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerAcquireFrame = 395,
406
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerReleaseFrame = 396,
407
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2_ptds = 397,
408
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2_ptds = 398,
409
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2_ptds = 399,
410
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2_ptds = 400,
411
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2_ptds = 401,
412
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2_ptds = 402,
413
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2_ptds = 403,
414
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2_ptds = 404,
415
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2_ptds = 405,
416
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2_ptds = 406,
417
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2_ptds = 407,
418
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy_ptds = 408,
419
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer_ptds = 409,
420
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer_ptds = 410,
421
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2_ptds = 411,
422
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2_ptds = 412,
423
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2_ptds = 413,
424
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2_ptds = 414,
425
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2_ptds = 415,
426
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2_ptds = 416,
427
+ CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2_ptds = 417,
428
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync_ptsz = 418,
429
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2_ptsz = 419,
430
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2_ptsz = 420,
431
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2_ptsz = 421,
432
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2_ptsz = 422,
433
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2_ptsz = 423,
434
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2_ptsz = 424,
435
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2_ptsz = 425,
436
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync_ptsz = 426,
437
+ CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync_ptsz = 427,
438
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async_ptsz = 428,
439
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async_ptsz = 429,
440
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async_ptsz = 430,
441
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async_ptsz = 431,
442
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async_ptsz = 432,
443
+ CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async_ptsz = 433,
444
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority_ptsz = 434,
445
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags_ptsz = 435,
446
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent_ptsz = 436,
447
+ CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback_ptsz = 437,
448
+ CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync_ptsz = 438,
449
+ CUPTI_DRIVER_TRACE_CBID_cuStreamQuery_ptsz = 439,
450
+ CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize_ptsz = 440,
451
+ CUPTI_DRIVER_TRACE_CBID_cuEventRecord_ptsz = 441,
452
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz = 442,
453
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources_ptsz = 443,
454
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources_ptsz = 444,
455
+ CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2_ptsz = 445,
456
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerConnect = 446,
457
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerDisconnect = 447,
458
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerPresentFrame = 448,
459
+ CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedEglFrame = 449,
460
+ CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttributes = 450,
461
+ CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = 451,
462
+ CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSizeWithFlags = 452,
463
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerReturnFrame = 453,
464
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetP2PAttribute = 454,
465
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefSetBorderColor = 455,
466
+ CUPTI_DRIVER_TRACE_CBID_cuTexRefGetBorderColor = 456,
467
+ CUPTI_DRIVER_TRACE_CBID_cuMemAdvise = 457,
468
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32 = 458,
469
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_ptsz = 459,
470
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32 = 460,
471
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_ptsz = 461,
472
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp = 462,
473
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_ptsz = 463,
474
+ CUPTI_DRIVER_TRACE_CBID_cuNVNbufferGetPointer = 464,
475
+ CUPTI_DRIVER_TRACE_CBID_cuNVNtextureGetArray = 465,
476
+ CUPTI_DRIVER_TRACE_CBID_cuNNSetAllocator = 466,
477
+ CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync = 467,
478
+ CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_ptsz = 468,
479
+ CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromNVNSync = 469,
480
+ CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnectWithFlags = 470,
481
+ CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttribute = 471,
482
+ CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttributes = 472,
483
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64 = 473,
484
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_ptsz = 474,
485
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64 = 475,
486
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_ptsz = 476,
487
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel = 477,
488
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz = 478,
489
+ CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromEGLSync = 479,
490
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice = 480,
491
+ CUPTI_DRIVER_TRACE_CBID_cuFuncSetAttribute = 481,
492
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid = 482,
493
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx = 483,
494
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_ptsz = 484,
495
+ CUPTI_DRIVER_TRACE_CBID_cuImportExternalMemory = 485,
496
+ CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedBuffer = 486,
497
+ CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedMipmappedArray = 487,
498
+ CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalMemory = 488,
499
+ CUPTI_DRIVER_TRACE_CBID_cuImportExternalSemaphore = 489,
500
+ CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync = 490,
501
+ CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync_ptsz = 491,
502
+ CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync = 492,
503
+ CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync_ptsz = 493,
504
+ CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalSemaphore = 494,
505
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture = 495,
506
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_ptsz = 496,
507
+ CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture = 497,
508
+ CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture_ptsz = 498,
509
+ CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing = 499,
510
+ CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing_ptsz = 500,
511
+ CUPTI_DRIVER_TRACE_CBID_cuGraphCreate = 501,
512
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode = 502,
513
+ CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams = 503,
514
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemcpyNode = 504,
515
+ CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeGetParams = 505,
516
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemsetNode = 506,
517
+ CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeGetParams = 507,
518
+ CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeSetParams = 508,
519
+ CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetType = 509,
520
+ CUPTI_DRIVER_TRACE_CBID_cuGraphGetRootNodes = 510,
521
+ CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies = 511,
522
+ CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes = 512,
523
+ CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate = 513,
524
+ CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch = 514,
525
+ CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz = 515,
526
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecDestroy = 516,
527
+ CUPTI_DRIVER_TRACE_CBID_cuGraphDestroy = 517,
528
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies = 518,
529
+ CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies = 519,
530
+ CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeSetParams = 520,
531
+ CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams = 521,
532
+ CUPTI_DRIVER_TRACE_CBID_cuGraphDestroyNode = 522,
533
+ CUPTI_DRIVER_TRACE_CBID_cuGraphClone = 523,
534
+ CUPTI_DRIVER_TRACE_CBID_cuGraphNodeFindInClone = 524,
535
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddChildGraphNode = 525,
536
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddEmptyNode = 526,
537
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc = 527,
538
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc_ptsz = 528,
539
+ CUPTI_DRIVER_TRACE_CBID_cuGraphChildGraphNodeGetGraph = 529,
540
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddHostNode = 530,
541
+ CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeGetParams = 531,
542
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetLuid = 532,
543
+ CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeSetParams = 533,
544
+ CUPTI_DRIVER_TRACE_CBID_cuGraphGetNodes = 534,
545
+ CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges = 535,
546
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo = 536,
547
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_ptsz = 537,
548
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams = 538,
549
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2 = 539,
550
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2_ptsz = 540,
551
+ CUPTI_DRIVER_TRACE_CBID_cuThreadExchangeStreamCaptureMode = 541,
552
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetNvSciSyncAttributes = 542,
553
+ CUPTI_DRIVER_TRACE_CBID_cuOccupancyAvailableDynamicSMemPerBlock = 543,
554
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease_v2 = 544,
555
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset_v2 = 545,
556
+ CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags_v2 = 546,
557
+ CUPTI_DRIVER_TRACE_CBID_cuMemAddressReserve = 547,
558
+ CUPTI_DRIVER_TRACE_CBID_cuMemAddressFree = 548,
559
+ CUPTI_DRIVER_TRACE_CBID_cuMemCreate = 549,
560
+ CUPTI_DRIVER_TRACE_CBID_cuMemRelease = 550,
561
+ CUPTI_DRIVER_TRACE_CBID_cuMemMap = 551,
562
+ CUPTI_DRIVER_TRACE_CBID_cuMemUnmap = 552,
563
+ CUPTI_DRIVER_TRACE_CBID_cuMemSetAccess = 553,
564
+ CUPTI_DRIVER_TRACE_CBID_cuMemExportToShareableHandle = 554,
565
+ CUPTI_DRIVER_TRACE_CBID_cuMemImportFromShareableHandle = 555,
566
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationGranularity = 556,
567
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationPropertiesFromHandle = 557,
568
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetAccess = 558,
569
+ CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags = 559,
570
+ CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags_ptsz = 560,
571
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate = 561,
572
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemcpyNodeSetParams = 562,
573
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemsetNodeSetParams = 563,
574
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecHostNodeSetParams = 564,
575
+ CUPTI_DRIVER_TRACE_CBID_cuMemRetainAllocationHandle = 565,
576
+ CUPTI_DRIVER_TRACE_CBID_cuFuncGetModule = 566,
577
+ CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle_v2 = 567,
578
+ CUPTI_DRIVER_TRACE_CBID_cuCtxResetPersistingL2Cache = 568,
579
+ CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeCopyAttributes = 569,
580
+ CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetAttribute = 570,
581
+ CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetAttribute = 571,
582
+ CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes = 572,
583
+ CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes_ptsz = 573,
584
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute = 574,
585
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute_ptsz = 575,
586
+ CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute = 576,
587
+ CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute_ptsz = 577,
588
+ CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate_v2 = 578,
589
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetTexture1DLinearMaxWidth = 579,
590
+ CUPTI_DRIVER_TRACE_CBID_cuGraphUpload = 580,
591
+ CUPTI_DRIVER_TRACE_CBID_cuGraphUpload_ptsz = 581,
592
+ CUPTI_DRIVER_TRACE_CBID_cuArrayGetSparseProperties = 582,
593
+ CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetSparseProperties = 583,
594
+ CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync = 584,
595
+ CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync_ptsz = 585,
596
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecChildGraphNodeSetParams = 586,
597
+ CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags = 587,
598
+ CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags_ptsz = 588,
599
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventRecordNode = 589,
600
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventWaitNode = 590,
601
+ CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeGetEvent = 591,
602
+ CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeGetEvent = 592,
603
+ CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeSetEvent = 593,
604
+ CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeSetEvent = 594,
605
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventRecordNodeSetEvent = 595,
606
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventWaitNodeSetEvent = 596,
607
+ CUPTI_DRIVER_TRACE_CBID_cuArrayGetPlane = 597,
608
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync = 598,
609
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync_ptsz = 599,
610
+ CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync = 600,
611
+ CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync_ptsz = 601,
612
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolTrimTo = 602,
613
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAttribute = 603,
614
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAttribute = 604,
615
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAccess = 605,
616
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDefaultMemPool = 606,
617
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolCreate = 607,
618
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolDestroy = 608,
619
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceSetMemPool = 609,
620
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetMemPool = 610,
621
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync = 611,
622
+ CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync_ptsz = 612,
623
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportToShareableHandle = 613,
624
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportFromShareableHandle = 614,
625
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportPointer = 615,
626
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportPointer = 616,
627
+ CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAccess = 617,
628
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresSignalNode = 618,
629
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeGetParams = 619,
630
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeSetParams = 620,
631
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresWaitNode = 621,
632
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeGetParams = 622,
633
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeSetParams = 623,
634
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresSignalNodeSetParams = 624,
635
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresWaitNodeSetParams = 625,
636
+ CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress = 626,
637
+ CUPTI_DRIVER_TRACE_CBID_cuFlushGPUDirectRDMAWrites = 627,
638
+ CUPTI_DRIVER_TRACE_CBID_cuGraphDebugDotPrint = 628,
639
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2 = 629,
640
+ CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2_ptsz = 630,
641
+ CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies = 631,
642
+ CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_ptsz = 632,
643
+ CUPTI_DRIVER_TRACE_CBID_cuUserObjectCreate = 633,
644
+ CUPTI_DRIVER_TRACE_CBID_cuUserObjectRetain = 634,
645
+ CUPTI_DRIVER_TRACE_CBID_cuUserObjectRelease = 635,
646
+ CUPTI_DRIVER_TRACE_CBID_cuGraphRetainUserObject = 636,
647
+ CUPTI_DRIVER_TRACE_CBID_cuGraphReleaseUserObject = 637,
648
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemAllocNode = 638,
649
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemFreeNode = 639,
650
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGraphMemTrim = 640,
651
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetGraphMemAttribute = 641,
652
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceSetGraphMemAttribute = 642,
653
+ CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithFlags = 643,
654
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetExecAffinitySupport = 644,
655
+ CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v3 = 645,
656
+ CUPTI_DRIVER_TRACE_CBID_cuCtxGetExecAffinity = 646,
657
+ CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid_v2 = 647,
658
+ CUPTI_DRIVER_TRACE_CBID_cuGraphMemAllocNodeGetParams = 648,
659
+ CUPTI_DRIVER_TRACE_CBID_cuGraphMemFreeNodeGetParams = 649,
660
+ CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetEnabled = 650,
661
+ CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetEnabled = 651,
662
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx = 652,
663
+ CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz = 653,
664
+ CUPTI_DRIVER_TRACE_CBID_cuArrayGetMemoryRequirements = 654,
665
+ CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetMemoryRequirements = 655,
666
+ CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams = 656,
667
+ CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams_ptsz = 657,
668
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecGetFlags = 658,
669
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2 = 659,
670
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2_ptsz = 660,
671
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2 = 661,
672
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2_ptsz = 662,
673
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2 = 663,
674
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2_ptsz = 664,
675
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2 = 665,
676
+ CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2_ptsz = 666,
677
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2 = 667,
678
+ CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2_ptsz = 668,
679
+ CUPTI_DRIVER_TRACE_CBID_cuGraphAddBatchMemOpNode = 669,
680
+ CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeGetParams = 670,
681
+ CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeSetParams = 671,
682
+ CUPTI_DRIVER_TRACE_CBID_cuGraphExecBatchMemOpNodeSetParams = 672,
683
+ CUPTI_DRIVER_TRACE_CBID_cuModuleGetLoadingMode = 673,
684
+ CUPTI_DRIVER_TRACE_CBID_cuMemGetHandleForAddressRange = 674,
685
+ CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialClusterSize = 675,
686
+ CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveClusters = 676,
687
+ CUPTI_DRIVER_TRACE_CBID_SIZE = 677,
688
+ CUPTI_DRIVER_TRACE_CBID_FORCE_INT = 0x7fffffff
689
+ } CUpti_driver_api_trace_cbid;
690
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h ADDED
@@ -0,0 +1,825 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2011-2020 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(_CUPTI_METRIC_H_)
51
+ #define _CUPTI_METRIC_H_
52
+
53
+ #include <cuda.h>
54
+ #include <string.h>
55
+ #include <cuda_stdint.h>
56
+ #include <cupti_result.h>
57
+
58
+ #ifndef CUPTIAPI
59
+ #ifdef _WIN32
60
+ #define CUPTIAPI __stdcall
61
+ #else
62
+ #define CUPTIAPI
63
+ #endif
64
+ #endif
65
+
66
+ #if defined(__cplusplus)
67
+ extern "C" {
68
+ #endif
69
+
70
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
71
+ #pragma GCC visibility push(default)
72
+ #endif
73
+
74
+ /**
75
+ * \defgroup CUPTI_METRIC_API CUPTI Metric API
76
+ * Functions, types, and enums that implement the CUPTI Metric API.
77
+ *
78
+ * \note CUPTI metric API from the header cupti_metrics.h are not supported on devices
79
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
80
+ * These API will be deprecated in a future CUDA release. These are replaced by
81
+ * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
82
+ * in the headers nvperf_host.h and nvperf_target.h which are supported on
83
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU
84
+ * architectures).
85
+ *
86
+ * @{
87
+ */
88
+
89
+ /**
90
+ * \brief ID for a metric.
91
+ *
92
+ * A metric provides a measure of some aspect of the device.
93
+ */
94
+ typedef uint32_t CUpti_MetricID;
95
+
96
+ /**
97
+ * \brief A metric category.
98
+ *
99
+ * Each metric is assigned to a category that represents the general
100
+ * type of the metric. A metric's category is accessed using \ref
101
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_CATEGORY
102
+ * attribute.
103
+ */
104
+ typedef enum {
105
+ /**
106
+ * A memory related metric.
107
+ */
108
+ CUPTI_METRIC_CATEGORY_MEMORY = 0,
109
+ /**
110
+ * An instruction related metric.
111
+ */
112
+ CUPTI_METRIC_CATEGORY_INSTRUCTION = 1,
113
+ /**
114
+ * A multiprocessor related metric.
115
+ */
116
+ CUPTI_METRIC_CATEGORY_MULTIPROCESSOR = 2,
117
+ /**
118
+ * A cache related metric.
119
+ */
120
+ CUPTI_METRIC_CATEGORY_CACHE = 3,
121
+ /**
122
+ * A texture related metric.
123
+ */
124
+ CUPTI_METRIC_CATEGORY_TEXTURE = 4,
125
+ /**
126
+ * An NVLink related metric.
127
+ */
128
+ CUPTI_METRIC_CATEGORY_NVLINK = 5,
129
+ /**
130
+ * A PCIe related metric.
131
+ */
132
+ CUPTI_METRIC_CATEGORY_PCIE = 6,
133
+ CUPTI_METRIC_CATEGORY_FORCE_INT = 0x7fffffff,
134
+ } CUpti_MetricCategory;
135
+
136
+ /**
137
+ * \brief A metric evaluation mode.
138
+ *
139
+ * A metric can be evaluated per hardware instance to know the load balancing
140
+ * across instances of a domain or the metric can be evaluated in aggregate mode
141
+ * when the events involved in metric evaluation are from different event
142
+ * domains. It might be possible to evaluate some metrics in both
143
+ * modes for convenience. A metric's evaluation mode is accessed using \ref
144
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_EVALUATION_MODE
145
+ * attribute.
146
+ */
147
+ typedef enum {
148
+ /**
149
+ * If this bit is set, the metric can be profiled for each instance of the
150
+ * domain. The event values passed to \ref cuptiMetricGetValue can contain
151
+ * values for one instance of the domain. And \ref cuptiMetricGetValue can
152
+ * be called for each instance.
153
+ */
154
+ CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE = 1,
155
+ /**
156
+ * If this bit is set, the metric can be profiled over all instances. The
157
+ * event values passed to \ref cuptiMetricGetValue can be aggregated values
158
+ * of events for all instances of the domain.
159
+ */
160
+ CUPTI_METRIC_EVALUATION_MODE_AGGREGATE = 1 << 1,
161
+ CUPTI_METRIC_EVALUATION_MODE_FORCE_INT = 0x7fffffff,
162
+ } CUpti_MetricEvaluationMode;
163
+
164
+ /**
165
+ * \brief Kinds of metric values.
166
+ *
167
+ * Metric values can be one of several different kinds. Corresponding
168
+ * to each kind is a member of the CUpti_MetricValue union. The metric
169
+ * value returned by \ref cuptiMetricGetValue should be accessed using
170
+ * the appropriate member of that union based on its value kind.
171
+ */
172
+ typedef enum {
173
+ /**
174
+ * The metric value is a 64-bit double.
175
+ */
176
+ CUPTI_METRIC_VALUE_KIND_DOUBLE = 0,
177
+ /**
178
+ * The metric value is a 64-bit unsigned integer.
179
+ */
180
+ CUPTI_METRIC_VALUE_KIND_UINT64 = 1,
181
+ /**
182
+ * The metric value is a percentage represented by a 64-bit
183
+ * double. For example, 57.5% is represented by the value 57.5.
184
+ */
185
+ CUPTI_METRIC_VALUE_KIND_PERCENT = 2,
186
+ /**
187
+ * The metric value is a throughput represented by a 64-bit
188
+ * integer. The unit for throughput values is bytes/second.
189
+ */
190
+ CUPTI_METRIC_VALUE_KIND_THROUGHPUT = 3,
191
+ /**
192
+ * The metric value is a 64-bit signed integer.
193
+ */
194
+ CUPTI_METRIC_VALUE_KIND_INT64 = 4,
195
+ /**
196
+ * The metric value is a utilization level, as represented by
197
+ * CUpti_MetricValueUtilizationLevel.
198
+ */
199
+ CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL = 5,
200
+
201
+ CUPTI_METRIC_VALUE_KIND_FORCE_INT = 0x7fffffff
202
+ } CUpti_MetricValueKind;
203
+
204
+ /**
205
+ * \brief Enumeration of utilization levels for metrics values of kind
206
+ * CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL. Utilization values can
207
+ * vary from IDLE (0) to MAX (10) but the enumeration only provides
208
+ * specific names for a few values.
209
+ */
210
+ typedef enum {
211
+ CUPTI_METRIC_VALUE_UTILIZATION_IDLE = 0,
212
+ CUPTI_METRIC_VALUE_UTILIZATION_LOW = 2,
213
+ CUPTI_METRIC_VALUE_UTILIZATION_MID = 5,
214
+ CUPTI_METRIC_VALUE_UTILIZATION_HIGH = 8,
215
+ CUPTI_METRIC_VALUE_UTILIZATION_MAX = 10,
216
+ CUPTI_METRIC_VALUE_UTILIZATION_FORCE_INT = 0x7fffffff
217
+ } CUpti_MetricValueUtilizationLevel;
218
+
219
+ /**
220
+ * \brief Metric attributes.
221
+ *
222
+ * Metric attributes describe properties of a metric. These attributes
223
+ * can be read using \ref cuptiMetricGetAttribute.
224
+ */
225
+ typedef enum {
226
+ /**
227
+ * Metric name. Value is a null terminated const c-string.
228
+ */
229
+ CUPTI_METRIC_ATTR_NAME = 0,
230
+ /**
231
+ * Short description of metric. Value is a null terminated const c-string.
232
+ */
233
+ CUPTI_METRIC_ATTR_SHORT_DESCRIPTION = 1,
234
+ /**
235
+ * Long description of metric. Value is a null terminated const c-string.
236
+ */
237
+ CUPTI_METRIC_ATTR_LONG_DESCRIPTION = 2,
238
+ /**
239
+ * Category of the metric. Value is of type CUpti_MetricCategory.
240
+ */
241
+ CUPTI_METRIC_ATTR_CATEGORY = 3,
242
+ /**
243
+ * Value type of the metric. Value is of type CUpti_MetricValueKind.
244
+ */
245
+ CUPTI_METRIC_ATTR_VALUE_KIND = 4,
246
+ /**
247
+ * Metric evaluation mode. Value is of type CUpti_MetricEvaluationMode.
248
+ */
249
+ CUPTI_METRIC_ATTR_EVALUATION_MODE = 5,
250
+ CUPTI_METRIC_ATTR_FORCE_INT = 0x7fffffff,
251
+ } CUpti_MetricAttribute;
252
+
253
+ /**
254
+ * \brief A metric value.
255
+ *
256
+ * Metric values can be one of several different kinds. Corresponding
257
+ * to each kind is a member of the CUpti_MetricValue union. The metric
258
+ * value returned by \ref cuptiMetricGetValue should be accessed using
259
+ * the appropriate member of that union based on its value kind.
260
+ */
261
+ typedef union {
262
+ /*
263
+ * Value for CUPTI_METRIC_VALUE_KIND_DOUBLE.
264
+ */
265
+ double metricValueDouble;
266
+ /*
267
+ * Value for CUPTI_METRIC_VALUE_KIND_UINT64.
268
+ */
269
+ uint64_t metricValueUint64;
270
+ /*
271
+ * Value for CUPTI_METRIC_VALUE_KIND_INT64.
272
+ */
273
+ int64_t metricValueInt64;
274
+ /*
275
+ * Value for CUPTI_METRIC_VALUE_KIND_PERCENT. For example, 57.5% is
276
+ * represented by the value 57.5.
277
+ */
278
+ double metricValuePercent;
279
+ /*
280
+ * Value for CUPTI_METRIC_VALUE_KIND_THROUGHPUT. The unit for
281
+ * throughput values is bytes/second.
282
+ */
283
+ uint64_t metricValueThroughput;
284
+ /*
285
+ * Value for CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
286
+ */
287
+ CUpti_MetricValueUtilizationLevel metricValueUtilizationLevel;
288
+ } CUpti_MetricValue;
289
+
290
+ /**
291
+ * \brief Device class.
292
+ *
293
+ * Enumeration of device classes for metric property
294
+ * CUPTI_METRIC_PROPERTY_DEVICE_CLASS.
295
+ */
296
+ typedef enum {
297
+ CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TESLA = 0,
298
+ CUPTI_METRIC_PROPERTY_DEVICE_CLASS_QUADRO = 1,
299
+ CUPTI_METRIC_PROPERTY_DEVICE_CLASS_GEFORCE = 2,
300
+ CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TEGRA = 3,
301
+ } CUpti_MetricPropertyDeviceClass;
302
+
303
+ /**
304
+ * \brief Metric device properties.
305
+ *
306
+ * Metric device properties describe device properties which are needed for a metric.
307
+ * Some of these properties can be collected using cuDeviceGetAttribute.
308
+ */
309
+ typedef enum {
310
+ /*
311
+ * Number of multiprocessors on a device. This can be collected
312
+ * using value of \param CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT of
313
+ * cuDeviceGetAttribute.
314
+ */
315
+ CUPTI_METRIC_PROPERTY_MULTIPROCESSOR_COUNT,
316
+ /*
317
+ * Maximum number of warps on a multiprocessor. This can be
318
+ * collected using ratio of value of \param
319
+ * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR and \param
320
+ * CU_DEVICE_ATTRIBUTE_WARP_SIZE of cuDeviceGetAttribute.
321
+ */
322
+ CUPTI_METRIC_PROPERTY_WARPS_PER_MULTIPROCESSOR,
323
+ /*
324
+ * GPU Time for kernel in ns. This should be profiled using CUPTI
325
+ * Activity API.
326
+ */
327
+ CUPTI_METRIC_PROPERTY_KERNEL_GPU_TIME,
328
+ /*
329
+ * Clock rate for device in KHz. This should be collected using
330
+ * value of \param CU_DEVICE_ATTRIBUTE_CLOCK_RATE of
331
+ * cuDeviceGetAttribute.
332
+ */
333
+ CUPTI_METRIC_PROPERTY_CLOCK_RATE,
334
+ /*
335
+ * Number of Frame buffer units for device. This should be collected
336
+ * using value of \param CUPTI_DEVICE_ATTRIBUTE_MAX_FRAME_BUFFERS of
337
+ * cuptiDeviceGetAttribute.
338
+ */
339
+ CUPTI_METRIC_PROPERTY_FRAME_BUFFER_COUNT,
340
+ /*
341
+ * Global memory bandwidth in KBytes/sec. This should be collected
342
+ * using value of \param CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH
343
+ * of cuptiDeviceGetAttribute.
344
+ */
345
+ CUPTI_METRIC_PROPERTY_GLOBAL_MEMORY_BANDWIDTH,
346
+ /*
347
+ * PCIE link rate in Mega bits/sec. This should be collected using
348
+ * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_RATE of
349
+ * cuptiDeviceGetAttribute.
350
+ */
351
+ CUPTI_METRIC_PROPERTY_PCIE_LINK_RATE,
352
+ /*
353
+ * PCIE link width for device. This should be collected using
354
+ * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH of
355
+ * cuptiDeviceGetAttribute.
356
+ */
357
+ CUPTI_METRIC_PROPERTY_PCIE_LINK_WIDTH,
358
+ /*
359
+ * PCIE generation for device. This should be collected using
360
+ * value of \param CUPTI_DEVICE_ATTR_PCIE_GEN of
361
+ * cuptiDeviceGetAttribute.
362
+ */
363
+ CUPTI_METRIC_PROPERTY_PCIE_GEN,
364
+ /*
365
+ * The device class. This should be collected using
366
+ * value of \param CUPTI_DEVICE_ATTR_DEVICE_CLASS of
367
+ * cuptiDeviceGetAttribute.
368
+ */
369
+ CUPTI_METRIC_PROPERTY_DEVICE_CLASS,
370
+ /*
371
+ * Peak single precision floating point operations that
372
+ * can be performed in one cycle by the device.
373
+ * This should be collected using value of
374
+ * \param CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE of
375
+ * cuptiDeviceGetAttribute.
376
+ */
377
+ CUPTI_METRIC_PROPERTY_FLOP_SP_PER_CYCLE,
378
+ /*
379
+ * Peak double precision floating point operations that
380
+ * can be performed in one cycle by the device.
381
+ * This should be collected using value of
382
+ * \param CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE of
383
+ * cuptiDeviceGetAttribute.
384
+ */
385
+ CUPTI_METRIC_PROPERTY_FLOP_DP_PER_CYCLE,
386
+ /*
387
+ * Number of L2 units on a device. This can be collected
388
+ * using value of \param CUPTI_DEVICE_ATTR_MAX_L2_UNITS of
389
+ * cuptiDeviceGetAttribute.
390
+ */
391
+ CUPTI_METRIC_PROPERTY_L2_UNITS,
392
+ /*
393
+ * Whether ECC support is enabled on the device. This can be
394
+ * collected using value of \param CU_DEVICE_ATTRIBUTE_ECC_ENABLED of
395
+ * cuDeviceGetAttribute.
396
+ */
397
+ CUPTI_METRIC_PROPERTY_ECC_ENABLED,
398
+ /*
399
+ * Peak half precision floating point operations that
400
+ * can be performed in one cycle by the device.
401
+ * This should be collected using value of
402
+ * \param CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE of
403
+ * cuptiDeviceGetAttribute.
404
+ */
405
+ CUPTI_METRIC_PROPERTY_FLOP_HP_PER_CYCLE,
406
+ /*
407
+ * NVLINK Bandwidth for device. This should be collected
408
+ * using value of \param CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW of
409
+ * cuptiDeviceGetAttribute.
410
+ */
411
+ CUPTI_METRIC_PROPERTY_GPU_CPU_NVLINK_BANDWIDTH,
412
+ } CUpti_MetricPropertyID;
413
+
414
+ /**
415
+ * \brief Get the total number of metrics available on any device.
416
+ *
417
+ * Returns the total number of metrics available on any CUDA-capable
418
+ * devices.
419
+ *
420
+ * \param numMetrics Returns the number of metrics
421
+ *
422
+ * \retval CUPTI_SUCCESS
423
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
424
+ */
425
+ CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics);
426
+
427
+ /**
428
+ * \brief Get all the metrics available on any device.
429
+ *
430
+ * Returns the metric IDs in \p metricArray for all CUDA-capable
431
+ * devices. The size of the \p metricArray buffer is given by \p
432
+ * *arraySizeBytes. The size of the \p metricArray buffer must be at
433
+ * least \p numMetrics * sizeof(CUpti_MetricID) or all metric IDs will
434
+ * not be returned. The value returned in \p *arraySizeBytes contains
435
+ * the number of bytes returned in \p metricArray.
436
+ *
437
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
438
+ * returns the number of bytes written to \p metricArray
439
+ * \param metricArray Returns the IDs of the metrics
440
+ *
441
+ * \retval CUPTI_SUCCESS
442
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
443
+ * \p metricArray are NULL
444
+ */
445
+ CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
446
+ CUpti_MetricID *metricArray);
447
+
448
+ /**
449
+ * \brief Get the number of metrics for a device.
450
+ *
451
+ * Returns the number of metrics available for a device.
452
+ *
453
+ * \param device The CUDA device
454
+ * \param numMetrics Returns the number of metrics available for the
455
+ * device
456
+ *
457
+ * \retval CUPTI_SUCCESS
458
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
459
+ * \retval CUPTI_ERROR_INVALID_DEVICE
460
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
461
+ */
462
+ CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
463
+ uint32_t *numMetrics);
464
+
465
+ /**
466
+ * \brief Get the metrics for a device.
467
+ *
468
+ * Returns the metric IDs in \p metricArray for a device. The size of
469
+ * the \p metricArray buffer is given by \p *arraySizeBytes. The size
470
+ * of the \p metricArray buffer must be at least \p numMetrics *
471
+ * sizeof(CUpti_MetricID) or else all metric IDs will not be
472
+ * returned. The value returned in \p *arraySizeBytes contains the
473
+ * number of bytes returned in \p metricArray.
474
+ *
475
+ * \param device The CUDA device
476
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
477
+ * returns the number of bytes written to \p metricArray
478
+ * \param metricArray Returns the IDs of the metrics for the device
479
+ *
480
+ * \retval CUPTI_SUCCESS
481
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
482
+ * \retval CUPTI_ERROR_INVALID_DEVICE
483
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
484
+ * \p metricArray are NULL
485
+ */
486
+ CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
487
+ size_t *arraySizeBytes,
488
+ CUpti_MetricID *metricArray);
489
+
490
+ /**
491
+ * \brief Get a metric attribute.
492
+ *
493
+ * Returns a metric attribute in \p *value. The size of the \p
494
+ * value buffer is given by \p *valueSize. The value returned in \p
495
+ * *valueSize contains the number of bytes returned in \p value.
496
+ *
497
+ * If the attribute value is a c-string that is longer than \p
498
+ * *valueSize, then only the first \p *valueSize characters will be
499
+ * returned and there will be no terminating null byte.
500
+ *
501
+ * \param metric ID of the metric
502
+ * \param attrib The metric attribute to read
503
+ * \param valueSize The size of the \p value buffer in bytes, and
504
+ * returns the number of bytes written to \p value
505
+ * \param value Returns the attribute's value
506
+ *
507
+ * \retval CUPTI_SUCCESS
508
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
509
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
510
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
511
+ * is NULL, or if \p attrib is not a metric attribute
512
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
513
+ * attribute values, indicates that the \p value buffer is too small
514
+ * to hold the attribute value.
515
+ */
516
+ CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
517
+ CUpti_MetricAttribute attrib,
518
+ size_t *valueSize,
519
+ void *value);
520
+
521
+ /**
522
+ * \brief Find a metric by name.
523
+ *
524
+ * Find a metric by name and return the metric ID in \p *metric.
525
+ *
526
+ * \param device The CUDA device
527
+ * \param metricName The name of metric to find
528
+ * \param metric Returns the ID of the found metric or undefined if
529
+ * unable to find the metric
530
+ *
531
+ * \retval CUPTI_SUCCESS
532
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
533
+ * \retval CUPTI_ERROR_INVALID_DEVICE
534
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if unable to find a metric
535
+ * with name \p metricName. In this case \p *metric is undefined
536
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricName or \p
537
+ * metric are NULL.
538
+ */
539
+ CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
540
+ const char *metricName,
541
+ CUpti_MetricID *metric);
542
+
543
+ /**
544
+ * \brief Get number of events required to calculate a metric.
545
+ *
546
+ * Returns the number of events in \p numEvents that are required to
547
+ * calculate a metric.
548
+ *
549
+ * \param metric ID of the metric
550
+ * \param numEvents Returns the number of events required for the metric
551
+ *
552
+ * \retval CUPTI_SUCCESS
553
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
554
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
555
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
556
+ */
557
+ CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
558
+ uint32_t *numEvents);
559
+
560
+ /**
561
+ * \brief Get the events required to calculate a metric.
562
+ *
563
+ * Gets the event IDs in \p eventIdArray required to calculate a \p
564
+ * metric. The size of the \p eventIdArray buffer is given by \p
565
+ * *eventIdArraySizeBytes and must be at least \p numEvents *
566
+ * sizeof(CUpti_EventID) or all events will not be returned. The value
567
+ * returned in \p *eventIdArraySizeBytes contains the number of bytes
568
+ * returned in \p eventIdArray.
569
+ *
570
+ * \param metric ID of the metric
571
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
572
+ * and returns the number of bytes written to \p eventIdArray
573
+ * \param eventIdArray Returns the IDs of the events required to
574
+ * calculate \p metric
575
+ *
576
+ * \retval CUPTI_SUCCESS
577
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
578
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
579
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArraySizeBytes or \p
580
+ * eventIdArray are NULL.
581
+ */
582
+ CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
583
+ size_t *eventIdArraySizeBytes,
584
+ CUpti_EventID *eventIdArray);
585
+
586
+ /**
587
+ * \brief Get number of properties required to calculate a metric.
588
+ *
589
+ * Returns the number of properties in \p numProp that are required to
590
+ * calculate a metric.
591
+ *
592
+ * \param metric ID of the metric
593
+ * \param numProp Returns the number of properties required for the
594
+ * metric
595
+ *
596
+ * \retval CUPTI_SUCCESS
597
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
598
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
599
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numProp is NULL
600
+ */
601
+ CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
602
+ uint32_t *numProp);
603
+
604
+ /**
605
+ * \brief Get the properties required to calculate a metric.
606
+ *
607
+ * Gets the property IDs in \p propIdArray required to calculate a \p
608
+ * metric. The size of the \p propIdArray buffer is given by \p
609
+ * *propIdArraySizeBytes and must be at least \p numProp *
610
+ * sizeof(CUpti_MetricPropertyID) or not all properties will be
611
+ * returned. The value returned in \p *propIdArraySizeBytes contains
612
+ * the number of bytes returned in \p propIdArray.
613
+ *
614
+ * \param metric ID of the metric
615
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes,
616
+ * and returns the number of bytes written to \p propIdArray
617
+ * \param propIdArray Returns the IDs of the properties required to
618
+ * calculate \p metric
619
+ *
620
+ * \retval CUPTI_SUCCESS
621
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
622
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
623
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p propIdArraySizeBytes or \p
624
+ * propIdArray are NULL.
625
+ */
626
+ CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
627
+ size_t *propIdArraySizeBytes,
628
+ CUpti_MetricPropertyID *propIdArray);
629
+
630
+
631
+ /**
632
+ * \brief For a metric get the groups of events that must be collected
633
+ * in the same pass.
634
+ *
635
+ * For a metric get the groups of events that must be collected in the
636
+ * same pass to ensure that the metric is calculated correctly. If the
637
+ * events are not collected as specified then the metric value may be
638
+ * inaccurate.
639
+ *
640
+ * The function returns NULL if a metric does not have any required
641
+ * event group. In this case the events needed for the metric can be
642
+ * grouped in any manner for collection.
643
+ *
644
+ * \param context The context for event collection
645
+ * \param metric The metric ID
646
+ * \param eventGroupSets Returns a CUpti_EventGroupSets object that
647
+ * indicates the events that must be collected in the same pass to
648
+ * ensure the metric is calculated correctly. Returns NULL if no
649
+ * grouping is required for metric
650
+ * \retval CUPTI_SUCCESS
651
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
652
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
653
+ */
654
+ CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
655
+ CUpti_MetricID metric,
656
+ CUpti_EventGroupSets **eventGroupSets);
657
+
658
+ /**
659
+ * \brief For a set of metrics, get the grouping that indicates the
660
+ * number of passes and the event groups necessary to collect the
661
+ * events required for those metrics.
662
+ *
663
+ * For a set of metrics, get the grouping that indicates the number of
664
+ * passes and the event groups necessary to collect the events
665
+ * required for those metrics.
666
+ *
667
+ * \see cuptiEventGroupSetsCreate for details on event group set
668
+ * creation.
669
+ *
670
+ * \param context The context for event collection
671
+ * \param metricIdArraySizeBytes Size of the metricIdArray in bytes
672
+ * \param metricIdArray Array of metric IDs
673
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
674
+ * indicates the number of passes required to collect the events and
675
+ * the events to collect on each pass
676
+ *
677
+ * \retval CUPTI_SUCCESS
678
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
679
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
680
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
681
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricIdArray or
682
+ * \p eventGroupPasses is NULL
683
+ */
684
+ CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
685
+ size_t metricIdArraySizeBytes,
686
+ CUpti_MetricID *metricIdArray,
687
+ CUpti_EventGroupSets **eventGroupPasses);
688
+
689
+ /**
690
+ * \brief Calculate the value for a metric.
691
+ *
692
+ * Use the events collected for a metric to calculate the metric
693
+ * value. Metric value evaluation depends on the evaluation mode
694
+ * \ref CUpti_MetricEvaluationMode that the metric supports.
695
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
696
+ * then it assumes that the input event value is for one domain instance.
697
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE,
698
+ * it assumes that input event values are
699
+ * normalized to represent all domain instances on a device. For the
700
+ * most accurate metric collection, the events required for the metric
701
+ * should be collected for all profiled domain instances. For example,
702
+ * to collect all instances of an event, set the
703
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
704
+ * the group containing the event to 1. The normalized value for the
705
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
706
+ * instanceCount, where \p sum_event_values is the summation of the
707
+ * event values across all profiled domain instances, \p
708
+ * totalInstanceCount is obtained from querying
709
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
710
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
711
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
712
+ *
713
+ * \param device The CUDA device that the metric is being calculated for
714
+ * \param metric The metric ID
715
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
716
+ * \param eventIdArray The event IDs required to calculate \p metric
717
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
718
+ * \param eventValueArray The normalized event values required to
719
+ * calculate \p metric. The values must be ordered to match the order of
720
+ * events in \p eventIdArray
721
+ * \param timeDuration The duration over which the events were
722
+ * collected, in ns
723
+ * \param metricValue Returns the value for the metric
724
+ *
725
+ * \retval CUPTI_SUCCESS
726
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
727
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
728
+ * \retval CUPTI_ERROR_INVALID_OPERATION
729
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
730
+ * eventIdArray does not contain all the events needed for metric
731
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
732
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
733
+ * \retval CUPTI_ERROR_INVALID_METRIC_VALUE if the computed metric value
734
+ * cannot be represented in the metric's value type. For example,
735
+ * if the metric value type is unsigned and the computed metric value is negative
736
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
737
+ * \p eventIdArray or \p eventValueArray is NULL
738
+ */
739
+ CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
740
+ CUpti_MetricID metric,
741
+ size_t eventIdArraySizeBytes,
742
+ CUpti_EventID *eventIdArray,
743
+ size_t eventValueArraySizeBytes,
744
+ uint64_t *eventValueArray,
745
+ uint64_t timeDuration,
746
+ CUpti_MetricValue *metricValue);
747
+
748
+ /**
749
+ * \brief Calculate the value for a metric from events and metric properties.
750
+ *
751
+ * Use the events and properties collected for a metric to calculate
752
+ * the metric value. Metric value evaluation depends on the evaluation
753
+ * mode \ref CUpti_MetricEvaluationMode that the metric supports. If
754
+ * a metric has evaluation mode as
755
+ * CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, then it assumes that the
756
+ * input event value is for one domain instance. If a metric has
757
+ * evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, it
758
+ * assumes that input event values are normalized to represent all
759
+ * domain instances on a device. For the most accurate metric
760
+ * collection, the events required for the metric should be collected
761
+ * for all profiled domain instances. For example, to collect all
762
+ * instances of an event, set the
763
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
764
+ * the group containing the event to 1. The normalized value for the
765
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
766
+ * instanceCount, where \p sum_event_values is the summation of the
767
+ * event values across all profiled domain instances, \p
768
+ * totalInstanceCount is obtained from querying
769
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
770
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
771
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
772
+ *
773
+ * \param metric The metric ID
774
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
775
+ * \param eventIdArray The event IDs required to calculate \p metric
776
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
777
+ * \param eventValueArray The normalized event values required to
778
+ * calculate \p metric. The values must be ordered to match the order of
779
+ * events in \p eventIdArray
780
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes
781
+ * \param propIdArray The metric property IDs required to calculate \p metric
782
+ * \param propValueArraySizeBytes The size of \p propValueArray in bytes
783
+ * \param propValueArray The metric property values required to
784
+ * calculate \p metric. The values must be ordered to match the order of
785
+ * metric properties in \p propIdArray
786
+ * \param metricValue Returns the value for the metric
787
+ *
788
+ * \retval CUPTI_SUCCESS
789
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
790
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
791
+ * \retval CUPTI_ERROR_INVALID_OPERATION
792
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
793
+ * eventIdArray does not contain all the events needed for metric
794
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
795
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
796
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the computed metric value
797
+ * cannot be represented in the metric's value type. For example,
798
+ * if the metric value type is unsigned and the computed metric value is negative
799
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
800
+ * \p eventIdArray or \p eventValueArray is NULL
801
+ */
802
+ CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
803
+ size_t eventIdArraySizeBytes,
804
+ CUpti_EventID *eventIdArray,
805
+ size_t eventValueArraySizeBytes,
806
+ uint64_t *eventValueArray,
807
+ size_t propIdArraySizeBytes,
808
+ CUpti_MetricPropertyID *propIdArray,
809
+ size_t propValueArraySizeBytes,
810
+ uint64_t *propValueArray,
811
+ CUpti_MetricValue *metricValue);
812
+
813
+ /** @} */ /* END CUPTI_METRIC_API */
814
+
815
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
816
+ #pragma GCC visibility pop
817
+ #endif
818
+
819
+ #if defined(__cplusplus)
820
+ }
821
+ #endif
822
+
823
+ #endif /*_CUPTI_METRIC_H_*/
824
+
825
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // This file is generated. Any changes you make will be lost during the next clean build.
2
+
3
+ // CUDA public interface, for type definitions and api function prototypes
4
+ #include "cuda_gl_interop.h"
5
+
6
+ // *************************************************************************
7
+ // Definitions of structs to hold parameters for each function
8
+ // *************************************************************************
9
+
10
+ // Currently used parameter trace structures
11
+ typedef struct cudaGLGetDevices_v4010_params_st {
12
+ unsigned int *pCudaDeviceCount;
13
+ int *pCudaDevices;
14
+ unsigned int cudaDeviceCount;
15
+ enum cudaGLDeviceList deviceList;
16
+ } cudaGLGetDevices_v4010_params;
17
+
18
+ typedef struct cudaGraphicsGLRegisterImage_v3020_params_st {
19
+ struct cudaGraphicsResource **resource;
20
+ GLuint image;
21
+ GLenum target;
22
+ unsigned int flags;
23
+ } cudaGraphicsGLRegisterImage_v3020_params;
24
+
25
+ typedef struct cudaGraphicsGLRegisterBuffer_v3020_params_st {
26
+ struct cudaGraphicsResource **resource;
27
+ GLuint buffer;
28
+ unsigned int flags;
29
+ } cudaGraphicsGLRegisterBuffer_v3020_params;
30
+
31
+ typedef struct cudaGLSetGLDevice_v3020_params_st {
32
+ int device;
33
+ } cudaGLSetGLDevice_v3020_params;
34
+
35
+ typedef struct cudaGLRegisterBufferObject_v3020_params_st {
36
+ GLuint bufObj;
37
+ } cudaGLRegisterBufferObject_v3020_params;
38
+
39
+ typedef struct cudaGLMapBufferObject_v3020_params_st {
40
+ void **devPtr;
41
+ GLuint bufObj;
42
+ } cudaGLMapBufferObject_v3020_params;
43
+
44
+ typedef struct cudaGLUnmapBufferObject_v3020_params_st {
45
+ GLuint bufObj;
46
+ } cudaGLUnmapBufferObject_v3020_params;
47
+
48
+ typedef struct cudaGLUnregisterBufferObject_v3020_params_st {
49
+ GLuint bufObj;
50
+ } cudaGLUnregisterBufferObject_v3020_params;
51
+
52
+ typedef struct cudaGLSetBufferObjectMapFlags_v3020_params_st {
53
+ GLuint bufObj;
54
+ unsigned int flags;
55
+ } cudaGLSetBufferObjectMapFlags_v3020_params;
56
+
57
+ typedef struct cudaGLMapBufferObjectAsync_v3020_params_st {
58
+ void **devPtr;
59
+ GLuint bufObj;
60
+ cudaStream_t stream;
61
+ } cudaGLMapBufferObjectAsync_v3020_params;
62
+
63
+ typedef struct cudaGLUnmapBufferObjectAsync_v3020_params_st {
64
+ GLuint bufObj;
65
+ cudaStream_t stream;
66
+ } cudaGLUnmapBufferObjectAsync_v3020_params;
67
+
68
+ // Parameter trace structures for removed functions
69
+
70
+
71
+ // End of parameter trace structures
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (222 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/common_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h ADDED
@@ -0,0 +1,659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAEGL_H
51
+ #define CUDAEGL_H
52
+
53
+ #include "cuda.h"
54
+ #include "EGL/egl.h"
55
+ #include "EGL/eglext.h"
56
+
57
+
58
+ #ifdef CUDA_FORCE_API_VERSION
59
+ #error "CUDA_FORCE_API_VERSION is no longer supported."
60
+ #endif
61
+
62
+ #ifdef __cplusplus
63
+ extern "C" {
64
+ #endif
65
+
66
+ /**
67
+ * \addtogroup CUDA_TYPES
68
+ * @{
69
+ */
70
+
71
+ /**
72
+ * Maximum number of planes per frame
73
+ */
74
+ #define MAX_PLANES 3
75
+
76
+ /**
77
+ * CUDA EglFrame type - array or pointer
78
+ */
79
+ typedef enum CUeglFrameType_enum {
80
+ CU_EGL_FRAME_TYPE_ARRAY = 0, /**< Frame type CUDA array */
81
+ CU_EGL_FRAME_TYPE_PITCH = 1, /**< Frame type pointer */
82
+ } CUeglFrameType;
83
+
84
+ /**
85
+ * Indicates that timeout for ::cuEGLStreamConsumerAcquireFrame is infinite.
86
+ */
87
+ #define CUDA_EGL_INFINITE_TIMEOUT 0xFFFFFFFF
88
+
89
+ /**
90
+ * Resource location flags - sysmem or vidmem
91
+ *
92
+ * For CUDA context on iGPU, since video and system memory are equivalent -
93
+ * these flags will not have an effect on the execution.
94
+ *
95
+ * For CUDA context on dGPU, applications can use the flag ::CUeglResourceLocationFlags
96
+ * to give a hint about the desired location.
97
+ *
98
+ * ::CU_EGL_RESOURCE_LOCATION_SYSMEM - the frame data is made resident on the system memory
99
+ * to be accessed by CUDA.
100
+ *
101
+ * ::CU_EGL_RESOURCE_LOCATION_VIDMEM - the frame data is made resident on the dedicated
102
+ * video memory to be accessed by CUDA.
103
+ *
104
+ * There may be an additional latency due to new allocation and data migration,
105
+ * if the frame is produced on a different memory.
106
+
107
+ */
108
+ typedef enum CUeglResourceLocationFlags_enum {
109
+ CU_EGL_RESOURCE_LOCATION_SYSMEM = 0x00, /**< Resource location sysmem */
110
+ CU_EGL_RESOURCE_LOCATION_VIDMEM = 0x01 /**< Resource location vidmem */
111
+ } CUeglResourceLocationFlags;
112
+
113
+ /**
114
+ * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
115
+ * Three channel formats are currently not supported for ::CU_EGL_FRAME_TYPE_ARRAY
116
+ */
117
+ typedef enum CUeglColorFormat_enum {
118
+ CU_EGL_COLOR_FORMAT_YUV420_PLANAR = 0x00, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
119
+ CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR = 0x01, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
120
+ CU_EGL_COLOR_FORMAT_YUV422_PLANAR = 0x02, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
121
+ CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR = 0x03, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
122
+ CU_EGL_COLOR_FORMAT_RGB = 0x04, /**< R/G/B three channels in one surface with BGR byte ordering. Only pitch linear format supported. */
123
+ CU_EGL_COLOR_FORMAT_BGR = 0x05, /**< R/G/B three channels in one surface with RGB byte ordering. Only pitch linear format supported. */
124
+ CU_EGL_COLOR_FORMAT_ARGB = 0x06, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
125
+ CU_EGL_COLOR_FORMAT_RGBA = 0x07, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
126
+ CU_EGL_COLOR_FORMAT_L = 0x08, /**< single luminance channel in one surface. */
127
+ CU_EGL_COLOR_FORMAT_R = 0x09, /**< single color channel in one surface. */
128
+ CU_EGL_COLOR_FORMAT_YUV444_PLANAR = 0x0A, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
129
+ CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR = 0x0B, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
130
+ CU_EGL_COLOR_FORMAT_YUYV_422 = 0x0C, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
131
+ CU_EGL_COLOR_FORMAT_UYVY_422 = 0x0D, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
132
+ CU_EGL_COLOR_FORMAT_ABGR = 0x0E, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
133
+ CU_EGL_COLOR_FORMAT_BGRA = 0x0F, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
134
+ CU_EGL_COLOR_FORMAT_A = 0x10, /**< Alpha color format - one channel in one surface. */
135
+ CU_EGL_COLOR_FORMAT_RG = 0x11, /**< R/G color format - two channels in one surface with GR byte ordering */
136
+ CU_EGL_COLOR_FORMAT_AYUV = 0x12, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
137
+ CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR = 0x13, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
138
+ CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR = 0x14, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
139
+ CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR = 0x15, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
140
+ CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR = 0x16, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
141
+ CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR = 0x17, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
142
+ CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR = 0x18, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
143
+ CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR = 0x19, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
144
+ CU_EGL_COLOR_FORMAT_VYUY_ER = 0x1A, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
145
+ CU_EGL_COLOR_FORMAT_UYVY_ER = 0x1B, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
146
+ CU_EGL_COLOR_FORMAT_YUYV_ER = 0x1C, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
147
+ CU_EGL_COLOR_FORMAT_YVYU_ER = 0x1D, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
148
+ CU_EGL_COLOR_FORMAT_YUV_ER = 0x1E, /**< Extended Range Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
149
+ CU_EGL_COLOR_FORMAT_YUVA_ER = 0x1F, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
150
+ CU_EGL_COLOR_FORMAT_AYUV_ER = 0x20, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
151
+ CU_EGL_COLOR_FORMAT_YUV444_PLANAR_ER = 0x21, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
152
+ CU_EGL_COLOR_FORMAT_YUV422_PLANAR_ER = 0x22, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
153
+ CU_EGL_COLOR_FORMAT_YUV420_PLANAR_ER = 0x23, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
154
+ CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR_ER = 0x24, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
155
+ CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR_ER = 0x25, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
156
+ CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_ER = 0x26, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
157
+ CU_EGL_COLOR_FORMAT_YVU444_PLANAR_ER = 0x27, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
158
+ CU_EGL_COLOR_FORMAT_YVU422_PLANAR_ER = 0x28, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
159
+ CU_EGL_COLOR_FORMAT_YVU420_PLANAR_ER = 0x29, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
160
+ CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR_ER = 0x2A, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
161
+ CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR_ER = 0x2B, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
162
+ CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_ER = 0x2C, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
163
+ CU_EGL_COLOR_FORMAT_BAYER_RGGB = 0x2D, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
164
+ CU_EGL_COLOR_FORMAT_BAYER_BGGR = 0x2E, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
165
+ CU_EGL_COLOR_FORMAT_BAYER_GRBG = 0x2F, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
166
+ CU_EGL_COLOR_FORMAT_BAYER_GBRG = 0x30, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
167
+ CU_EGL_COLOR_FORMAT_BAYER10_RGGB = 0x31, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
168
+ CU_EGL_COLOR_FORMAT_BAYER10_BGGR = 0x32, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
169
+ CU_EGL_COLOR_FORMAT_BAYER10_GRBG = 0x33, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
170
+ CU_EGL_COLOR_FORMAT_BAYER10_GBRG = 0x34, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
171
+ CU_EGL_COLOR_FORMAT_BAYER12_RGGB = 0x35, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
172
+ CU_EGL_COLOR_FORMAT_BAYER12_BGGR = 0x36, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
173
+ CU_EGL_COLOR_FORMAT_BAYER12_GRBG = 0x37, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
174
+ CU_EGL_COLOR_FORMAT_BAYER12_GBRG = 0x38, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
175
+ CU_EGL_COLOR_FORMAT_BAYER14_RGGB = 0x39, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
176
+ CU_EGL_COLOR_FORMAT_BAYER14_BGGR = 0x3A, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
177
+ CU_EGL_COLOR_FORMAT_BAYER14_GRBG = 0x3B, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
178
+ CU_EGL_COLOR_FORMAT_BAYER14_GBRG = 0x3C, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
179
+ CU_EGL_COLOR_FORMAT_BAYER20_RGGB = 0x3D, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
180
+ CU_EGL_COLOR_FORMAT_BAYER20_BGGR = 0x3E, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
181
+ CU_EGL_COLOR_FORMAT_BAYER20_GRBG = 0x3F, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
182
+ CU_EGL_COLOR_FORMAT_BAYER20_GBRG = 0x40, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
183
+ CU_EGL_COLOR_FORMAT_YVU444_PLANAR = 0x41, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
184
+ CU_EGL_COLOR_FORMAT_YVU422_PLANAR = 0x42, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
185
+ CU_EGL_COLOR_FORMAT_YVU420_PLANAR = 0x43, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
186
+ CU_EGL_COLOR_FORMAT_BAYER_ISP_RGGB = 0x44, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
187
+ CU_EGL_COLOR_FORMAT_BAYER_ISP_BGGR = 0x45, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
188
+ CU_EGL_COLOR_FORMAT_BAYER_ISP_GRBG = 0x46, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
189
+ CU_EGL_COLOR_FORMAT_BAYER_ISP_GBRG = 0x47, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
190
+ CU_EGL_COLOR_FORMAT_BAYER_BCCR = 0x48, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
191
+ CU_EGL_COLOR_FORMAT_BAYER_RCCB = 0x49, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
192
+ CU_EGL_COLOR_FORMAT_BAYER_CRBC = 0x4A, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
193
+ CU_EGL_COLOR_FORMAT_BAYER_CBRC = 0x4B, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
194
+ CU_EGL_COLOR_FORMAT_BAYER10_CCCC = 0x4C, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
195
+ CU_EGL_COLOR_FORMAT_BAYER12_BCCR = 0x4D, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
196
+ CU_EGL_COLOR_FORMAT_BAYER12_RCCB = 0x4E, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
197
+ CU_EGL_COLOR_FORMAT_BAYER12_CRBC = 0x4F, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
198
+ CU_EGL_COLOR_FORMAT_BAYER12_CBRC = 0x50, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
199
+ CU_EGL_COLOR_FORMAT_BAYER12_CCCC = 0x51, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
200
+ CU_EGL_COLOR_FORMAT_Y = 0x52, /**< Color format for single Y plane. */
201
+ CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020 = 0x53, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
202
+ CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020 = 0x54, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
203
+ CU_EGL_COLOR_FORMAT_YUV420_PLANAR_2020 = 0x55, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height= 1/2 Y height. */
204
+ CU_EGL_COLOR_FORMAT_YVU420_PLANAR_2020 = 0x56, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height
205
+ = 1/2 Y height. */
206
+ CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709 = 0x57, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
207
+ CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709 = 0x58, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
208
+ CU_EGL_COLOR_FORMAT_YUV420_PLANAR_709 = 0x59, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height
209
+ = 1/2 Y height. */
210
+ CU_EGL_COLOR_FORMAT_YVU420_PLANAR_709 = 0x5A, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
211
+ CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709 = 0x5B, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
212
+ CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 = 0x5C, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
213
+ CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_2020 = 0x5D, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
214
+ CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR = 0x5E, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
215
+ CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_709 = 0x5F, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
216
+ CU_EGL_COLOR_FORMAT_Y_ER = 0x60, /**< Extended Range Color format for single Y plane. */
217
+ CU_EGL_COLOR_FORMAT_Y_709_ER = 0x61, /**< Extended Range Color format for single Y plane. */
218
+ CU_EGL_COLOR_FORMAT_Y10_ER = 0x62, /**< Extended Range Color format for single Y10 plane. */
219
+ CU_EGL_COLOR_FORMAT_Y10_709_ER = 0x63, /**< Extended Range Color format for single Y10 plane. */
220
+ CU_EGL_COLOR_FORMAT_Y12_ER = 0x64, /**< Extended Range Color format for single Y12 plane. */
221
+ CU_EGL_COLOR_FORMAT_Y12_709_ER = 0x65, /**< Extended Range Color format for single Y12 plane. */
222
+ CU_EGL_COLOR_FORMAT_YUVA = 0x66, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
223
+ CU_EGL_COLOR_FORMAT_YUV = 0x67, /**< Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
224
+ CU_EGL_COLOR_FORMAT_YVYU = 0x68, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
225
+ CU_EGL_COLOR_FORMAT_VYUY = 0x69, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
226
+ CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER = 0x6A, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
227
+ CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709_ER = 0x6B, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
228
+ CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_ER = 0x6C, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
229
+ CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_709_ER = 0x6D, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
230
+ CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_ER = 0x6E, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
231
+ CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_709_ER = 0x6F, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
232
+ CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_ER = 0x70, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
233
+ CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_709_ER = 0x71, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
234
+ CU_EGL_COLOR_FORMAT_MAX
235
+ } CUeglColorFormat;
236
+
237
+ /**
238
+ * CUDA EGLFrame structure Descriptor - structure defining one frame of EGL.
239
+ *
240
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
241
+ */
242
+ typedef struct CUeglFrame_st {
243
+ union {
244
+ CUarray pArray[MAX_PLANES]; /**< Array of CUarray corresponding to each plane*/
245
+ void* pPitch[MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
246
+ } frame;
247
+ unsigned int width; /**< Width of first plane */
248
+ unsigned int height; /**< Height of first plane */
249
+ unsigned int depth; /**< Depth of first plane */
250
+ unsigned int pitch; /**< Pitch of first plane */
251
+ unsigned int planeCount; /**< Number of planes */
252
+ unsigned int numChannels; /**< Number of channels for the plane */
253
+ CUeglFrameType frameType; /**< Array or Pitch */
254
+ CUeglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
255
+ CUarray_format cuFormat; /**< CUDA Array Format*/
256
+ } CUeglFrame_v1;
257
+ typedef CUeglFrame_v1 CUeglFrame;
258
+
259
+ /**
260
+ * CUDA EGLStream Connection
261
+ */
262
+ typedef struct CUeglStreamConnection_st* CUeglStreamConnection;
263
+
264
+ /** @} */ /* END CUDA_TYPES */
265
+
266
+ /**
267
+ * \file cudaEGL.h
268
+ * \brief Header file for the EGL interoperability functions of the
269
+ * low-level CUDA driver application programming interface.
270
+ */
271
+
272
+ /**
273
+ * \defgroup CUDA_EGL EGL Interoperability
274
+ * \ingroup CUDA_DRIVER
275
+ *
276
+ * ___MANBRIEF___ EGL interoperability functions of the low-level CUDA
277
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
278
+ *
279
+ * This section describes the EGL interoperability functions of the
280
+ * low-level CUDA driver application programming interface.
281
+ *
282
+ * @{
283
+ */
284
+
285
+ /**
286
+ * \brief Registers an EGL image
287
+ *
288
+ * Registers the EGLImageKHR specified by \p image for access by
289
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
290
+ * Additional Mapping/Unmapping is not required for the registered resource and
291
+ * ::cuGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
292
+ *
293
+ * The application will be responsible for synchronizing access to shared objects.
294
+ * The application must ensure that any pending operations which access the objects have completed
295
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
296
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
297
+ * The application will also be responsible for ensuring that any pending operation on the
298
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
299
+ * accessing the same memory objects.
300
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
301
+ *
302
+ * The surface's intended usage is specified using \p flags, as follows:
303
+ *
304
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
305
+ * resource will be used. It is therefore assumed that this resource will be
306
+ * read from and written to by CUDA. This is the default value.
307
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
308
+ * will not write to this resource.
309
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
310
+ * CUDA will not read from this resource and will write over the
311
+ * entire contents of the resource, so none of the data previously
312
+ * stored in the resource will be preserved.
313
+ *
314
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
315
+ * typedef void* EGLImageKHR
316
+ *
317
+ * \param pCudaResource - Pointer to the returned object handle
318
+ * \param image - An EGLImageKHR image which can be used to create target resource.
319
+ * \param flags - Map flags
320
+ *
321
+ * \return
322
+ * ::CUDA_SUCCESS,
323
+ * ::CUDA_ERROR_INVALID_HANDLE,
324
+ * ::CUDA_ERROR_ALREADY_MAPPED,
325
+ * ::CUDA_ERROR_INVALID_CONTEXT,
326
+ *
327
+ * \sa ::cuGraphicsEGLRegisterImage, ::cuGraphicsUnregisterResource,
328
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
329
+ * ::cuGraphicsUnmapResources,
330
+ * ::cudaGraphicsEGLRegisterImage
331
+ */
332
+ CUresult CUDAAPI cuGraphicsEGLRegisterImage(CUgraphicsResource *pCudaResource, EGLImageKHR image, unsigned int flags);
333
+
334
+ /**
335
+ * \brief Connect CUDA to EGLStream as a consumer.
336
+ *
337
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p stream.
338
+ *
339
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
340
+ * API to another.
341
+ *
342
+ * \param conn - Pointer to the returned connection handle
343
+ * \param stream - EGLStreamKHR handle
344
+ *
345
+ * \return
346
+ * ::CUDA_SUCCESS,
347
+ * ::CUDA_ERROR_INVALID_HANDLE,
348
+ * ::CUDA_ERROR_INVALID_CONTEXT,
349
+ *
350
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
351
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
352
+ * ::cudaEGLStreamConsumerConnect
353
+ */
354
+ CUresult CUDAAPI cuEGLStreamConsumerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream);
355
+
356
+ /**
357
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
358
+ *
359
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p stream with specified \p flags defined by CUeglResourceLocationFlags.
360
+ *
361
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
362
+ * Default is ::CU_EGL_RESOURCE_LOCATION_VIDMEM.
363
+ *
364
+ * \param conn - Pointer to the returned connection handle
365
+ * \param stream - EGLStreamKHR handle
366
+ * \param flags - Flags denote intended location - system or video.
367
+ *
368
+ * \return
369
+ * ::CUDA_SUCCESS,
370
+ * ::CUDA_ERROR_INVALID_HANDLE,
371
+ * ::CUDA_ERROR_INVALID_CONTEXT,
372
+ *
373
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
374
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
375
+ * ::cudaEGLStreamConsumerConnectWithFlags
376
+ */
377
+
378
+ CUresult CUDAAPI cuEGLStreamConsumerConnectWithFlags(CUeglStreamConnection *conn, EGLStreamKHR stream, unsigned int flags);
379
+
380
+ /**
381
+ * \brief Disconnect CUDA as a consumer to EGLStream.
382
+ *
383
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
384
+ *
385
+ * \param conn - Connection to disconnect.
386
+ *
387
+ * \return
388
+ * ::CUDA_SUCCESS,
389
+ * ::CUDA_ERROR_INVALID_HANDLE,
390
+ * ::CUDA_ERROR_INVALID_CONTEXT,
391
+ *
392
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
393
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
394
+ * ::cudaEGLStreamConsumerDisconnect
395
+ */
396
+ CUresult CUDAAPI cuEGLStreamConsumerDisconnect(CUeglStreamConnection *conn);
397
+
398
+ /**
399
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
400
+ *
401
+ * Acquire an image frame from EGLStreamKHR. This API can also acquire an old frame presented
402
+ * by the producer unless explicitly disabled by setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE
403
+ * during stream initialization. By default, EGLStream is created with this flag set to EGL_TRUE.
404
+ * ::cuGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
405
+ * ::CUeglFrame.
406
+ *
407
+ * \param conn - Connection on which to acquire
408
+ * \param pCudaResource - CUDA resource on which the stream frame will be mapped for use.
409
+ * \param pStream - CUDA stream for synchronization and any data migrations
410
+ * implied by ::CUeglResourceLocationFlags.
411
+ * \param timeout - Desired timeout in usec for a new frame to be acquired.
412
+ * If set as ::CUDA_EGL_INFINITE_TIMEOUT, acquire waits infinitely.
413
+ * After timeout occurs CUDA consumer tries to acquire an old frame
414
+ * if available and EGL_SUPPORT_REUSE_NV flag is set.
415
+ *
416
+ * \return
417
+ * ::CUDA_SUCCESS,
418
+ * ::CUDA_ERROR_INVALID_HANDLE,
419
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
420
+ *
421
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
422
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
423
+ * ::cudaEGLStreamConsumerAcquireFrame
424
+ */
425
+ CUresult CUDAAPI cuEGLStreamConsumerAcquireFrame(CUeglStreamConnection *conn,
426
+ CUgraphicsResource *pCudaResource, CUstream *pStream, unsigned int timeout);
427
+ /**
428
+ * \brief Releases the last frame acquired from the EGLStream.
429
+ *
430
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
431
+ * If EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE, at the time of EGL creation
432
+ * this API doesn't release the last frame acquired on the EGLStream.
433
+ * By default, EGLStream is created with this flag set to EGL_TRUE.
434
+ *
435
+ * \param conn - Connection on which to release
436
+ * \param pCudaResource - CUDA resource whose corresponding frame is to be released
437
+ * \param pStream - CUDA stream on which release will be done.
438
+ *
439
+ * \return
440
+ * ::CUDA_SUCCESS,
441
+ * ::CUDA_ERROR_INVALID_HANDLE,
442
+ *
443
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
444
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
445
+ * ::cudaEGLStreamConsumerReleaseFrame
446
+ */
447
+ CUresult CUDAAPI cuEGLStreamConsumerReleaseFrame(CUeglStreamConnection *conn,
448
+ CUgraphicsResource pCudaResource, CUstream *pStream);
449
+
450
+ /**
451
+ * \brief Connect CUDA to EGLStream as a producer.
452
+ *
453
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p stream.
454
+ *
455
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
456
+ * API to another.
457
+ *
458
+ * \param conn - Pointer to the returned connection handle
459
+ * \param stream - EGLStreamKHR handle
460
+ * \param width - width of the image to be submitted to the stream
461
+ * \param height - height of the image to be submitted to the stream
462
+ *
463
+ * \return
464
+ * ::CUDA_SUCCESS,
465
+ * ::CUDA_ERROR_INVALID_HANDLE,
466
+ * ::CUDA_ERROR_INVALID_CONTEXT,
467
+ *
468
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
469
+ * ::cuEGLStreamProducerPresentFrame,
470
+ * ::cudaEGLStreamProducerConnect
471
+ */
472
+ CUresult CUDAAPI cuEGLStreamProducerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream,
473
+ EGLint width, EGLint height);
474
+
475
+ /**
476
+ * \brief Disconnect CUDA as a producer to EGLStream.
477
+ *
478
+ * Disconnect CUDA as a producer to EGLStreamKHR.
479
+ *
480
+ * \param conn - Connection to disconnect.
481
+ *
482
+ * \return
483
+ * ::CUDA_SUCCESS,
484
+ * ::CUDA_ERROR_INVALID_HANDLE,
485
+ * ::CUDA_ERROR_INVALID_CONTEXT,
486
+ *
487
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
488
+ * ::cuEGLStreamProducerPresentFrame,
489
+ * ::cudaEGLStreamProducerDisconnect
490
+ */
491
+ CUresult CUDAAPI cuEGLStreamProducerDisconnect(CUeglStreamConnection *conn);
492
+
493
+ /**
494
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
495
+ *
496
+ * When a frame is presented by the producer, it gets associated with the EGLStream
497
+ * and thus it is illegal to free the frame before the producer is disconnected.
498
+ * If a frame is freed and reused it may lead to undefined behavior.
499
+ *
500
+ * If producer and consumer are on different GPUs (iGPU and dGPU) then frametype
501
+ * ::CU_EGL_FRAME_TYPE_ARRAY is not supported. ::CU_EGL_FRAME_TYPE_PITCH can be used for
502
+ * such cross-device applications.
503
+ *
504
+ * The ::CUeglFrame is defined as:
505
+ * \code
506
+ * typedef struct CUeglFrame_st {
507
+ * union {
508
+ * CUarray pArray[MAX_PLANES];
509
+ * void* pPitch[MAX_PLANES];
510
+ * } frame;
511
+ * unsigned int width;
512
+ * unsigned int height;
513
+ * unsigned int depth;
514
+ * unsigned int pitch;
515
+ * unsigned int planeCount;
516
+ * unsigned int numChannels;
517
+ * CUeglFrameType frameType;
518
+ * CUeglColorFormat eglColorFormat;
519
+ * CUarray_format cuFormat;
520
+ * } CUeglFrame;
521
+ * \endcode
522
+ *
523
+ * For ::CUeglFrame of type ::CU_EGL_FRAME_TYPE_PITCH, the application may present sub-region of a memory
524
+ * allocation. In that case, the pitched pointer will specify the start address of the sub-region in
525
+ * the allocation and corresponding ::CUeglFrame fields will specify the dimensions of the sub-region.
526
+ *
527
+ * \param conn - Connection on which to present the CUDA array
528
+ * \param eglframe - CUDA Eglstream Producer Frame handle to be sent to the consumer over EglStream.
529
+ * \param pStream - CUDA stream on which to present the frame.
530
+ *
531
+ * \return
532
+ * ::CUDA_SUCCESS,
533
+ * ::CUDA_ERROR_INVALID_HANDLE,
534
+ *
535
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
536
+ * ::cuEGLStreamProducerReturnFrame,
537
+ * ::cudaEGLStreamProducerPresentFrame
538
+ */
539
+ CUresult CUDAAPI cuEGLStreamProducerPresentFrame(CUeglStreamConnection *conn,
540
+ CUeglFrame eglframe, CUstream *pStream);
541
+
542
+ /**
543
+ * \brief Return the CUDA eglFrame to the EGLStream released by the consumer.
544
+ *
545
+ * This API can potentially return CUDA_ERROR_LAUNCH_TIMEOUT if the consumer has not
546
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
547
+ *
548
+ * \param conn - Connection on which to return
549
+ * \param eglframe - CUDA Eglstream Producer Frame handle returned from the consumer over EglStream.
550
+ * \param pStream - CUDA stream on which to return the frame.
551
+ *
552
+ * \return
553
+ * ::CUDA_SUCCESS,
554
+ * ::CUDA_ERROR_INVALID_HANDLE,
555
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT
556
+ *
557
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
558
+ * ::cuEGLStreamProducerPresentFrame,
559
+ * ::cudaEGLStreamProducerReturnFrame
560
+ */
561
+ CUresult CUDAAPI cuEGLStreamProducerReturnFrame(CUeglStreamConnection *conn,
562
+ CUeglFrame *eglframe, CUstream *pStream);
563
+
564
+ /**
565
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
566
+ *
567
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
568
+ * \p resource may be accessed.
569
+ * This API can only be called for registered EGL graphics resources.
570
+ *
571
+ * The ::CUeglFrame is defined as:
572
+ * \code
573
+ * typedef struct CUeglFrame_st {
574
+ * union {
575
+ * CUarray pArray[MAX_PLANES];
576
+ * void* pPitch[MAX_PLANES];
577
+ * } frame;
578
+ * unsigned int width;
579
+ * unsigned int height;
580
+ * unsigned int depth;
581
+ * unsigned int pitch;
582
+ * unsigned int planeCount;
583
+ * unsigned int numChannels;
584
+ * CUeglFrameType frameType;
585
+ * CUeglColorFormat eglColorFormat;
586
+ * CUarray_format cuFormat;
587
+ * } CUeglFrame;
588
+ * \endcode
589
+ *
590
+ * If \p resource is not registered then ::CUDA_ERROR_NOT_MAPPED is returned.
591
+ *
592
+ * \param eglFrame - Returned eglFrame.
593
+ * \param resource - Registered resource to access.
594
+ * \param index - Index for cubemap surfaces.
595
+ * \param mipLevel - Mipmap level for the subresource to access.
596
+ *
597
+ * \return
598
+ * ::CUDA_SUCCESS,
599
+ * ::CUDA_ERROR_DEINITIALIZED,
600
+ * ::CUDA_ERROR_NOT_INITIALIZED,
601
+ * ::CUDA_ERROR_INVALID_CONTEXT,
602
+ * ::CUDA_ERROR_INVALID_VALUE,
603
+ * ::CUDA_ERROR_INVALID_HANDLE,
604
+ * ::CUDA_ERROR_NOT_MAPPED
605
+ *
606
+ * \sa
607
+ * ::cuGraphicsMapResources,
608
+ * ::cuGraphicsSubResourceGetMappedArray,
609
+ * ::cuGraphicsResourceGetMappedPointer,
610
+ * ::cudaGraphicsResourceGetMappedEglFrame
611
+ */
612
+ CUresult CUDAAPI cuGraphicsResourceGetMappedEglFrame(CUeglFrame* eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
613
+
614
+ /**
615
+ * \brief Creates an event from EGLSync object
616
+ *
617
+ * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
618
+ * via \p flags. Valid flags include:
619
+ * - ::CU_EVENT_DEFAULT: Default event creation flag.
620
+ * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
621
+ * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on
622
+ * an event created with this flag will block until the event has actually
623
+ * been completed.
624
+ *
625
+ * Once the \p eglSync gets destroyed, ::cuEventDestroy is the only API
626
+ * that can be invoked on the event.
627
+ *
628
+ * ::cuEventRecord and TimingData are not supported for events created from EGLSync.
629
+ *
630
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
631
+ * typedef void* EGLSyncKHR
632
+ *
633
+ * \param phEvent - Returns newly created event
634
+ * \param eglSync - Opaque handle to EGLSync object
635
+ * \param flags - Event creation flags
636
+ *
637
+ * \return
638
+ * ::CUDA_SUCCESS,
639
+ * ::CUDA_ERROR_DEINITIALIZED,
640
+ * ::CUDA_ERROR_NOT_INITIALIZED,
641
+ * ::CUDA_ERROR_INVALID_CONTEXT,
642
+ * ::CUDA_ERROR_INVALID_VALUE,
643
+ * ::CUDA_ERROR_OUT_OF_MEMORY
644
+ *
645
+ * \sa
646
+ * ::cuEventQuery,
647
+ * ::cuEventSynchronize,
648
+ * ::cuEventDestroy
649
+ */
650
+ CUresult CUDAAPI cuEventCreateFromEGLSync(CUevent *phEvent, EGLSyncKHR eglSync, unsigned int flags);
651
+
652
+ /** @} */ /* END CUDA_EGL */
653
+
654
+ #ifdef __cplusplus
655
+ };
656
+ #endif
657
+
658
+ #endif
659
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_EGL_INTEROP_H__)
51
+ #define __CUDA_EGL_INTEROP_H__
52
+
53
+ #include "cuda_runtime_api.h"
54
+ #include "cuda_runtime.h"
55
+ #include "cudart_platform.h"
56
+ #include "EGL/egl.h"
57
+ #include "EGL/eglext.h"
58
+
59
+ #if defined(__cplusplus)
60
+ extern "C" {
61
+ #endif /* __cplusplus */
62
+
63
+ /**
64
+ * \addtogroup CUDART_TYPES
65
+ * @{
66
+ */
67
+
68
+ /**
69
+ * Maximum number of planes per frame
70
+ */
71
+ #define CUDA_EGL_MAX_PLANES 3
72
+
73
+ /**
74
+ * CUDA EglFrame type - array or pointer
75
+ */
76
+ typedef enum cudaEglFrameType_enum
77
+ {
78
+ cudaEglFrameTypeArray = 0, /**< Frame type CUDA array */
79
+ cudaEglFrameTypePitch = 1, /**< Frame type CUDA pointer */
80
+ } cudaEglFrameType;
81
+
82
+ /**
83
+ * Resource location flags - sysmem or vidmem
84
+ *
85
+ * For CUDA context on iGPU, since video and system memory are equivalent -
86
+ * these flags will not have an effect on the execution.
87
+ *
88
+ * For CUDA context on dGPU, applications can use the flag ::cudaEglResourceLocationFlags
89
+ * to give a hint about the desired location.
90
+ *
91
+ * ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory
92
+ * to be accessed by CUDA.
93
+ *
94
+ * ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated
95
+ * video memory to be accessed by CUDA.
96
+ *
97
+ * There may be an additional latency due to new allocation and data migration,
98
+ * if the frame is produced on a different memory.
99
+ */
100
+ typedef enum cudaEglResourceLocationFlags_enum {
101
+ cudaEglResourceLocationSysmem = 0x00, /**< Resource location sysmem */
102
+ cudaEglResourceLocationVidmem = 0x01, /**< Resource location vidmem */
103
+ } cudaEglResourceLocationFlags;
104
+
105
+ /**
106
+ * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
107
+ */
108
+ typedef enum cudaEglColorFormat_enum {
109
+ cudaEglColorFormatYUV420Planar = 0, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
110
+ cudaEglColorFormatYUV420SemiPlanar = 1, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
111
+ cudaEglColorFormatYUV422Planar = 2, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
112
+ cudaEglColorFormatYUV422SemiPlanar = 3, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
113
+ cudaEglColorFormatARGB = 6, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
114
+ cudaEglColorFormatRGBA = 7, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
115
+ cudaEglColorFormatL = 8, /**< single luminance channel in one surface. */
116
+ cudaEglColorFormatR = 9, /**< single color channel in one surface. */
117
+ cudaEglColorFormatYUV444Planar = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
118
+ cudaEglColorFormatYUV444SemiPlanar = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
119
+ cudaEglColorFormatYUYV422 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
120
+ cudaEglColorFormatUYVY422 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
121
+ cudaEglColorFormatABGR = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
122
+ cudaEglColorFormatBGRA = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
123
+ cudaEglColorFormatA = 16, /**< Alpha color format - one channel in one surface. */
124
+ cudaEglColorFormatRG = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
125
+ cudaEglColorFormatAYUV = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
126
+ cudaEglColorFormatYVU444SemiPlanar = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
127
+ cudaEglColorFormatYVU422SemiPlanar = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
128
+ cudaEglColorFormatYVU420SemiPlanar = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
129
+ cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
130
+ cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
131
+ cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
132
+ cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
133
+ cudaEglColorFormatVYUY_ER = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
134
+ cudaEglColorFormatUYVY_ER = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
135
+ cudaEglColorFormatYUYV_ER = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
136
+ cudaEglColorFormatYVYU_ER = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
137
+ cudaEglColorFormatYUVA_ER = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
138
+ cudaEglColorFormatAYUV_ER = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
139
+ cudaEglColorFormatYUV444Planar_ER = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
140
+ cudaEglColorFormatYUV422Planar_ER = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
141
+ cudaEglColorFormatYUV420Planar_ER = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
142
+ cudaEglColorFormatYUV444SemiPlanar_ER = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
143
+ cudaEglColorFormatYUV422SemiPlanar_ER = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
144
+ cudaEglColorFormatYUV420SemiPlanar_ER = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
145
+ cudaEglColorFormatYVU444Planar_ER = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
146
+ cudaEglColorFormatYVU422Planar_ER = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
147
+ cudaEglColorFormatYVU420Planar_ER = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
148
+ cudaEglColorFormatYVU444SemiPlanar_ER = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
149
+ cudaEglColorFormatYVU422SemiPlanar_ER = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
150
+ cudaEglColorFormatYVU420SemiPlanar_ER = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
151
+ cudaEglColorFormatBayerRGGB = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
152
+ cudaEglColorFormatBayerBGGR = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
153
+ cudaEglColorFormatBayerGRBG = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
154
+ cudaEglColorFormatBayerGBRG = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
155
+ cudaEglColorFormatBayer10RGGB = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
156
+ cudaEglColorFormatBayer10BGGR = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
157
+ cudaEglColorFormatBayer10GRBG = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
158
+ cudaEglColorFormatBayer10GBRG = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
159
+ cudaEglColorFormatBayer12RGGB = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
160
+ cudaEglColorFormatBayer12BGGR = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
161
+ cudaEglColorFormatBayer12GRBG = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
162
+ cudaEglColorFormatBayer12GBRG = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
163
+ cudaEglColorFormatBayer14RGGB = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
164
+ cudaEglColorFormatBayer14BGGR = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
165
+ cudaEglColorFormatBayer14GRBG = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
166
+ cudaEglColorFormatBayer14GBRG = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
167
+ cudaEglColorFormatBayer20RGGB = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
168
+ cudaEglColorFormatBayer20BGGR = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
169
+ cudaEglColorFormatBayer20GRBG = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
170
+ cudaEglColorFormatBayer20GBRG = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
171
+ cudaEglColorFormatYVU444Planar = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
172
+ cudaEglColorFormatYVU422Planar = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
173
+ cudaEglColorFormatYVU420Planar = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
174
+ cudaEglColorFormatBayerIspRGGB = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
175
+ cudaEglColorFormatBayerIspBGGR = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
176
+ cudaEglColorFormatBayerIspGRBG = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
177
+ cudaEglColorFormatBayerIspGBRG = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
178
+ cudaEglColorFormatBayerBCCR = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
179
+ cudaEglColorFormatBayerRCCB = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
180
+ cudaEglColorFormatBayerCRBC = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
181
+ cudaEglColorFormatBayerCBRC = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
182
+ cudaEglColorFormatBayer10CCCC = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
183
+ cudaEglColorFormatBayer12BCCR = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
184
+ cudaEglColorFormatBayer12RCCB = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
185
+ cudaEglColorFormatBayer12CRBC = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
186
+ cudaEglColorFormatBayer12CBRC = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
187
+ cudaEglColorFormatBayer12CCCC = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
188
+ cudaEglColorFormatY = 82, /**< Color format for single Y plane. */
189
+ cudaEglColorFormatYUV420SemiPlanar_2020 = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
190
+ cudaEglColorFormatYVU420SemiPlanar_2020 = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
191
+ cudaEglColorFormatYUV420Planar_2020 = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
192
+ cudaEglColorFormatYVU420Planar_2020 = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
193
+ cudaEglColorFormatYUV420SemiPlanar_709 = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
194
+ cudaEglColorFormatYVU420SemiPlanar_709 = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
195
+ cudaEglColorFormatYUV420Planar_709 = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
196
+ cudaEglColorFormatYVU420Planar_709 = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
197
+ cudaEglColorFormatY10V10U10_420SemiPlanar_709 = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
198
+ cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
199
+ cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
200
+ cudaEglColorFormatY10V10U10_422SemiPlanar = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
201
+ cudaEglColorFormatY10V10U10_422SemiPlanar_709 = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
202
+ cudaEglColorFormatY_ER = 96, /**< Extended Range Color format for single Y plane. */
203
+ cudaEglColorFormatY_709_ER = 97, /**< Extended Range Color format for single Y plane. */
204
+ cudaEglColorFormatY10_ER = 98, /**< Extended Range Color format for single Y10 plane. */
205
+ cudaEglColorFormatY10_709_ER = 99, /**< Extended Range Color format for single Y10 plane. */
206
+ cudaEglColorFormatY12_ER = 100, /**< Extended Range Color format for single Y12 plane. */
207
+ cudaEglColorFormatY12_709_ER = 101, /**< Extended Range Color format for single Y12 plane. */
208
+ cudaEglColorFormatYUVA = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
209
+ cudaEglColorFormatYVYU = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
210
+ cudaEglColorFormatVYUY = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
211
+ cudaEglColorFormatY10V10U10_420SemiPlanar_ER = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
212
+ cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
213
+ cudaEglColorFormatY10V10U10_444SemiPlanar_ER = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
214
+ cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
215
+ cudaEglColorFormatY12V12U12_420SemiPlanar_ER = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
216
+ cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
217
+ cudaEglColorFormatY12V12U12_444SemiPlanar_ER = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
218
+ cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
219
+ } cudaEglColorFormat;
220
+
221
+ /**
222
+ * CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
223
+ */
224
+ typedef struct cudaEglPlaneDesc_st {
225
+ unsigned int width; /**< Width of plane */
226
+ unsigned int height; /**< Height of plane */
227
+ unsigned int depth; /**< Depth of plane */
228
+ unsigned int pitch; /**< Pitch of plane */
229
+ unsigned int numChannels; /**< Number of channels for the plane */
230
+ struct cudaChannelFormatDesc channelDesc; /**< Channel Format Descriptor */
231
+ unsigned int reserved[4]; /**< Reserved for future use */
232
+ } cudaEglPlaneDesc;
233
+
234
+ /**
235
+ * CUDA EGLFrame Descriptor - structure defining one frame of EGL.
236
+ *
237
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
238
+ * Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
239
+ * \code
240
+ * typedef struct cudaEglPlaneDesc_st {
241
+ * unsigned int width;
242
+ * unsigned int height;
243
+ * unsigned int depth;
244
+ * unsigned int pitch;
245
+ * unsigned int numChannels;
246
+ * struct cudaChannelFormatDesc channelDesc;
247
+ * unsigned int reserved[4];
248
+ * } cudaEglPlaneDesc;
249
+ * \endcode
250
+
251
+ */
252
+ typedef struct cudaEglFrame_st {
253
+ union {
254
+ cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; /**< Array of CUDA arrays corresponding to each plane*/
255
+ struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
256
+ } frame;
257
+ cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
258
+ unsigned int planeCount; /**< Number of planes */
259
+ cudaEglFrameType frameType; /**< Array or Pitch */
260
+ cudaEglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
261
+ } cudaEglFrame;
262
+
263
+ /**
264
+ * CUDA EGLStream Connection
265
+ */
266
+ typedef struct CUeglStreamConnection_st *cudaEglStreamConnection;
267
+
268
+ /** @} */ /* END CUDART_TYPES */
269
+
270
+ /**
271
+ * \addtogroup CUDART_EGL EGL Interoperability
272
+ * This section describes the EGL interoperability functions of the CUDA
273
+ * runtime application programming interface.
274
+ *
275
+ * @{
276
+ */
277
+
278
+ /**
279
+ * \brief Registers an EGL image
280
+ *
281
+ * Registers the EGLImageKHR specified by \p image for access by
282
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
283
+ * Additional Mapping/Unmapping is not required for the registered resource and
284
+ * ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
285
+ *
286
+ * The application will be responsible for synchronizing access to shared objects.
287
+ * The application must ensure that any pending operations which access the objects have completed
288
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
289
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
290
+ * The application will also be responsible for ensuring that any pending operation on the
291
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
292
+ * accessing the same memory objects.
293
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
294
+ *
295
+ * The surface's intended usage is specified using \p flags, as follows:
296
+ *
297
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
298
+ * resource will be used. It is therefore assumed that this resource will be
299
+ * read from and written to by CUDA. This is the default value.
300
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
301
+ * will not write to this resource.
302
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
303
+ * CUDA will not read from this resource and will write over the
304
+ * entire contents of the resource, so none of the data previously
305
+ * stored in the resource will be preserved.
306
+ *
307
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
308
+ * typedef void* EGLImageKHR
309
+ *
310
+ * \param pCudaResource - Pointer to the returned object handle
311
+ * \param image - An EGLImageKHR image which can be used to create target resource.
312
+ * \param flags - Map flags
313
+ *
314
+ * \return
315
+ * ::cudaSuccess,
316
+ * ::cudaErrorInvalidResourceHandle,
317
+ * ::cudaErrorInvalidValue,
318
+ * ::cudaErrorUnknown
319
+ *
320
+ * \sa
321
+ * ::cudaGraphicsUnregisterResource,
322
+ * ::cudaGraphicsResourceGetMappedEglFrame,
323
+ * ::cuGraphicsEGLRegisterImage
324
+ */
325
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
326
+
327
+ /**
328
+ * \brief Connect CUDA to EGLStream as a consumer.
329
+ *
330
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
331
+ *
332
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
333
+ * API to another.
334
+ *
335
+ * \param conn - Pointer to the returned connection handle
336
+ * \param eglStream - EGLStreamKHR handle
337
+ *
338
+ * \return
339
+ * ::cudaSuccess,
340
+ * ::cudaErrorInvalidValue,
341
+ * ::cudaErrorUnknown
342
+ *
343
+ * \sa
344
+ * ::cudaEGLStreamConsumerDisconnect,
345
+ * ::cudaEGLStreamConsumerAcquireFrame,
346
+ * ::cudaEGLStreamConsumerReleaseFrame,
347
+ * ::cuEGLStreamConsumerConnect
348
+ */
349
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
350
+
351
+ /**
352
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
353
+ *
354
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream with specified \p flags defined by
355
+ * ::cudaEglResourceLocationFlags.
356
+ *
357
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
358
+ * Default is ::cudaEglResourceLocationVidmem.
359
+ *
360
+ * \param conn - Pointer to the returned connection handle
361
+ * \param eglStream - EGLStreamKHR handle
362
+ * \param flags - Flags denote intended location - system or video.
363
+ *
364
+ * \return
365
+ * ::cudaSuccess,
366
+ * ::cudaErrorInvalidValue,
367
+ * ::cudaErrorUnknown
368
+ *
369
+ * \sa
370
+ * ::cudaEGLStreamConsumerDisconnect,
371
+ * ::cudaEGLStreamConsumerAcquireFrame,
372
+ * ::cudaEGLStreamConsumerReleaseFrame,
373
+ * ::cuEGLStreamConsumerConnectWithFlags
374
+ */
375
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
376
+
377
+ /**
378
+ * \brief Disconnect CUDA as a consumer to EGLStream.
379
+ *
380
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
381
+ *
382
+ * \param conn - Connection to disconnect.
383
+ *
384
+ * \return
385
+ * ::cudaSuccess,
386
+ * ::cudaErrorInvalidValue,
387
+ * ::cudaErrorUnknown
388
+ *
389
+ * \sa
390
+ * ::cudaEGLStreamConsumerConnect,
391
+ * ::cudaEGLStreamConsumerAcquireFrame,
392
+ * ::cudaEGLStreamConsumerReleaseFrame,
393
+ * ::cuEGLStreamConsumerDisconnect
394
+ */
395
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
396
+
397
+ /**
398
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
399
+ *
400
+ * Acquire an image frame from EGLStreamKHR.
401
+ * ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
402
+ * ::cudaEglFrame.
403
+ *
404
+ * \param conn - Connection on which to acquire
405
+ * \param pCudaResource - CUDA resource on which the EGLStream frame will be mapped for use.
406
+ * \param pStream - CUDA stream for synchronization and any data migrations
407
+ * implied by ::cudaEglResourceLocationFlags.
408
+ * \param timeout - Desired timeout in usec.
409
+ *
410
+ * \return
411
+ * ::cudaSuccess,
412
+ * ::cudaErrorInvalidValue,
413
+ * ::cudaErrorUnknown,
414
+ * ::cudaErrorLaunchTimeout
415
+ *
416
+ * \sa
417
+ * ::cudaEGLStreamConsumerConnect,
418
+ * ::cudaEGLStreamConsumerDisconnect,
419
+ * ::cudaEGLStreamConsumerReleaseFrame,
420
+ * ::cuEGLStreamConsumerAcquireFrame
421
+ */
422
+
423
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
424
+ cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
425
+ /**
426
+ * \brief Releases the last frame acquired from the EGLStream.
427
+ *
428
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
429
+ *
430
+ * \param conn - Connection on which to release
431
+ * \param pCudaResource - CUDA resource whose corresponding frame is to be released
432
+ * \param pStream - CUDA stream on which release will be done.
433
+ *
434
+ * \return
435
+ * ::cudaSuccess,
436
+ * ::cudaErrorInvalidValue,
437
+ * ::cudaErrorUnknown
438
+ *
439
+ * \sa
440
+ * ::cudaEGLStreamConsumerConnect,
441
+ * ::cudaEGLStreamConsumerDisconnect,
442
+ * ::cudaEGLStreamConsumerAcquireFrame,
443
+ * ::cuEGLStreamConsumerReleaseFrame
444
+ */
445
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
446
+ cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
447
+
448
+ /**
449
+ * \brief Connect CUDA to EGLStream as a producer.
450
+ *
451
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p eglStream.
452
+ *
453
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
454
+ * API to another.
455
+ *
456
+ * \param conn - Pointer to the returned connection handle
457
+ * \param eglStream - EGLStreamKHR handle
458
+ * \param width - width of the image to be submitted to the stream
459
+ * \param height - height of the image to be submitted to the stream
460
+ *
461
+ * \return
462
+ * ::cudaSuccess,
463
+ * ::cudaErrorInvalidValue,
464
+ * ::cudaErrorUnknown
465
+ *
466
+ * \sa
467
+ * ::cudaEGLStreamProducerDisconnect,
468
+ * ::cudaEGLStreamProducerPresentFrame,
469
+ * ::cudaEGLStreamProducerReturnFrame,
470
+ * ::cuEGLStreamProducerConnect
471
+ */
472
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
473
+ EGLStreamKHR eglStream, EGLint width, EGLint height);
474
+
475
+ /**
476
+ * \brief Disconnect CUDA as a producer to EGLStream.
477
+ *
478
+ * Disconnect CUDA as a producer to EGLStreamKHR.
479
+ *
480
+ * \param conn - Connection to disconnect.
481
+ *
482
+ * \return
483
+ * ::cudaSuccess,
484
+ * ::cudaErrorInvalidValue,
485
+ * ::cudaErrorUnknown
486
+ *
487
+ * \sa
488
+ * ::cudaEGLStreamProducerConnect,
489
+ * ::cudaEGLStreamProducerPresentFrame,
490
+ * ::cudaEGLStreamProducerReturnFrame,
491
+ * ::cuEGLStreamProducerDisconnect
492
+ */
493
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
494
+
495
+ /**
496
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
497
+ *
498
+ * The ::cudaEglFrame is defined as:
499
+ * \code
500
+ * typedef struct cudaEglFrame_st {
501
+ * union {
502
+ * cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
503
+ * struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
504
+ * } frame;
505
+ * cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
506
+ * unsigned int planeCount;
507
+ * cudaEglFrameType frameType;
508
+ * cudaEglColorFormat eglColorFormat;
509
+ * } cudaEglFrame;
510
+ * \endcode
511
+ *
512
+ * For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory
513
+ * allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
514
+ * the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
515
+ *
516
+ * \param conn - Connection on which to present the CUDA array
517
+ * \param eglframe - CUDA Eglstream Producer Frame handle to be sent to the consumer over EglStream.
518
+ * \param pStream - CUDA stream on which to present the frame.
519
+ *
520
+ * \return
521
+ * ::cudaSuccess,
522
+ * ::cudaErrorInvalidValue,
523
+ * ::cudaErrorUnknown
524
+ *
525
+ * \sa
526
+ * ::cudaEGLStreamProducerConnect,
527
+ * ::cudaEGLStreamProducerDisconnect,
528
+ * ::cudaEGLStreamProducerReturnFrame,
529
+ * ::cuEGLStreamProducerPresentFrame
530
+ */
531
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
532
+ cudaEglFrame eglframe, cudaStream_t *pStream);
533
+
534
+ /**
535
+ * \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
536
+ *
537
+ * This API can potentially return cudaErrorLaunchTimeout if the consumer has not
538
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
539
+ *
540
+ * \param conn - Connection on which to return the frame
541
+ * \param eglframe - CUDA Eglstream Producer Frame handle returned from the consumer over EglStream.
542
+ * \param pStream - CUDA stream on which to return the frame.
543
+ *
544
+ * \return
545
+ * ::cudaSuccess,
546
+ * ::cudaErrorLaunchTimeout,
547
+ * ::cudaErrorInvalidValue,
548
+ * ::cudaErrorUnknown
549
+ *
550
+ * \sa
551
+ * ::cudaEGLStreamProducerConnect,
552
+ * ::cudaEGLStreamProducerDisconnect,
553
+ * ::cudaEGLStreamProducerPresentFrame,
554
+ * ::cuEGLStreamProducerReturnFrame
555
+ */
556
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
557
+ cudaEglFrame *eglframe, cudaStream_t *pStream);
558
+
559
+ /**
560
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
561
+ *
562
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
563
+ * \p resource may be accessed.
564
+ * This API can only be called for EGL graphics resources.
565
+ *
566
+ * The ::cudaEglFrame is defined as
567
+ * \code
568
+ * typedef struct cudaEglFrame_st {
569
+ * union {
570
+ * cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
571
+ * struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
572
+ * } frame;
573
+ * cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
574
+ * unsigned int planeCount;
575
+ * cudaEglFrameType frameType;
576
+ * cudaEglColorFormat eglColorFormat;
577
+ * } cudaEglFrame;
578
+ * \endcode
579
+ *
580
+ *
581
+ * \param eglFrame - Returned eglFrame.
582
+ * \param resource - Registered resource to access.
583
+ * \param index - Index for cubemap surfaces.
584
+ * \param mipLevel - Mipmap level for the subresource to access.
585
+ *
586
+ * \return
587
+ * ::cudaSuccess,
588
+ * ::cudaErrorInvalidValue,
589
+ * ::cudaErrorUnknown
590
+ *
591
+ * \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application.
592
+ *
593
+ * \sa
594
+ * ::cudaGraphicsSubResourceGetMappedArray,
595
+ * ::cudaGraphicsResourceGetMappedPointer,
596
+ * ::cuGraphicsResourceGetMappedEglFrame
597
+ */
598
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
599
+ cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
600
+
601
+ /**
602
+ * \brief Creates an event from EGLSync object
603
+ *
604
+ * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
605
+ * via \p flags. Valid flags include:
606
+ * - ::cudaEventDefault: Default event creation flag.
607
+ * - ::cudaEventBlockingSync: Specifies that the created event should use blocking
608
+ * synchronization. A CPU thread that uses ::cudaEventSynchronize() to wait on
609
+ * an event created with this flag will block until the event has actually
610
+ * been completed.
611
+ *
612
+ * ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
613
+ *
614
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
615
+ * typedef void* EGLSyncKHR
616
+ *
617
+ * \param phEvent - Returns newly created event
618
+ * \param eglSync - Opaque handle to EGLSync object
619
+ * \param flags - Event creation flags
620
+ *
621
+ * \return
622
+ * ::cudaSuccess,
623
+ * ::cudaErrorInitializationError,
624
+ * ::cudaErrorInvalidValue,
625
+ * ::cudaErrorLaunchFailure,
626
+ * ::cudaErrorMemoryAllocation
627
+ *
628
+ * \sa
629
+ * ::cudaEventQuery,
630
+ * ::cudaEventSynchronize,
631
+ * ::cudaEventDestroy
632
+ */
633
+ extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
634
+
635
+ /** @} */ /* END CUDART_EGL */
636
+
637
+ #if defined(__cplusplus)
638
+ }
639
+ #endif /* __cplusplus */
640
+
641
+ #endif /* __CUDA_EGL_INTEROP_H__ */
642
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp ADDED
@@ -0,0 +1,1546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_FP8_HPP__)
51
+ #define __CUDA_FP8_HPP__
52
+
53
+ #if !defined(__CUDA_FP8_H__)
54
+ #error "Do not include this file directly. Instead, include cuda_fp8.h."
55
+ #endif
56
+
57
+ /* C++ header for std::memcpy (used for type punning in host-side
58
+ * implementations). When compiling as a CUDA source file memcpy is provided
59
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
60
+ */
61
+ #if defined(__cplusplus) && !defined(__CUDACC__)
62
+ #include <cstring>
63
+ #elif !defined(__cplusplus) && !defined(__CUDACC__)
64
+ #include <string.h>
65
+ #endif /* defined(__cplusplus) && !defined(__CUDACC__) */
66
+
67
+ /* Set up structure-alignment attribute */
68
+ #if !(defined __CUDA_ALIGN__)
69
+ #if defined(__CUDACC__)
70
+ #define __CUDA_ALIGN__(align) __align__(align)
71
+ #else
72
+ /* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
73
+ * is available) */
74
+ #if __cplusplus >= 201103L
75
+ #define __CUDA_ALIGN__(n) \
76
+ alignas(n) /* C++11 kindly gives us a keyword for this */
77
+ #else /* !defined(__CPP_VERSION_AT_LEAST_11_FP8)*/
78
+ #if defined(__GNUC__)
79
+ #define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
80
+ #elif defined(_MSC_VER)
81
+ #define __CUDA_ALIGN__(n) __declspec(align(n))
82
+ #else
83
+ #define __CUDA_ALIGN__(n)
84
+ #endif /* defined(__GNUC__) */
85
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
86
+ #endif /* defined(__CUDACC__) */
87
+ #endif /* !(defined __CUDA_ALIGN__) */
88
+
89
+ #if !(defined __CPP_VERSION_AT_LEAST_11_FP8)
90
+ /* need c++11 for explicit operators */
91
+ #define __CUDA_NO_FP8_CONVERSION_OPERATORS__
92
+ #endif
93
+
94
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
95
+ __nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
96
+ const __nv_fp8_interpretation_t fp8_interpretation) {
97
+ unsigned char res;
98
+ unsigned long long int xbits;
99
+
100
+ #if defined(__CUDACC__) || (!defined __cplusplus)
101
+ (void)memcpy(&xbits, &x, sizeof(x));
102
+ #else
103
+ (void)std::memcpy(&xbits, &x, sizeof(x));
104
+ #endif
105
+ unsigned char FP8_MAXNORM;
106
+ unsigned char FP8_MANTISSA_MASK;
107
+ unsigned short int FP8_EXP_BIAS;
108
+ unsigned long long int FP8_SIGNIFICAND_BITS;
109
+ const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
110
+ unsigned long long int FP8_MINDENORM_O2;
111
+ unsigned long long int FP8_OVERFLOW_THRESHOLD;
112
+ unsigned long long int FP8_MINNORM;
113
+
114
+ if (fp8_interpretation == __NV_E4M3) {
115
+ FP8_EXP_BIAS = 7U;
116
+ FP8_SIGNIFICAND_BITS = 4ULL;
117
+ FP8_MANTISSA_MASK = 0x7U;
118
+ FP8_MINDENORM_O2 = 0x3F50000000000000ULL; // mindenorm/2 = 2^-10
119
+ FP8_OVERFLOW_THRESHOLD =
120
+ 0x407D000000000000ULL; // maxnorm + 1/2ulp = 0x1.Cp+8 + 0x1p+4
121
+ FP8_MAXNORM = 0x7EU;
122
+ FP8_MINNORM = 0x3F90000000000000ULL; // minnorm = 2^-6
123
+ } else { //__NV_E5M2
124
+ FP8_EXP_BIAS = 15U;
125
+ FP8_SIGNIFICAND_BITS = 3ULL;
126
+ FP8_MANTISSA_MASK = 0x3U;
127
+ FP8_MINDENORM_O2 = 0x3EE0000000000000ULL; // mindenorm/2 = 2^-17
128
+ FP8_OVERFLOW_THRESHOLD =
129
+ 0x40EE000000000000ULL -
130
+ 1ULL; // maxnorm + 1/2ulp = 0x1.Ep+15, and -1 to have common code
131
+ FP8_MAXNORM = 0x7BU;
132
+ FP8_MINNORM = 0x3F10000000000000ULL; // minnorm = 2^-14
133
+ }
134
+
135
+ // 1/2 LSB of the target format, positioned in double precision mantissa
136
+ // helpful in midpoints detection during round-to-nearest-even step
137
+ const unsigned long long int FP8_DP_HALF_ULP =
138
+ (unsigned long long int)1ULL << (53ULL - FP8_SIGNIFICAND_BITS - 1ULL);
139
+ // prepare sign bit in target format
140
+ unsigned char sign = (unsigned char)((xbits >> 63ULL) << 7U);
141
+ // prepare exponent field in target format
142
+ unsigned char exp =
143
+ (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
144
+ 1023U + FP8_EXP_BIAS);
145
+ // round mantissa to target format width, rounding towards zero
146
+ unsigned char mantissa =
147
+ (unsigned char)(xbits >> (53ULL - FP8_SIGNIFICAND_BITS)) &
148
+ FP8_MANTISSA_MASK;
149
+ unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL;
150
+
151
+ if (absx <= FP8_MINDENORM_O2) {
152
+ // zero or underflow
153
+ res = 0U;
154
+ } else if (absx > DP_INF_BITS) {
155
+ // NaN
156
+ if (fp8_interpretation == __NV_E4M3) {
157
+ res = 0x7FU;
158
+ } else {
159
+ // NaN --> QNaN
160
+ res = 0x7EU | mantissa;
161
+ }
162
+ } else if (absx > FP8_OVERFLOW_THRESHOLD) {
163
+ if (saturate == __NV_SATFINITE) {
164
+ res = FP8_MAXNORM;
165
+ } else {
166
+ // __NV_NOSAT
167
+ if (fp8_interpretation == __NV_E4M3) {
168
+ // no Inf in E4M3
169
+ res = 0x7FU; // NaN
170
+ } else {
171
+ res = 0x7CU; // Inf in E5M2
172
+ }
173
+ }
174
+ } else if (absx >= FP8_MINNORM) {
175
+ res = (unsigned char)((exp << (FP8_SIGNIFICAND_BITS - 1U)) | mantissa);
176
+ // rounded-off bits
177
+ unsigned long long int round =
178
+ xbits & ((FP8_DP_HALF_ULP << 1ULL) - 1ULL);
179
+ // round-to-nearest-even adjustment
180
+ if ((round > FP8_DP_HALF_ULP) ||
181
+ ((round == FP8_DP_HALF_ULP) && (mantissa & 1U))) {
182
+ res = (unsigned char)(res + 1U);
183
+ }
184
+ } else // Denormal range
185
+ {
186
+ unsigned char shift = (unsigned char)(1U - exp);
187
+ // add implicit leading bit
188
+ mantissa |= (unsigned char)(1U << (FP8_SIGNIFICAND_BITS - 1U));
189
+ // additional round-off due to denormalization
190
+ res = (unsigned char)(mantissa >> shift);
191
+
192
+ // rounded-off bits, including implicit leading bit
193
+ unsigned long long int round =
194
+ (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
195
+ ((FP8_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
196
+ // round-to-nearest-even adjustment
197
+ if ((round > (FP8_DP_HALF_ULP << shift)) ||
198
+ ((round == (FP8_DP_HALF_ULP << shift)) && (res & 1U))) {
199
+ res = (unsigned char)(res + 1U);
200
+ }
201
+ }
202
+
203
+ res |= sign;
204
+
205
+ return (__nv_fp8_storage_t)res;
206
+ }
207
+
208
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
209
+ __nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
210
+ const __nv_fp8_interpretation_t fp8_interpretation) {
211
+ __nv_fp8x2_storage_t storage = (__nv_fp8x2_storage_t)__nv_cvt_double_to_fp8(
212
+ x.y, saturate, fp8_interpretation);
213
+ storage = (__nv_fp8x2_storage_t)(storage << 8U);
214
+ storage = (__nv_fp8x2_storage_t)(storage |
215
+ __nv_cvt_double_to_fp8(
216
+ x.x, saturate, fp8_interpretation));
217
+ return storage;
218
+ }
219
+
220
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
221
+ __nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
222
+ const __nv_fp8_interpretation_t fp8_interpretation) {
223
+ __nv_fp8_storage_t res = 0U;
224
+ #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
225
+ if (saturate == __NV_SATFINITE) {
226
+ __nv_fp8x2_storage_t storage;
227
+ if (fp8_interpretation == __NV_E5M2) {
228
+ asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
229
+ : "=h"(storage)
230
+ : "f"(x), "f"(0.0f));
231
+ } else {
232
+ asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
233
+ : "=h"(storage)
234
+ : "f"(x), "f"(0.0f));
235
+ }
236
+ res = (__nv_fp8_storage_t)storage;
237
+ } else
238
+ #endif
239
+ {
240
+ unsigned int xbits;
241
+ #if defined(__CUDACC__) || (!defined __cplusplus)
242
+ (void)memcpy(&xbits, &x, sizeof(x));
243
+ #else
244
+ (void)std::memcpy(&xbits, &x, sizeof(x));
245
+ #endif
246
+
247
+ // isnan
248
+ if ((xbits & 0x7FFFFFFFU) > 0x7F800000U) {
249
+ // Canonical NaN
250
+ xbits = 0x7FFFFFFFU;
251
+ }
252
+
253
+ float fx;
254
+ #if defined(__CUDACC__) || (!defined __cplusplus)
255
+ (void)memcpy(&fx, &xbits, sizeof(xbits));
256
+ #else
257
+ (void)std::memcpy(&fx, &xbits, sizeof(xbits));
258
+ #endif
259
+
260
+ const double dx = (double)fx;
261
+ res = __nv_cvt_double_to_fp8(dx, saturate, fp8_interpretation);
262
+ }
263
+ return res;
264
+ }
265
+
266
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
267
+ __nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
268
+ const __nv_fp8_interpretation_t fp8_interpretation) {
269
+ __nv_fp8x2_storage_t storage;
270
+ #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
271
+ if (saturate == __NV_SATFINITE) {
272
+ if (fp8_interpretation == __NV_E5M2) {
273
+ asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
274
+ : "=h"(storage)
275
+ : "f"(x.x), "f"(x.y));
276
+ } else {
277
+ asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
278
+ : "=h"(storage)
279
+ : "f"(x.x), "f"(x.y));
280
+ }
281
+ } else
282
+ #endif
283
+ {
284
+ storage = (__nv_fp8x2_storage_t)__nv_cvt_float_to_fp8(
285
+ x.y, saturate, fp8_interpretation);
286
+ storage = (__nv_fp8x2_storage_t)(storage << 8U);
287
+ storage = (__nv_fp8x2_storage_t)(storage | __nv_cvt_float_to_fp8(
288
+ x.x, saturate,
289
+ fp8_interpretation));
290
+ }
291
+ return storage;
292
+ }
293
+
294
+ __CUDA_HOSTDEVICE_FP8_DECL__ float
295
+ __internal_halfraw_to_float(const __half_raw x) {
296
+ float f;
297
+ #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
298
+ asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(f) : "h"(x.x));
299
+ #else
300
+ const unsigned int ux = (unsigned int)x.x;
301
+ unsigned int sign = (ux >> 15U) & 1U;
302
+ unsigned int exponent = (ux >> 10U) & 0x1fU;
303
+ unsigned int mantissa = (ux & 0x3ffU) << 13U;
304
+ if (exponent == 0x1fU) { /* NaN or Inf */
305
+ /* discard sign of a NaN */
306
+ sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
307
+ mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
308
+ exponent = 0xffU;
309
+ } else if (exponent == 0U) { /* Denorm or Zero */
310
+ if (mantissa != 0U) {
311
+ unsigned int msb;
312
+ exponent = 0x71U;
313
+ do {
314
+ msb = (mantissa & 0x400000U);
315
+ mantissa <<= 1U; /* normalize */
316
+ --exponent;
317
+ } while (msb == 0U);
318
+ mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
319
+ }
320
+ } else {
321
+ exponent += 0x70U;
322
+ }
323
+ const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
324
+ #if defined(__CUDACC__) || (!defined __cplusplus)
325
+ (void)memcpy(&f, &u, sizeof(u));
326
+ #else
327
+ (void)std::memcpy(&f, &u, sizeof(u));
328
+ #endif
329
+ #endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */
330
+ return f;
331
+ }
332
+
333
+ __CUDA_HOSTDEVICE_FP8_DECL__ float2
334
+ __internal_halfraw2_to_float2(const __half2_raw x) {
335
+ __half_raw raw;
336
+ float2 res;
337
+ raw.x = x.x;
338
+ res.x = __internal_halfraw_to_float(raw);
339
+ raw.x = x.y;
340
+ res.y = __internal_halfraw_to_float(raw);
341
+ return res;
342
+ }
343
+
344
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
345
+ __nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
346
+ const __nv_fp8_interpretation_t fp8_interpretation) {
347
+ __nv_fp8_storage_t res = 0U;
348
+ #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
349
+ if (saturate == __NV_SATFINITE) {
350
+ unsigned int half2_storage = (unsigned int)(x.x);
351
+ __nv_fp8x2_storage_t tmp;
352
+ if (fp8_interpretation == __NV_E5M2) {
353
+ asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
354
+ : "=h"(tmp)
355
+ : "r"(half2_storage));
356
+ } else {
357
+ asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
358
+ : "=h"(tmp)
359
+ : "r"(half2_storage));
360
+ }
361
+ res = (__nv_fp8_storage_t)tmp;
362
+ } else
363
+ #endif
364
+ {
365
+ float fx = __internal_halfraw_to_float(x);
366
+ res = __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
367
+ }
368
+ return res;
369
+ }
370
+
371
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
372
+ const __half2_raw x, const __nv_saturation_t saturate,
373
+ const __nv_fp8_interpretation_t fp8_interpretation) {
374
+ __nv_fp8x2_storage_t tmp;
375
+ #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
376
+ if (saturate == __NV_SATFINITE) {
377
+ unsigned int half2_storage;
378
+ (void)memcpy(&half2_storage, &x, sizeof(x));
379
+
380
+ if (fp8_interpretation == __NV_E5M2) {
381
+ asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
382
+ : "=h"(tmp)
383
+ : "r"(half2_storage));
384
+ } else {
385
+ asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
386
+ : "=h"(tmp)
387
+ : "r"(half2_storage));
388
+ }
389
+ } else
390
+ #endif
391
+ {
392
+ __half_raw raw;
393
+ raw.x = x.x;
394
+ __nv_fp8_storage_t lo =
395
+ __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
396
+ raw.x = x.y;
397
+ __nv_fp8_storage_t hi =
398
+ __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
399
+ tmp = hi;
400
+ tmp = (__nv_fp8x2_storage_t)(tmp << 8U);
401
+ tmp = (__nv_fp8x2_storage_t)(tmp | lo);
402
+ }
403
+ return tmp;
404
+ }
405
+
406
+ __CUDA_HOSTDEVICE_FP8_DECL__ float
407
+ __internal_bf16raw_to_float(const __nv_bfloat16_raw x) {
408
+ const unsigned int ux = ((unsigned int)x.x) << 16U;
409
+ float fx;
410
+ #if defined(__CUDACC__) || (!defined __cplusplus)
411
+ (void)memcpy(&fx, &ux, sizeof(ux));
412
+ #else
413
+ (void)std::memcpy(&fx, &ux, sizeof(ux));
414
+ #endif
415
+ return fx;
416
+ }
417
+
418
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
419
+ __internal_float_to_bf16raw_rz(const float x) {
420
+ unsigned int ux;
421
+ __nv_bfloat16_raw r;
422
+ #if defined(__CUDACC__) || (!defined __cplusplus)
423
+ (void)memcpy(&ux, &x, sizeof(x));
424
+ #else
425
+ (void)std::memcpy(&ux, &x, sizeof(x));
426
+ #endif
427
+ r.x = (unsigned short int)(ux >> 16U);
428
+ return r;
429
+ }
430
+
431
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
432
+ const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
433
+ const __nv_fp8_interpretation_t fp8_interpretation) {
434
+ const float fx = __internal_bf16raw_to_float(x);
435
+ const __nv_fp8_storage_t res =
436
+ __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
437
+ return res;
438
+ }
439
+
440
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
441
+ __nv_cvt_bfloat16raw2_to_fp8x2(
442
+ const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
443
+ const __nv_fp8_interpretation_t fp8_interpretation) {
444
+ __nv_bfloat16_raw raw;
445
+ raw.x = x.y;
446
+ __nv_fp8x2_storage_t storage =
447
+ (__nv_fp8x2_storage_t)__nv_cvt_bfloat16raw_to_fp8(raw, saturate,
448
+ fp8_interpretation);
449
+ storage = (__nv_fp8x2_storage_t)(storage << 8U);
450
+ raw.x = x.x;
451
+ storage = (__nv_fp8x2_storage_t)(storage |
452
+ __nv_cvt_bfloat16raw_to_fp8(
453
+ raw, saturate, fp8_interpretation));
454
+ return storage;
455
+ }
456
+
457
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
458
+ __nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
459
+ const __nv_fp8_interpretation_t fp8_interpretation);
460
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
461
+ __nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
462
+ const __nv_fp8_interpretation_t fp8_interpretation) {
463
+ __half_raw res;
464
+ res.x = 0U;
465
+ #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
466
+ res.x =
467
+ __nv_cvt_fp8x2_to_halfraw2((__nv_fp8x2_storage_t)x, fp8_interpretation)
468
+ .x;
469
+ #else
470
+ unsigned short int ur = (unsigned short int)x;
471
+ ur = (unsigned short int)(ur << 8U);
472
+
473
+ if (fp8_interpretation == __NV_E5M2) {
474
+ if ((ur & 0x7FFFU) > 0x7C00U) {
475
+ /* If NaN, return canonical NaN */
476
+ ur = 0x7FFFU;
477
+ }
478
+ } else { // __NV_E4M3
479
+ unsigned short int sign = ur & 0x8000U;
480
+ unsigned short int exponent =
481
+ (unsigned short int)(((ur & 0x7800U) >> 1U) + 0x2000U);
482
+ unsigned short int mantissa = (ur & 0x0700U) >> 1U;
483
+ unsigned char absx = 0x7FU & (unsigned char)x;
484
+
485
+ if (absx == 0x7FU) // NaN
486
+ {
487
+ ur = 0x7FFFU; // fp16 canonical NaN, discard sign
488
+ } else if (exponent == 0x2000U) {
489
+ // zero or denormal
490
+ if (mantissa != 0U) {
491
+ // normalize
492
+ mantissa = (unsigned short int)(mantissa << 1U);
493
+ while ((mantissa & 0x0400U) == 0U) {
494
+ mantissa = (unsigned short int)(mantissa << 1U);
495
+ exponent = (unsigned short int)(exponent - 0x0400U);
496
+ }
497
+ // discard implicit leading bit
498
+ mantissa &= 0x03FFU;
499
+ } else { // Zero
500
+ exponent = 0U;
501
+ }
502
+
503
+ ur = (sign | exponent) | mantissa;
504
+ } else {
505
+ ur = (sign | exponent) | mantissa;
506
+ }
507
+ }
508
+ res.x = ur;
509
+ #endif
510
+ return res;
511
+ }
512
+
513
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
514
+ __nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
515
+ const __nv_fp8_interpretation_t fp8_interpretation) {
516
+ __half2_raw res;
517
+ #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
518
+ unsigned int half2_storage;
519
+ if (fp8_interpretation == __NV_E5M2) {
520
+ asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
521
+ } else {
522
+ asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
523
+ }
524
+ (void)memcpy(&res, &half2_storage, sizeof(half2_storage));
525
+ #else
526
+ res.x =
527
+ __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)x, fp8_interpretation).x;
528
+ res.y = __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)(x >> 8U),
529
+ fp8_interpretation)
530
+ .x;
531
+ #endif
532
+ return res;
533
+ }
534
+
535
+ /* All other definitions in this file are only visible to C++ compilers */
536
+ #if defined(__cplusplus)
537
+
538
+ /**
539
+ * \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind.
540
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
541
+ */
542
+
543
+ /**
544
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
545
+ * \brief __nv_fp8_e5m2 datatype
546
+ *
547
+ * \details This structure implements the datatype for handling
548
+ * \p fp8 floating-point numbers of \p e5m2 kind:
549
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
550
+ *
551
+ * The structure implements converting constructors and operators.
552
+ */
553
+ struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 {
554
+ public:
555
+ /**
556
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
557
+ * Storage variable contains the \p fp8 floating-point data.
558
+ */
559
+ __nv_fp8_storage_t __x;
560
+
561
+ /**
562
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
563
+ * Default constructor.
564
+ */
565
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
566
+ __nv_fp8_e5m2() = default;
567
+ #else
568
+ __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {}
569
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
570
+
571
+ #if !defined(__CUDA_NO_FP8_CONVERSIONS__)
572
+
573
+ /* Construct from wider FP types */
574
+ /* Note we do avoid constructor init-list because of special host/device
575
+ * compilation rules */
576
+
577
+ /**
578
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
579
+ * Constructor from \p __half data type, relies on \p __NV_SATFINITE
580
+ * behavior for out-of-range values.
581
+ */
582
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) {
583
+ __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
584
+ __NV_SATFINITE, __NV_E5M2);
585
+ }
586
+ /**
587
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
588
+ * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
589
+ * behavior for out-of-range values.
590
+ */
591
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) {
592
+ __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
593
+ __NV_SATFINITE, __NV_E5M2);
594
+ }
595
+ /**
596
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
597
+ * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
598
+ * for out-of-range values.
599
+ */
600
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) {
601
+ __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
602
+ }
603
+ /**
604
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
605
+ * Constructor from \p double data type, relies on \p __NV_SATFINITE
606
+ * behavior for out-of-range values.
607
+ */
608
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) {
609
+ __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
610
+ }
611
+
612
+ /* Converts from integral */
613
+
614
+ /**
615
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
616
+ * Constructor from \p unsigned \p short \p int data type, relies on \p
617
+ * __NV_SATFINITE behavior for out-of-range values.
618
+ */
619
+ explicit __CUDA_HOSTDEVICE_FP8__
620
+ __nv_fp8_e5m2(const unsigned short int val) {
621
+ __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
622
+ }
623
+ /**
624
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
625
+ * Constructor from \p unsigned \p int data type, relies on \p
626
+ * __NV_SATFINITE behavior for out-of-range values.
627
+ */
628
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) {
629
+ __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
630
+ }
631
+ /**
632
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
633
+ * Constructor from \p unsigned \p long \p long \p int data type, relies on
634
+ * \p __NV_SATFINITE behavior for out-of-range values.
635
+ */
636
+ explicit __CUDA_HOSTDEVICE_FP8__
637
+ __nv_fp8_e5m2(const unsigned long long int val) {
638
+ __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
639
+ }
640
+
641
+ /**
642
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
643
+ * Constructor from \p short \p int data type, relies on \p __NV_SATFINITE behavior for out-of-range values.
644
+ */
645
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) {
646
+ __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
647
+ }
648
+ /**
649
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
650
+ * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
651
+ * for out-of-range values.
652
+ */
653
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) {
654
+ __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
655
+ }
656
+ /**
657
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
658
+ * Constructor from \p long \p long \p int data type, relies on \p
659
+ * __NV_SATFINITE behavior for out-of-range values.
660
+ */
661
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) {
662
+ __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
663
+ }
664
+
665
+ #if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
666
+ /* Widening FP converts */
667
+ /**
668
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
669
+ * Conversion operator to \p __half data type.
670
+ */
671
+ explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
672
+ return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
673
+ }
674
+ /**
675
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
676
+ * Conversion operator to \p float data type.
677
+ */
678
+ explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
679
+ return __internal_halfraw_to_float(
680
+ __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
681
+ }
682
+ /**
683
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
684
+ * Conversion operator to \p __nv_bfloat16 data type.
685
+ */
686
+ explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
687
+ return static_cast<__nv_bfloat16>(
688
+ __internal_float_to_bf16raw_rz(float(*this)));
689
+ }
690
+ /**
691
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
692
+ * Conversion operator to \p double data type.
693
+ */
694
+ explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
695
+ return static_cast<double>(float(*this));
696
+ }
697
+
698
+ /* Convert to integral */
699
+
700
+ /**
701
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
702
+ * Conversion operator to \p unsigned \p char data type.
703
+ * Clamps negative and too large inputs to the output range.
704
+ * \p NaN inputs convert to \p zero.
705
+ */
706
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
707
+ unsigned char i;
708
+ const float f = float(*this);
709
+ const unsigned char max_val = 0xFFU;
710
+ const unsigned char min_val = 0U;
711
+ const unsigned char bits = (*this).__x;
712
+ // saturation fixup
713
+ if ((bits & 0x7FU) > 0x7CU) {
714
+ // NaN
715
+ i = 0;
716
+ } else if (f > static_cast<float>(max_val)) {
717
+ // saturate maximum
718
+ i = max_val;
719
+ } else if (f < static_cast<float>(min_val)) {
720
+ // saturate minimum
721
+ i = min_val;
722
+ } else {
723
+ // normal value
724
+ i = static_cast<unsigned char>(f);
725
+ }
726
+ return i;
727
+ }
728
+ /**
729
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
730
+ * Conversion operator to \p unsigned \p short \p int data type.
731
+ * Clamps negative and too large inputs to the output range.
732
+ * \p NaN inputs convert to \p zero.
733
+ */
734
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
735
+ return __half2ushort_rz(__half(*this));
736
+ }
737
+ /**
738
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
739
+ * Conversion operator to \p unsigned \p int data type.
740
+ * Clamps negative and too large inputs to the output range.
741
+ * \p NaN inputs convert to \p zero.
742
+ */
743
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
744
+ return __half2uint_rz(__half(*this));
745
+ }
746
+ /**
747
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
748
+ * Conversion operator to \p unsigned \p long \p long \p int data type.
749
+ * Clamps negative and too large inputs to the output range.
750
+ * \p NaN inputs convert to \p 0x8000000000000000ULL.
751
+ */
752
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
753
+ return __half2ull_rz(__half(*this));
754
+ }
755
+
756
+ /**
757
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
758
+ * Conversion operator to \p signed \p char data type.
759
+ * Clamps too large inputs to the output range.
760
+ * \p NaN inputs convert to \p zero.
761
+ */
762
+ explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
763
+ signed char i;
764
+ const float f = float(*this);
765
+ const signed char max_val = (signed char)0x7FU;
766
+ const signed char min_val = (signed char)0x80U;
767
+ const unsigned char bits = (*this).__x;
768
+ // saturation fixup
769
+ if ((bits & 0x7FU) > 0x7CU) {
770
+ // NaN
771
+ i = 0;
772
+ } else if (f > static_cast<float>(max_val)) {
773
+ // saturate maximum
774
+ i = max_val;
775
+ } else if (f < static_cast<float>(min_val)) {
776
+ // saturate minimum
777
+ i = min_val;
778
+ } else {
779
+ // normal value
780
+ i = static_cast<signed char>(f);
781
+ }
782
+ return i;
783
+ }
784
+ /**
785
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
786
+ * Conversion operator to \p short \p int data type.
787
+ * Clamps too large inputs to the output range.
788
+ * \p NaN inputs convert to \p zero.
789
+ */
790
+ explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
791
+ return __half2short_rz(__half(*this));
792
+ }
793
+ /**
794
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
795
+ * Conversion operator to \p int data type.
796
+ * Clamps too large inputs to the output range.
797
+ * \p NaN inputs convert to \p zero.
798
+ */
799
+ explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
800
+ return __half2int_rz(__half(*this));
801
+ }
802
+ /**
803
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
804
+ * Conversion operator to \p long \p long \p int data type.
805
+ * Clamps too large inputs to the output range.
806
+ * \p NaN inputs convert to \p 0x8000000000000000LL.
807
+ */
808
+ explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
809
+ return __half2ll_rz(__half(*this));
810
+ }
811
+
812
+ /**
813
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
814
+ * Conversion operator to \p bool data type.
815
+ * +0 and -0 inputs convert to \p false.
816
+ * Non-zero inputs convert to \p true.
817
+ */
818
+ explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
819
+ return (__x & 0x7FU) != 0U;
820
+ }
821
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
822
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
823
+ };
824
+
825
+ /**
826
+ * \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind.
827
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
828
+ */
829
+
830
+ /**
831
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
832
+ * \brief __nv_fp8x2_e5m2 datatype
833
+ *
834
+ * \details This structure implements the datatype for handling two
835
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
836
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
837
+ *
838
+ * The structure implements converting constructors and operators.
839
+ */
840
+ struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 {
841
+ public:
842
+ /**
843
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
844
+ * Storage variable contains the vector of two \p fp8 floating-point data
845
+ * values.
846
+ */
847
+ __nv_fp8x2_storage_t __x;
848
+
849
+ /**
850
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
851
+ * Constructor by default.
852
+ */
853
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
854
+ __nv_fp8x2_e5m2() = default;
855
+ #else
856
+ __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {}
857
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
858
+
859
+ #if !defined(__CUDA_NO_FP8_CONVERSIONS__)
860
+
861
+ /* Construct from wider types */
862
+
863
+ /**
864
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
865
+ * Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
866
+ * behavior for out-of-range values.
867
+ */
868
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) {
869
+ __x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
870
+ __NV_SATFINITE, __NV_E5M2);
871
+ }
872
+ /**
873
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
874
+ * Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
875
+ * behavior for out-of-range values.
876
+ */
877
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) {
878
+ __x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
879
+ __NV_SATFINITE, __NV_E5M2);
880
+ }
881
+ /**
882
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
883
+ * Constructor from \p float2 data type, relies on \p __NV_SATFINITE
884
+ * behavior for out-of-range values.
885
+ */
886
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) {
887
+ __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
888
+ }
889
+ /**
890
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
891
+ * Constructor from \p double2 data type, relies on \p __NV_SATFINITE
892
+ * behavior for out-of-range values.
893
+ */
894
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) {
895
+ __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
896
+ }
897
+
898
+ #if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
899
+ /* Widening converts */
900
+ /**
901
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
902
+ * Conversion operator to \p __half2 data type.
903
+ */
904
+ explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
905
+ return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
906
+ }
907
+ /**
908
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
909
+ * Conversion operator to \p float2 data type.
910
+ */
911
+ explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
912
+ return __internal_halfraw2_to_float2(
913
+ __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
914
+ }
915
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
916
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
917
+ };
918
+
919
+ __CUDA_HOSTDEVICE_FP8_DECL__ unsigned int
920
+ __internal_pack_u16x2_to_u32(const unsigned short int src_lo,
921
+ const unsigned short int src_hi) {
922
+ unsigned int dst;
923
+ #if (defined __CUDACC__) && (defined __CUDA_ARCH__)
924
+ asm("{ mov.b32 %0, {%1,%2};}\n" : "=r"(dst) : "h"(src_lo), "h"(src_hi));
925
+ #else
926
+ dst = (static_cast<unsigned int>(src_hi) << 16U) |
927
+ static_cast<unsigned int>(src_lo);
928
+ #endif
929
+ return dst;
930
+ }
931
+
932
+ /**
933
+ * \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind.
934
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
935
+ */
936
+
937
+ /**
938
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
939
+ * \brief __nv_fp8x4_e5m2 datatype
940
+ *
941
+ * \details This structure implements the datatype for handling four
942
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
943
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
944
+ *
945
+ * The structure implements converting constructors and operators.
946
+ */
947
+ struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 {
948
+ public:
949
+ /**
950
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
951
+ * Storage variable contains the vector of four \p fp8 floating-point data
952
+ * values.
953
+ */
954
+ __nv_fp8x4_storage_t __x;
955
+
956
+ /**
957
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
958
+ * Constructor by default.
959
+ */
960
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
961
+ __nv_fp8x4_e5m2() = default;
962
+ #else
963
+ __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {}
964
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
965
+
966
+ #if !defined(__CUDA_NO_FP8_CONVERSIONS__)
967
+
968
+ /* Construct from wider types */
969
+
970
+ /**
971
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
972
+ * Constructor from a pair of \p __half2 data type values,
973
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
974
+ */
975
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo,
976
+ const __half2 fhi) {
977
+ const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
978
+ static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E5M2);
979
+ const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
980
+ static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
981
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
982
+ }
983
+ /**
984
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
985
+ * Constructor from a pair of \p __nv_bfloat162 data type values,
986
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
987
+ */
988
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo,
989
+ const __nv_bfloat162 fhi) {
990
+ const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
991
+ static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E5M2);
992
+ const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
993
+ static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
994
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
995
+ }
996
+ /**
997
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
998
+ * Constructor from \p float4 vector data type,
999
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
1000
+ */
1001
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) {
1002
+ const float2 flo = {f.x, f.y};
1003
+ const float2 fhi = {f.z, f.w};
1004
+ const __nv_fp8x2_storage_t rlo =
1005
+ __nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
1006
+ const __nv_fp8x2_storage_t rhi =
1007
+ __nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
1008
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
1009
+ }
1010
+ /**
1011
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
1012
+ * Constructor from \p double4 vector data type,
1013
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
1014
+ */
1015
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) {
1016
+ const double2 flo = {f.x, f.y};
1017
+ const double2 fhi = {f.z, f.w};
1018
+ const __nv_fp8x2_storage_t rlo =
1019
+ __nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
1020
+ const __nv_fp8x2_storage_t rhi =
1021
+ __nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
1022
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
1023
+ }
1024
+
1025
+ #if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
1026
+ /* Widening converts */
1027
+
1028
+ /**
1029
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
1030
+ * Conversion operator to \p float4 vector data type.
1031
+ */
1032
+ explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
1033
+ const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
1034
+ const __nv_fp8x2_storage_t shi =
1035
+ static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
1036
+ float2 rlo = __internal_halfraw2_to_float2(
1037
+ __nv_cvt_fp8x2_to_halfraw2(slo, __NV_E5M2));
1038
+ float2 rhi = __internal_halfraw2_to_float2(
1039
+ __nv_cvt_fp8x2_to_halfraw2(shi, __NV_E5M2));
1040
+ float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
1041
+ return res;
1042
+ }
1043
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
1044
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
1045
+ };
1046
+
1047
+ /**
1048
+ * \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind.
1049
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
1050
+ */
1051
+
1052
+ /**
1053
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1054
+ * \brief __nv_fp8_e4m3 datatype
1055
+ *
1056
+ * \details This structure implements the datatype for storing
1057
+ * \p fp8 floating-point numbers of \p e4m3 kind:
1058
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
1059
+ * The encoding doesn't support Infinity.
1060
+ * NaNs are limited to 0x7F and 0xFF values.
1061
+ *
1062
+ * The structure implements converting constructors and operators.
1063
+ */
1064
+ struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 {
1065
+ public:
1066
+ /**
1067
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1068
+ * Storage variable contains the \p fp8 floating-point data.
1069
+ */
1070
+ __nv_fp8_storage_t __x;
1071
+
1072
+ /**
1073
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1074
+ * Constructor by default.
1075
+ */
1076
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
1077
+ __nv_fp8_e4m3() = default;
1078
+ #else
1079
+ __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {}
1080
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
1081
+
1082
+ #if !defined(__CUDA_NO_FP8_CONVERSIONS__)
1083
+
1084
+ /* Construct from wider FP types */
1085
+ /* Note we do avoid constructor init-list because of special host/device
1086
+ * compilation rules */
1087
+
1088
+ /**
1089
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1090
+ * Constructor from \p __half data type, relies on \p __NV_SATFINITE
1091
+ * behavior for out-of-range values.
1092
+ */
1093
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) {
1094
+ __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
1095
+ __NV_SATFINITE, __NV_E4M3);
1096
+ }
1097
+ /**
1098
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1099
+ * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
1100
+ * behavior for out-of-range values.
1101
+ */
1102
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) {
1103
+ __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
1104
+ __NV_SATFINITE, __NV_E4M3);
1105
+ }
1106
+ /**
1107
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1108
+ * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
1109
+ * for out-of-range values.
1110
+ */
1111
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) {
1112
+ __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
1113
+ }
1114
+ /**
1115
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1116
+ * Constructor from \p double data type, relies on \p __NV_SATFINITE
1117
+ * behavior for out-of-range values.
1118
+ */
1119
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) {
1120
+ __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
1121
+ }
1122
+
1123
+ /* Converts from integral */
1124
+
1125
+ /**
1126
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1127
+ * Constructor from \p unsigned \p short \p int data type, relies on \p
1128
+ * __NV_SATFINITE behavior for out-of-range values.
1129
+ */
1130
+ explicit __CUDA_HOSTDEVICE_FP8__
1131
+ __nv_fp8_e4m3(const unsigned short int val) {
1132
+ __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
1133
+ }
1134
+ /**
1135
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1136
+ * Constructor from \p unsigned \p int data type, relies on \p
1137
+ * __NV_SATFINITE behavior for out-of-range values.
1138
+ */
1139
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) {
1140
+ __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
1141
+ }
1142
+ /**
1143
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1144
+ * Constructor from \p unsigned \p long \p long \p int data type, relies on
1145
+ * \p __NV_SATFINITE behavior for out-of-range values.
1146
+ */
1147
+ explicit __CUDA_HOSTDEVICE_FP8__
1148
+ __nv_fp8_e4m3(const unsigned long long int val) {
1149
+ __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
1150
+ }
1151
+
1152
+ /**
1153
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1154
+ * Constructor from \p short \p int data type, relies on \p
1155
+ * __NV_SATFINITE behavior for out-of-range values.
1156
+ */
1157
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) {
1158
+ __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
1159
+ }
1160
+ /**
1161
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1162
+ * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
1163
+ * for out-of-range values.
1164
+ */
1165
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) {
1166
+ __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
1167
+ }
1168
+ /**
1169
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1170
+ * Constructor from \p long \p long \p int data type, relies on \p
1171
+ * __NV_SATFINITE behavior for out-of-range values.
1172
+ */
1173
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) {
1174
+ __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
1175
+ }
1176
+
1177
+ #if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
1178
+ /* Widening FP converts */
1179
+ /**
1180
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1181
+ * Conversion operator to \p __half data type.
1182
+ */
1183
+ explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
1184
+ return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
1185
+ }
1186
+ /**
1187
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1188
+ * Conversion operator to \p float data type.
1189
+ */
1190
+ explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
1191
+ return __internal_halfraw_to_float(
1192
+ __nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
1193
+ }
1194
+ /**
1195
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1196
+ * Conversion operator to \p __nv_bfloat16 data type.
1197
+ */
1198
+ explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
1199
+ return static_cast<__nv_bfloat16>(
1200
+ __internal_float_to_bf16raw_rz(float(*this)));
1201
+ }
1202
+ /**
1203
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1204
+ * Conversion operator to \p double data type.
1205
+ */
1206
+ explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
1207
+ return static_cast<double>(float(*this));
1208
+ }
1209
+
1210
+ /* Convert to integral */
1211
+
1212
+ /**
1213
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1214
+ * Conversion operator to \p unsigned \p char data type.
1215
+ * Clamps negative and too large inputs to the output range.
1216
+ * \p NaN inputs convert to \p zero.
1217
+ */
1218
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
1219
+ unsigned char i;
1220
+ const float f = float(*this);
1221
+ const unsigned char max_val = 0xFFU;
1222
+ const unsigned char min_val = 0U;
1223
+ const unsigned char bits = (*this).__x;
1224
+ // saturation fixup
1225
+ if ((bits & 0x7FU) == 0x7FU) {
1226
+ // NaN
1227
+ i = 0;
1228
+ } else if (f > static_cast<float>(max_val)) {
1229
+ // saturate maximum
1230
+ i = max_val;
1231
+ } else if (f < static_cast<float>(min_val)) {
1232
+ // saturate minimum
1233
+ i = min_val;
1234
+ } else {
1235
+ // normal value
1236
+ i = static_cast<unsigned char>(f);
1237
+ }
1238
+ return i;
1239
+ }
1240
+
1241
+ /**
1242
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1243
+ * Conversion operator to \p unsigned \p short \p int data type.
1244
+ * Clamps negative inputs to zero.
1245
+ * \p NaN inputs convert to \p zero.
1246
+ */
1247
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
1248
+ return __half2ushort_rz(__half(*this));
1249
+ }
1250
+ /**
1251
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1252
+ * Conversion operator to \p unsigned \p int data type.
1253
+ * Clamps negative inputs to zero.
1254
+ * \p NaN inputs convert to \p zero.
1255
+ */
1256
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
1257
+ return __half2uint_rz(__half(*this));
1258
+ }
1259
+ /**
1260
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1261
+ * Conversion operator to \p unsigned \p long \p long \p int data type.
1262
+ * Clamps negative inputs to zero.
1263
+ * \p NaN inputs convert to \p 0x8000000000000000ULL.
1264
+ */
1265
+ explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
1266
+ return __half2ull_rz(__half(*this));
1267
+ }
1268
+
1269
+ /**
1270
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1271
+ * Conversion operator to \p signed \p char data type.
1272
+ * Clamps too large inputs to the output range.
1273
+ * \p NaN inputs convert to \p zero.
1274
+ */
1275
+ explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
1276
+ signed char i;
1277
+ const float f = float(*this);
1278
+ const signed char max_val = (signed char)0x7FU;
1279
+ const signed char min_val = (signed char)0x80U;
1280
+ const unsigned char bits = (*this).__x;
1281
+ // saturation fixup
1282
+ if ((bits & 0x7FU) == 0x7FU) {
1283
+ // NaN
1284
+ i = 0;
1285
+ } else if (f > static_cast<float>(max_val)) {
1286
+ // saturate maximum
1287
+ i = max_val;
1288
+ } else if (f < static_cast<float>(min_val)) {
1289
+ // saturate minimum
1290
+ i = min_val;
1291
+ } else {
1292
+ // normal value
1293
+ i = static_cast<signed char>(f);
1294
+ }
1295
+ return i;
1296
+ }
1297
+ /**
1298
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1299
+ * Conversion operator to \p short \p int data type.
1300
+ * \p NaN inputs convert to \p zero.
1301
+ */
1302
+ explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
1303
+ return __half2short_rz(__half(*this));
1304
+ }
1305
+ /**
1306
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1307
+ * Conversion operator to \p int data type.
1308
+ * \p NaN inputs convert to \p zero.
1309
+ */
1310
+ explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
1311
+ return __half2int_rz(__half(*this));
1312
+ }
1313
+ /**
1314
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1315
+ * Conversion operator to \p long \p long \p int data type.
1316
+ * \p NaN inputs convert to \p 0x8000000000000000LL.
1317
+ */
1318
+ explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
1319
+ return __half2ll_rz(__half(*this));
1320
+ }
1321
+
1322
+ /**
1323
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
1324
+ * Conversion operator to \p bool data type.
1325
+ * +0 and -0 inputs convert to \p false.
1326
+ * Non-zero inputs convert to \p true.
1327
+ */
1328
+ explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
1329
+ return (__x & 0x7FU) != 0U;
1330
+ }
1331
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
1332
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
1333
+ };
1334
+
1335
+ /**
1336
+ * \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind.
1337
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
1338
+ */
1339
+
1340
+ /**
1341
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1342
+ * \brief __nv_fp8x2_e4m3 datatype
1343
+ *
1344
+ * \details This structure implements the datatype for storage
1345
+ * and operations on the vector of two \p fp8 values of \p e4m3 kind each:
1346
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
1347
+ * The encoding doesn't support Infinity.
1348
+ * NaNs are limited to 0x7F and 0xFF values.
1349
+ */
1350
+ struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 {
1351
+ public:
1352
+ /**
1353
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1354
+ * Storage variable contains the vector of two \p fp8 floating-point data
1355
+ * values.
1356
+ */
1357
+ __nv_fp8x2_storage_t __x;
1358
+
1359
+ /**
1360
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1361
+ * Constructor by default.
1362
+ */
1363
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
1364
+ __nv_fp8x2_e4m3() = default;
1365
+ #else
1366
+ __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {}
1367
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
1368
+
1369
+ #if !defined(__CUDA_NO_FP8_CONVERSIONS__)
1370
+
1371
+ /* Construct from wider types */
1372
+
1373
+ /**
1374
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1375
+ * Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
1376
+ * behavior for out-of-range values.
1377
+ */
1378
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) {
1379
+ __x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
1380
+ __NV_SATFINITE, __NV_E4M3);
1381
+ }
1382
+ /**
1383
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1384
+ * Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
1385
+ * behavior for out-of-range values.
1386
+ */
1387
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) {
1388
+ __x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
1389
+ __NV_SATFINITE, __NV_E4M3);
1390
+ }
1391
+ /**
1392
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1393
+ * Constructor from \p float2 data type, relies on \p __NV_SATFINITE
1394
+ * behavior for out-of-range values.
1395
+ */
1396
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) {
1397
+ __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
1398
+ }
1399
+ /**
1400
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1401
+ * Constructor from \p double2 data type, relies on \p __NV_SATFINITE
1402
+ * behavior for out-of-range values.
1403
+ */
1404
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) {
1405
+ __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
1406
+ }
1407
+
1408
+ #if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
1409
+ /* Widening converts */
1410
+ /**
1411
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1412
+ * Conversion operator to \p __half2 data type.
1413
+ */
1414
+ explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
1415
+ return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
1416
+ }
1417
+ /**
1418
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
1419
+ * Conversion operator to \p float2 data type.
1420
+ */
1421
+ explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
1422
+ return __internal_halfraw2_to_float2(
1423
+ __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
1424
+ }
1425
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
1426
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
1427
+ };
1428
+
1429
+ /**
1430
+ * \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind.
1431
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
1432
+ */
1433
+
1434
+ /**
1435
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1436
+ * \brief __nv_fp8x4_e4m3 datatype
1437
+ *
1438
+ * \details This structure implements the datatype for storage
1439
+ * and operations on the vector of four \p fp8 values of \p e4m3 kind each:
1440
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
1441
+ * The encoding doesn't support Infinity.
1442
+ * NaNs are limited to 0x7F and 0xFF values.
1443
+ */
1444
+ struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 {
1445
+ public:
1446
+ /**
1447
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1448
+ * Storage variable contains the vector of four \p fp8 floating-point data
1449
+ * values.
1450
+ */
1451
+ __nv_fp8x4_storage_t __x;
1452
+
1453
+ /**
1454
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1455
+ * Constructor by default.
1456
+ */
1457
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
1458
+ __nv_fp8x4_e4m3() = default;
1459
+ #else
1460
+ __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {}
1461
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
1462
+
1463
+ #if !defined(__CUDA_NO_FP8_CONVERSIONS__)
1464
+
1465
+ /* Construct from wider types */
1466
+
1467
+ /**
1468
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1469
+ * Constructor from a pair of \p __half2 data type values,
1470
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
1471
+ */
1472
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo,
1473
+ const __half2 fhi) {
1474
+ const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
1475
+ static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E4M3);
1476
+ const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
1477
+ static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
1478
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
1479
+ }
1480
+ /**
1481
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1482
+ * Constructor from a pair of \p __nv_bfloat162 data type values,
1483
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
1484
+ */
1485
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo,
1486
+ const __nv_bfloat162 fhi) {
1487
+ const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
1488
+ static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E4M3);
1489
+ const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
1490
+ static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
1491
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
1492
+ }
1493
+ /**
1494
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1495
+ * Constructor from \p float4 vector data type,
1496
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
1497
+ */
1498
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) {
1499
+ const float2 flo = {f.x, f.y};
1500
+ const float2 fhi = {f.z, f.w};
1501
+ const __nv_fp8x2_storage_t rlo =
1502
+ __nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
1503
+ const __nv_fp8x2_storage_t rhi =
1504
+ __nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
1505
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
1506
+ }
1507
+ /**
1508
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1509
+ * Constructor from \p double4 vector data type,
1510
+ * relies on \p __NV_SATFINITE behavior for out-of-range values.
1511
+ */
1512
+ explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) {
1513
+ const double2 flo = {f.x, f.y};
1514
+ const double2 fhi = {f.z, f.w};
1515
+ const __nv_fp8x2_storage_t rlo =
1516
+ __nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
1517
+ const __nv_fp8x2_storage_t rhi =
1518
+ __nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
1519
+ __x = __internal_pack_u16x2_to_u32(rlo, rhi);
1520
+ }
1521
+
1522
+ #if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
1523
+ /* Widening converts */
1524
+
1525
+ /**
1526
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
1527
+ * Conversion operator to \p float4 vector data type.
1528
+ */
1529
+ explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
1530
+ const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
1531
+ const __nv_fp8x2_storage_t shi =
1532
+ static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
1533
+ float2 rlo = __internal_halfraw2_to_float2(
1534
+ __nv_cvt_fp8x2_to_halfraw2(slo, __NV_E4M3));
1535
+ float2 rhi = __internal_halfraw2_to_float2(
1536
+ __nv_cvt_fp8x2_to_halfraw2(shi, __NV_E4M3));
1537
+ float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
1538
+ return res;
1539
+ }
1540
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
1541
+ #endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
1542
+ };
1543
+
1544
+ #endif /* defined(__cplusplus) */
1545
+
1546
+ #endif /* end of include guard: __CUDA_FP8_HPP__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_PIPELINE_HELPERS_H_
51
+ # define _CUDA_PIPELINE_HELPERS_H_
52
+
53
+ # define _CUDA_PIPELINE_NAMESPACE nvcuda::experimental
54
+ # define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
55
+ # define _CUDA_PIPELINE_END_NAMESPACE } }
56
+
57
+ # define _CUDA_PIPELINE_INTERNAL_NAMESPACE _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
58
+ # define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
59
+ # define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE } _CUDA_PIPELINE_END_NAMESPACE
60
+
61
+ # if !defined(_CUDA_PIPELINE_QUALIFIER)
62
+ # define _CUDA_PIPELINE_QUALIFIER inline __device__
63
+ # endif
64
+ # if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER)
65
+ # define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
66
+ # endif
67
+
68
+ # if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
69
+ # define _CUDA_PIPELINE_ARCH_700_OR_LATER
70
+ # endif
71
+
72
+ # if (__CUDA_ARCH__ >= 800)
73
+ # define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
74
+ # else
75
+ # define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
76
+ # endif
77
+
78
+ # if !defined(_CUDA_PIPELINE_MAX_STAGES)
79
+ # define _CUDA_PIPELINE_MAX_STAGES 8
80
+ # endif
81
+
82
+ # if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
83
+ # define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
84
+ # endif
85
+
86
+ # if !defined(_CUDA_PIPELINE_DEBUG)
87
+ # if defined(__CUDACC_DEBUG__)
88
+ # define _CUDA_PIPELINE_DEBUG 1
89
+ # else
90
+ # define _CUDA_PIPELINE_DEBUG 0
91
+ # endif
92
+ # endif
93
+
94
+ # if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
95
+ # if !defined(__CUDACC_RTC__)
96
+ # include <cassert>
97
+ # endif
98
+ # define _CUDA_PIPELINE_ASSERT(x) assert((x));
99
+ # define _CUDA_PIPELINE_ABORT() assert(0);
100
+ # else
101
+ # define _CUDA_PIPELINE_ASSERT(x)
102
+ # define _CUDA_PIPELINE_ABORT() __trap();
103
+ # endif
104
+
105
+ # if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
106
+ # define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
107
+ # else
108
+ # define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
109
+ # endif
110
+
111
+ # if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
112
+ # define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
113
+ # else
114
+ # define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
115
+ # endif
116
+
117
+ # if defined(__CUDACC_RTC__)
118
+ typedef unsigned int uint32_t;
119
+ typedef unsigned long long uint64_t;
120
+ typedef uint64_t uintptr_t;
121
+ # else
122
+ # include <stdint.h>
123
+ # endif
124
+
125
+ _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE
126
+
127
+ _CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) == 2, "Size mismatch for type 'short'");
128
+ _CUDA_PIPELINE_STATIC_ASSERT(sizeof(int) == 4, "Size mismatch for type 'int'");
129
+ _CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2) == 8, "Size mismatch for type 'int2'");
130
+ _CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4) == 16, "Size mismatch for type 'int4'");
131
+
132
+ extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
133
+
134
+ template<size_t CopySize, size_t SourceSize>
135
+ _CUDA_PIPELINE_QUALIFIER
136
+ void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
137
+ {
138
+ _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
139
+ _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
140
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
141
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
142
+
143
+ char* const d = reinterpret_cast<char*>(dst);
144
+ const char* const s = reinterpret_cast<const char*>(src);
145
+
146
+ size_t copy_step_size;
147
+ if (SourceSize == 0) {
148
+ copy_step_size = CopySize;
149
+ } else if (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize == 16) {
150
+ copy_step_size = SourceSize;
151
+ } else {
152
+ copy_step_size = 1;
153
+ }
154
+
155
+ for (size_t i = 0; i < CopySize; i += copy_step_size) {
156
+ const bool copy_source = SourceSize && (i < SourceSize);
157
+
158
+ switch (copy_step_size) {
159
+ case 1:
160
+ d[i] = copy_source ? s[i] : char();
161
+ break;
162
+ case 2:
163
+ *reinterpret_cast<short*>(d + i) = copy_source ? *reinterpret_cast<const short*>(s + i) : short();
164
+ break;
165
+ case 4:
166
+ *reinterpret_cast<int*>(d + i) = copy_source ? *reinterpret_cast<const int*>(s + i) : int();
167
+ break;
168
+ case 8:
169
+ *reinterpret_cast<int2*>(d + i) = copy_source ? *reinterpret_cast<const int2*>(s + i) : int2();
170
+ break;
171
+ case 16:
172
+ *reinterpret_cast<int4*>(d + i) = copy_source ? *reinterpret_cast<const int4*>(s + i) : int4();
173
+ break;
174
+ }
175
+ }
176
+ }
177
+
178
+ template<bool UseHwAsyncCopy>
179
+ struct ImplementationChooser;
180
+
181
+ template<>
182
+ struct ImplementationChooser<true> {
183
+ template<size_t CopySize, size_t SourceSize>
184
+ struct CpAsyncChooser {
185
+ _CUDA_PIPELINE_STATIC_QUALIFIER
186
+ void cp_async(void* __restrict__ dst, const void* __restrict__ src)
187
+ {
188
+ asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
189
+ :
190
+ : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),
191
+ "n"(SourceSize)
192
+ : "memory");
193
+ }
194
+ };
195
+
196
+ template<size_t SourceSize>
197
+ struct CpAsyncChooser<16, SourceSize> {
198
+ _CUDA_PIPELINE_STATIC_QUALIFIER
199
+ void cp_async(void* __restrict__ dst, const void* __restrict__ src)
200
+ {
201
+ asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
202
+ :
203
+ : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
204
+ : "memory");
205
+ }
206
+ };
207
+
208
+ template<size_t CopySize, size_t SourceSize>
209
+ _CUDA_PIPELINE_STATIC_QUALIFIER
210
+ void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
211
+ {
212
+ _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
213
+ _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
214
+ _CUDA_PIPELINE_ASSERT(__isShared(dst));
215
+ _CUDA_PIPELINE_ASSERT(__isGlobal(src));
216
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
217
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
218
+
219
+ CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
220
+ }
221
+
222
+ _CUDA_PIPELINE_STATIC_QUALIFIER
223
+ void pipeline_commit()
224
+ {
225
+ asm volatile ("cp.async.commit_group;");
226
+ }
227
+
228
+ template<unsigned N>
229
+ _CUDA_PIPELINE_STATIC_QUALIFIER
230
+ void pipeline_wait_prior()
231
+ {
232
+ asm volatile ("cp.async.wait_group %0;"
233
+ :
234
+ : "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));
235
+ }
236
+
237
+ _CUDA_PIPELINE_STATIC_QUALIFIER
238
+ void pipeline_arrive_on(uint64_t* barrier)
239
+ {
240
+ _CUDA_PIPELINE_ASSERT(__isShared(barrier));
241
+
242
+ asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
243
+ :
244
+ : "r"(__nvvm_get_smem_pointer(barrier)));
245
+ }
246
+ };
247
+
248
+ template<>
249
+ struct ImplementationChooser<false> {
250
+ template<size_t CopySize, size_t SourceSize>
251
+ _CUDA_PIPELINE_STATIC_QUALIFIER
252
+ void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
253
+ {
254
+ _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
255
+ _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
256
+ _CUDA_PIPELINE_ASSERT(__isShared(dst));
257
+ _CUDA_PIPELINE_ASSERT(__isGlobal(src));
258
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
259
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
260
+
261
+ pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
262
+ }
263
+
264
+ _CUDA_PIPELINE_STATIC_QUALIFIER
265
+ void pipeline_commit()
266
+ {
267
+ }
268
+
269
+ template<unsigned N>
270
+ _CUDA_PIPELINE_STATIC_QUALIFIER
271
+ void pipeline_wait_prior()
272
+ {
273
+ }
274
+
275
+ _CUDA_PIPELINE_STATIC_QUALIFIER
276
+ void pipeline_arrive_on(uint64_t* barrier)
277
+ {
278
+ }
279
+ };
280
+
281
+ template<size_t CopySize, size_t SourceSize>
282
+ _CUDA_PIPELINE_QUALIFIER
283
+ void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
284
+ {
285
+ _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
286
+ _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
287
+ _CUDA_PIPELINE_ASSERT(__isShared(dst));
288
+ _CUDA_PIPELINE_ASSERT(__isGlobal(src));
289
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
290
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
291
+
292
+ ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
293
+ }
294
+
295
+ _CUDA_PIPELINE_QUALIFIER
296
+ void pipeline_commit()
297
+ {
298
+ ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit();
299
+ }
300
+
301
+ template<unsigned N>
302
+ _CUDA_PIPELINE_QUALIFIER
303
+ void pipeline_wait_prior()
304
+ {
305
+ ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>();
306
+ }
307
+
308
+ _CUDA_PIPELINE_QUALIFIER
309
+ void pipeline_arrive_on(uint64_t* barrier)
310
+ {
311
+ ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier);
312
+ }
313
+
314
+ template<size_t CopySize, size_t SourceSize>
315
+ _CUDA_PIPELINE_QUALIFIER
316
+ void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
317
+ {
318
+ _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
319
+ _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
320
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
321
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
322
+
323
+ if (__isGlobal(src) && __isShared(dst)) {
324
+ pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
325
+ } else {
326
+ pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
327
+ }
328
+ }
329
+
330
+ template<size_t CopySize, size_t Align>
331
+ _CUDA_PIPELINE_QUALIFIER
332
+ void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
333
+ {
334
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
335
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));
336
+
337
+ const char* s = reinterpret_cast<const char*>(src);
338
+ char* d = reinterpret_cast<char*>(dst);
339
+ size_t remaining = CopySize;
340
+
341
+ while (remaining) {
342
+ if ((Align >= 16) && (remaining >= 16)) {
343
+ pipeline_copy_strict<16, 16>(dst, src);
344
+ d += 16;
345
+ s += 16;
346
+ remaining -= 16;
347
+ } else if ((Align >= 8) && (remaining >= 8)) {
348
+ pipeline_copy_strict<8, 8>(dst, src);
349
+ d += 8;
350
+ s += 8;
351
+ remaining -= 8;
352
+ } else if ((Align >= 4) && (remaining >= 4)) {
353
+ pipeline_copy_strict<4, 4>(dst, src);
354
+ d += 4;
355
+ s += 4;
356
+ remaining -= 4;
357
+ } else if ((Align >= 2) && (remaining >= 2)) {
358
+ *reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
359
+ d += 2;
360
+ s += 2;
361
+ remaining -= 2;
362
+ } else {
363
+ *d = *s;
364
+ d += 1;
365
+ s += 1;
366
+ remaining -= 1;
367
+ }
368
+ }
369
+ }
370
+
371
+ _CUDA_PIPELINE_END_INTERNAL_NAMESPACE
372
+
373
+ #endif /* !_CUDA_PIPELINE_HELPERS_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_PIPELINE_PRIMITIVES_H_
51
+ # define _CUDA_PIPELINE_PRIMITIVES_H_
52
+
53
+ # include "cuda_pipeline_helpers.h"
54
+
55
+ _CUDA_PIPELINE_STATIC_QUALIFIER
56
+ void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
57
+ size_t zfill = 0)
58
+ {
59
+ _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
60
+ _CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
61
+ _CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
62
+ _CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
63
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
64
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
65
+
66
+ switch (size_and_align) {
67
+ case 16:
68
+ switch (zfill) {
69
+ case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
70
+ case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
71
+ case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
72
+ case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
73
+ case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
74
+ case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
75
+ case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
76
+ case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 9>(dst_shared, src_global); return;
77
+ case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 8>(dst_shared, src_global); return;
78
+ case 9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 7>(dst_shared, src_global); return;
79
+ case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 6>(dst_shared, src_global); return;
80
+ case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 5>(dst_shared, src_global); return;
81
+ case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 4>(dst_shared, src_global); return;
82
+ case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 3>(dst_shared, src_global); return;
83
+ case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 2>(dst_shared, src_global); return;
84
+ case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 1>(dst_shared, src_global); return;
85
+ case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 0>(dst_shared, src_global); return;
86
+ default: _CUDA_PIPELINE_ABORT(); return;
87
+ }
88
+ case 8:
89
+ switch (zfill) {
90
+ case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 8>(dst_shared, src_global); return;
91
+ case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 7>(dst_shared, src_global); return;
92
+ case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 6>(dst_shared, src_global); return;
93
+ case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 5>(dst_shared, src_global); return;
94
+ case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 4>(dst_shared, src_global); return;
95
+ case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 3>(dst_shared, src_global); return;
96
+ case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 2>(dst_shared, src_global); return;
97
+ case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 1>(dst_shared, src_global); return;
98
+ case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 0>(dst_shared, src_global); return;
99
+ default: _CUDA_PIPELINE_ABORT(); return;
100
+ }
101
+ case 4:
102
+ switch (zfill) {
103
+ case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 4>(dst_shared, src_global); return;
104
+ case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 3>(dst_shared, src_global); return;
105
+ case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 2>(dst_shared, src_global); return;
106
+ case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 1>(dst_shared, src_global); return;
107
+ case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 0>(dst_shared, src_global); return;
108
+ default: _CUDA_PIPELINE_ABORT(); return;
109
+ }
110
+ default:
111
+ _CUDA_PIPELINE_ABORT();
112
+ return;
113
+ }
114
+ }
115
+
116
+ _CUDA_PIPELINE_STATIC_QUALIFIER
117
+ void __pipeline_commit()
118
+ {
119
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
120
+ }
121
+
122
+ _CUDA_PIPELINE_STATIC_QUALIFIER
123
+ void __pipeline_wait_prior(size_t prior)
124
+ {
125
+ switch (prior) {
126
+ case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); return;
127
+ case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); return;
128
+ case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); return;
129
+ case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); return;
130
+ case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); return;
131
+ case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); return;
132
+ case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); return;
133
+ case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); return;
134
+ default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); return;
135
+ }
136
+ }
137
+
138
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
139
+ # include "cuda_awbarrier_primitives.h"
140
+
141
+ _CUDA_PIPELINE_STATIC_QUALIFIER
142
+ void __pipeline_arrive_on(__mbarrier_t* barrier)
143
+ {
144
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier);
145
+ }
146
+ # endif
147
+
148
+ #endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__DEVICE_TYPES_H__)
51
+ #define __DEVICE_TYPES_H__
52
+
53
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
54
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
55
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
56
+ #endif
57
+
58
+ #ifndef __DOXYGEN_ONLY__
59
+ #include "crt/host_defines.h"
60
+ #endif
61
+
62
+ /*******************************************************************************
63
+ * *
64
+ * *
65
+ * *
66
+ *******************************************************************************/
67
+
68
+ enum __device_builtin__ cudaRoundMode
69
+ {
70
+ cudaRoundNearest,
71
+ cudaRoundZero,
72
+ cudaRoundPosInf,
73
+ cudaRoundMinInf
74
+ };
75
+
76
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
77
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
78
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
79
+ #endif
80
+
81
+ #endif /* !__DEVICE_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/host_defines.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
52
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
53
+ #endif
54
+
55
+ #include "crt/mma.h"
56
+
57
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__)
58
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
59
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
60
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_61_INTRINSICS_H__)
51
+ #define __SM_61_INTRINSICS_H__
52
+ /* Declaration specifiers placed in front of every intrinsic declared in this header: plain __device__ when __CUDACC_RTC__ is defined, static __device__ __inline__ otherwise. The macro is #undef'd again near the end of the header. */
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_61_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+ /* Trailer appended to each intrinsic declaration below. When __CUDA_ARCH__ is not defined it expands to an empty body "{ }", turning the declaration into an empty definition; when __CUDA_ARCH__ is defined it expands to ";", leaving a pure prototype. The macro is #undef'd again near the end of the header. */
71
+ #ifndef __CUDA_ARCH__
72
+ #define __DEF_IF_HOST { }
73
+ #else /* !__CUDA_ARCH__ */
74
+ #define __DEF_IF_HOST ;
75
+ #endif /* __CUDA_ARCH__ */
76
+
77
+ /*******************************************************************************
78
+ * *
79
+ * Below are declarations of SM-6.1 intrinsics which are included as *
80
+ * source (instead of being built in to the compiler) *
81
+ * *
82
+ *******************************************************************************/
83
+
84
+
85
+ /******************************************************************************
86
+ * __dp2a *
+ * *
+ * Declared as two functions, __dp2a_lo and __dp2a_hi. Each has four *
+ * overloads: signed and unsigned with int / unsigned int operands, and *
+ * signed and unsigned with short2 + char4 / ushort2 + uchar4 operands. *
+ * All take (srcA, srcB, c) and return the same type as c. *
+ * Every declaration ends in __DEF_IF_HOST, so it has an empty body when *
+ * __CUDA_ARCH__ is undefined and is a plain prototype otherwise. *
+ * NOTE(review): the arithmetic performed and the difference between the *
+ * _lo and _hi variants are not visible in this header; confirm against *
+ * sm_61_intrinsics.hpp or the CUDA integer intrinsics documentation. *
87
+ ******************************************************************************/
88
+ // __dp2a_lo: int / unsigned int operand overloads (signed first, then unsigned)
89
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) __DEF_IF_HOST
90
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
91
+ // __dp2a_lo: vector-typed operand overloads (short2 with char4, ushort2 with uchar4)
92
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
93
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
94
+ // __dp2a_hi: int / unsigned int operand overloads (signed first, then unsigned)
95
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) __DEF_IF_HOST
96
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
97
+ // __dp2a_hi: vector-typed operand overloads (short2 with char4, ushort2 with uchar4)
98
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
99
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
100
+
101
+
102
+ /******************************************************************************
103
+ * __dp4a *
+ * *
+ * Four overloads taking (srcA, srcB, c): signed and unsigned with *
+ * int / unsigned int operands, and signed and unsigned with char4 / *
+ * uchar4 operands. Each returns the same type as c. *
+ * Every declaration ends in __DEF_IF_HOST, so it has an empty body when *
+ * __CUDA_ARCH__ is undefined and is a plain prototype otherwise. *
+ * NOTE(review): the arithmetic performed is not visible in this header; *
+ * confirm against sm_61_intrinsics.hpp or the CUDA documentation. *
104
+ ******************************************************************************/
105
+ // int / unsigned int operand overloads (signed first, then unsigned)
106
+ __SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) __DEF_IF_HOST
107
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
108
+ // Vector-typed operand overloads (char4 pair, then uchar4 pair)
109
+ __SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) __DEF_IF_HOST
110
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
111
+
112
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
113
+
114
+ #endif /* __cplusplus && __CUDACC__ */
115
+
116
+ #undef __DEF_IF_HOST
117
+ #undef __SM_61_INTRINSICS_DECL__
118
+
119
+ #if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
120
+ #include "sm_61_intrinsics.hpp"
121
+ #endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
122
+
123
+ #endif /* !__SM_61_INTRINSICS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__TEXTURE_TYPES_H__)
51
+ #define __TEXTURE_TYPES_H__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "driver_types.h"
60
+
61
+ /**
62
+ * \addtogroup CUDART_TYPES
63
+ *
64
+ * @{
65
+ */
66
+
67
+ /*******************************************************************************
68
+ * *
69
+ * *
70
+ * *
71
+ *******************************************************************************/
72
+ /* Texture type codes. Each layered variant keeps the low nibble of its non-layered counterpart and sets the high nibble to 0xF (0x01 -> 0xF1, 0x02 -> 0xF2, 0x0C -> 0xFC). There is no layered code for the 3D type. */
73
+ #define cudaTextureType1D 0x01
74
+ #define cudaTextureType2D 0x02
75
+ #define cudaTextureType3D 0x03
76
+ #define cudaTextureTypeCubemap 0x0C
77
+ #define cudaTextureType1DLayered 0xF1
78
+ #define cudaTextureType2DLayered 0xF2
79
+ #define cudaTextureTypeCubemapLayered 0xFC
80
+
81
+ /**
82
+ * CUDA texture address modes
+ *
+ * Element type of the addressMode[3] arrays in textureReference,
+ * cudaTextureDesc and cudaTextureDesc_v2 (one entry per dimension).
83
+ */
84
+ enum __device_builtin__ cudaTextureAddressMode
85
+ {
86
+ cudaAddressModeWrap = 0, /**< Wrapping address mode */
87
+ cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
88
+ cudaAddressModeMirror = 2, /**< Mirror address mode */
89
+ cudaAddressModeBorder = 3 /**< Border address mode */
90
+ };
91
+
92
+ /**
93
+ * CUDA texture filter modes
+ *
+ * Type of both the filterMode and the mipmapFilterMode fields of the
+ * texture structs declared in this header.
94
+ */
95
+ enum __device_builtin__ cudaTextureFilterMode
96
+ {
97
+ cudaFilterModePoint = 0, /**< Point filter mode */
98
+ cudaFilterModeLinear = 1 /**< Linear filter mode */
99
+ };
100
+
101
+ /**
102
+ * CUDA texture read modes
+ *
+ * Type of the readMode field of cudaTextureDesc and cudaTextureDesc_v2.
103
+ */
104
+ enum __device_builtin__ cudaTextureReadMode
105
+ {
106
+ cudaReadModeElementType = 0, /**< Read texture as specified element type */
107
+ cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
108
+ };
109
+
110
+ /**
111
+ * CUDA texture reference
+ *
+ * Compared with cudaTextureDesc, this struct additionally carries a
+ * channel descriptor and a reserved tail, and has no readMode or
+ * borderColor field.
112
+ */
113
+ struct __device_builtin__ textureReference
114
+ {
115
+ /**
116
+ * Indicates whether texture reads are normalized or not
117
+ */
118
+ int normalized;
119
+ /**
120
+ * Texture filter mode
121
+ */
122
+ enum cudaTextureFilterMode filterMode;
123
+ /**
124
+ * Texture address mode for up to 3 dimensions (one entry per dimension)
125
+ */
126
+ enum cudaTextureAddressMode addressMode[3];
127
+ /**
128
+ * Channel descriptor for the texture reference
+ * (struct cudaChannelFormatDesc is not declared in this header;
+ * presumably it comes from the included driver_types.h)
129
+ */
130
+ struct cudaChannelFormatDesc channelDesc;
131
+ /**
132
+ * Perform sRGB->linear conversion during texture read
133
+ */
134
+ int sRGB;
135
+ /**
136
+ * Limit to the anisotropy ratio
137
+ */
138
+ unsigned int maxAnisotropy;
139
+ /**
140
+ * Mipmap filter mode
141
+ */
142
+ enum cudaTextureFilterMode mipmapFilterMode;
143
+ /**
144
+ * Offset applied to the supplied mipmap level
145
+ */
146
+ float mipmapLevelBias;
147
+ /**
148
+ * Lower end of the mipmap level range to clamp access to
149
+ */
150
+ float minMipmapLevelClamp;
151
+ /**
152
+ * Upper end of the mipmap level range to clamp access to
153
+ */
154
+ float maxMipmapLevelClamp;
155
+ /**
156
+ * Disable any trilinear filtering optimizations.
157
+ */
158
+ int disableTrilinearOptimization;
159
+ int __cudaReserved[14]; /* Reserved storage: 14 ints with no meaning documented in this header */
160
+ };
161
+
162
+ /**
163
+ * CUDA texture descriptor
+ *
+ * Twelve fields covering addressing, filtering, read mode, sRGB
+ * conversion, border color, coordinate normalization, anisotropy and
+ * mipmap settings. cudaTextureDesc_v2 repeats the same fields in the
+ * same order and appends seamlessCubemap.
164
+ */
165
+ struct __device_builtin__ cudaTextureDesc
166
+ {
167
+ /**
168
+ * Texture address mode for up to 3 dimensions (one entry per dimension)
169
+ */
170
+ enum cudaTextureAddressMode addressMode[3];
171
+ /**
172
+ * Texture filter mode
173
+ */
174
+ enum cudaTextureFilterMode filterMode;
175
+ /**
176
+ * Texture read mode
177
+ */
178
+ enum cudaTextureReadMode readMode;
179
+ /**
180
+ * Perform sRGB->linear conversion during texture read
181
+ */
182
+ int sRGB;
183
+ /**
184
+ * Texture border color (four float components)
185
+ */
186
+ float borderColor[4];
187
+ /**
188
+ * Indicates whether texture coordinates are normalized or not
189
+ */
190
+ int normalizedCoords;
191
+ /**
192
+ * Limit to the anisotropy ratio
193
+ */
194
+ unsigned int maxAnisotropy;
195
+ /**
196
+ * Mipmap filter mode
197
+ */
198
+ enum cudaTextureFilterMode mipmapFilterMode;
199
+ /**
200
+ * Offset applied to the supplied mipmap level
201
+ */
202
+ float mipmapLevelBias;
203
+ /**
204
+ * Lower end of the mipmap level range to clamp access to
205
+ */
206
+ float minMipmapLevelClamp;
207
+ /**
208
+ * Upper end of the mipmap level range to clamp access to
209
+ */
210
+ float maxMipmapLevelClamp;
211
+ /**
212
+ * Disable any trilinear filtering optimizations.
213
+ */
214
+ int disableTrilinearOptimization;
215
+ };
216
+ /** CUDA texture descriptor, version 2: the same fields as cudaTextureDesc in the same order, followed by one extra field, seamlessCubemap. */
217
+ struct __device_builtin__ cudaTextureDesc_v2
218
+ {
219
+ /**
220
+ * Texture address mode for up to 3 dimensions (one entry per dimension)
221
+ */
222
+ enum cudaTextureAddressMode addressMode[3];
223
+ /**
224
+ * Texture filter mode
225
+ */
226
+ enum cudaTextureFilterMode filterMode;
227
+ /**
228
+ * Texture read mode
229
+ */
230
+ enum cudaTextureReadMode readMode;
231
+ /**
232
+ * Perform sRGB->linear conversion during texture read
233
+ */
234
+ int sRGB;
235
+ /**
236
+ * Texture border color (four float components)
237
+ */
238
+ float borderColor[4];
239
+ /**
240
+ * Indicates whether texture coordinates are normalized or not
241
+ */
242
+ int normalizedCoords;
243
+ /**
244
+ * Limit to the anisotropy ratio
245
+ */
246
+ unsigned int maxAnisotropy;
247
+ /**
248
+ * Mipmap filter mode
249
+ */
250
+ enum cudaTextureFilterMode mipmapFilterMode;
251
+ /**
252
+ * Offset applied to the supplied mipmap level
253
+ */
254
+ float mipmapLevelBias;
255
+ /**
256
+ * Lower end of the mipmap level range to clamp access to
257
+ */
258
+ float minMipmapLevelClamp;
259
+ /**
260
+ * Upper end of the mipmap level range to clamp access to
261
+ */
262
+ float maxMipmapLevelClamp;
263
+ /**
264
+ * Disable any trilinear filtering optimizations.
265
+ */
266
+ int disableTrilinearOptimization;
267
+ /**
268
+ * Enable seamless cube map filtering.
+ * This is the only field not present in cudaTextureDesc.
269
+ */
270
+ int seamlessCubemap;
271
+ };
272
+
273
+ /**
274
+ * An opaque value that represents a CUDA texture object
275
+ */
276
+ typedef __device_builtin__ unsigned long long cudaTextureObject_t;
277
+
278
+ /** @} */
279
+ /** @} */ /* END CUDART_TYPES */
280
+
281
+ #endif /* !__TEXTURE_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.h ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__VECTOR_FUNCTIONS_H__)
51
+ #define __VECTOR_FUNCTIONS_H__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "cuda_runtime_api.h"
60
+ /* Declaration specifiers placed in front of every make_* prototype below: __host__ __device__ when __CUDACC_RTC__ is defined, static __inline__ __host__ __device__ otherwise. The macro is #undef'd after the last prototype. */
61
+ #if defined(__CUDACC_RTC__)
62
+ #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
63
+ #else /* !__CUDACC_RTC__ */
64
+ #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
65
+ #endif /* __CUDACC_RTC__ */
66
+
67
+ /*******************************************************************************
68
+ * *
69
+ * *
70
+ * *
71
+ *******************************************************************************/
72
+ /* Prototypes of the make_<type><N> functions, N = 1..4: each takes N scalar components (x, y, z, w in that order) and returns the vector type of the same name. Only prototypes appear here; vector_functions.hpp is included at the end of this header when __CUDACC_RTC__ is not defined and presumably supplies the bodies. First family: signed char / unsigned char. */
73
+ __VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
74
+
75
+ __VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
76
+
77
+ __VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
78
+
79
+ __VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
80
+
81
+ __VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
82
+
83
+ __VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
84
+
85
+ __VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
86
+
87
+ __VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
88
+ /* short / unsigned short family */
89
+ __VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
90
+
91
+ __VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
92
+
93
+ __VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
94
+
95
+ __VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
96
+
97
+ __VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
98
+
99
+ __VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
100
+
101
+ __VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
102
+
103
+ __VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
104
+ /* int / unsigned int family */
105
+ __VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
106
+
107
+ __VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
108
+
109
+ __VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
110
+
111
+ __VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
112
+
113
+ __VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
114
+
115
+ __VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
116
+
117
+ __VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
118
+
119
+ __VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
120
+ /* long int / unsigned long int family */
121
+ __VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
122
+
123
+ __VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
124
+
125
+ __VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
126
+
127
+ __VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
128
+
129
+ __VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
130
+
131
+ __VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
132
+
133
+ __VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
134
+
135
+ __VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
136
+ /* float family */
137
+ __VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
138
+
139
+ __VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
140
+
141
+ __VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
142
+
143
+ __VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
144
+ /* long long int / unsigned long long int family */
145
+ __VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
146
+
147
+ __VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
148
+
149
+ __VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
150
+
151
+ __VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
152
+
153
+ __VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
154
+
155
+ __VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
156
+
157
+ __VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
158
+
159
+ __VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
160
+ /* double family */
161
+ __VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
162
+
163
+ __VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
164
+
165
+ __VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
166
+
167
+ __VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
168
+
169
+ #undef __VECTOR_FUNCTIONS_DECL__
170
+
171
+ #if !defined(__CUDACC_RTC__)
172
+ #include "vector_functions.hpp"
173
+ #endif /* !__CUDACC_RTC__ */
174
+
175
+ #endif /* !__VECTOR_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ #ifndef CURAND_GLOBALS_H
49
+ #define CURAND_GLOBALS_H
50
+ /* Numeric constants. Names ending in _DOUBLE hold double literals; their unsuffixed counterparts hold float (f-suffixed) literals of the same quantity. */
51
+ #define MAX_XOR_N (5)
52
+ #define SKIPAHEAD_BLOCKSIZE (4)
53
+ #define SKIPAHEAD_MASK ((1<<SKIPAHEAD_BLOCKSIZE)-1) /* low SKIPAHEAD_BLOCKSIZE bits set, i.e. 0xF */
54
+ #define CURAND_2POW32 (4294967296.f) /* 2^32 as float */
55
+ #define CURAND_2POW32_DOUBLE (4294967296.) /* 2^32 as double */
56
+ #define CURAND_2POW32_INV (2.3283064e-10f) /* 2^-32 as float */
57
+ #define CURAND_2POW32_INV_DOUBLE (2.3283064365386963e-10) /* 2^-32 as double */
58
+ #define CURAND_2POW53_INV_DOUBLE (1.1102230246251565e-16) /* 2^-53 as double */
59
+ #define CURAND_2POW32_INV_2PI (2.3283064e-10f * 6.2831855f) /* 2^-32 * 2*pi as float */
60
+ #define CURAND_2PI (6.2831855f) /* 2*pi as float */
61
+ #define CURAND_2POW53_INV_2PI_DOUBLE (1.1102230246251565e-16 * 6.2831853071795860) /* 2^-53 * 2*pi as double */
62
+ #define CURAND_PI_DOUBLE (3.1415926535897932) /* pi as double */
63
+ #define CURAND_2PI_DOUBLE (6.2831853071795860) /* 2*pi as double */
64
+ #define CURAND_SQRT2 (-1.4142135f) /* NOTE: the value is negative, i.e. -sqrt(2), as float */
65
+ #define CURAND_SQRT2_DOUBLE (-1.4142135623730951) /* NOTE: the value is negative, i.e. -sqrt(2), as double */
66
+ /* NOTE(review): the integer limits below are consumed outside this header; their exact meaning is not visible here. */
67
+ #define SOBOL64_ITR_BINARY_DIVIDE 2
68
+ #define SOBOL_M2_BINARY_DIVIDE 10
69
+ #define MTGP32_M2_BINARY_DIVIDE 32
70
+ #define MAX_LAMBDA 400000
71
+ #define MIN_GAUSS_LAMBDA 2000
72
+ /* Pair of float parameters (mean, stddev); presumably the parameters of a normal distribution, going by the struct name. */
73
+ struct normal_args_st {
74
+ float mean; /* mean parameter, single precision */
75
+ float stddev; /* standard-deviation parameter, single precision */
76
+ };
77
+ /* Typedef so the struct can be named without the struct keyword. */
78
+ typedef struct normal_args_st normal_args_t;
79
+ /* Double-precision counterpart of normal_args_st: the same two fields, stored as double. */
80
+ struct normal_args_double_st {
81
+ double mean; /* mean parameter, double precision */
82
+ double stddev; /* standard-deviation parameter, double precision */
83
+ };
84
+ /* Typedef so the struct can be named without the struct keyword. */
85
+ typedef struct normal_args_double_st normal_args_double_t;
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_LOGNORMAL_H_)
52
+ #define CURAND_LOGNORMAL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+ /**
69
+ * \brief Return a log-normally distributed float from an XORWOW generator.
70
+ *
71
+ * Return a single log-normally distributed float derived from a normal
72
+ * distribution with mean \p mean and standard deviation \p stddev
73
+ * from the XORWOW generator in \p state,
74
+ * increment position of generator by one.
75
+ *
76
+ * The implementation uses a Box-Muller transform to generate two
77
+ * normally distributed results, transforms them to log-normal distribution,
78
+ * then returns them one at a time.
79
+ * See ::curand_log_normal2() for a more efficient version that returns
80
+ * both results at once.
81
+ *
82
+ * \param state - Pointer to state to update
83
+ * \param mean - Mean of the related normal distribution
84
+ * \param stddev - Standard deviation of the related normal distribution
85
+ *
86
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
87
+ */
88
+ QUALIFIERS float curand_log_normal(curandStateXORWOW_t *state, float mean, float stddev)
89
+ {
90
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
91
+ unsigned int x, y;
92
+ x = curand(state);
93
+ y = curand(state);
94
+ float2 v = _curand_box_muller(x, y);
95
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
96
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
97
+ return expf(mean + (stddev * v.x));
98
+ }
99
+ state->boxmuller_flag = 0;
100
+ return state->boxmuller_extra;
101
+ }
102
+
103
+ /**
104
+ * \brief Return a log-normally distributed float from a Philox4_32_10 generator.
105
+ *
106
+ * Return a single log-normally distributed float derived from a normal
107
+ * distribution with mean \p mean and standard deviation \p stddev
108
+ * from the Philox4_32_10 generator in \p state,
109
+ * increment position of generator by one.
110
+ *
111
+ * The implementation uses a Box-Muller transform to generate two
112
+ * normally distributed results, transforms them to log-normal distribution,
113
+ * then returns them one at a time.
114
+ * See ::curand_log_normal2() for a more efficient version that returns
115
+ * both results at once.
116
+ *
117
+ * \param state - Pointer to state to update
118
+ * \param mean - Mean of the related normal distribution
119
+ * \param stddev - Standard deviation of the related normal distribution
120
+ *
121
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
122
+ */
123
+
124
+ QUALIFIERS float curand_log_normal(curandStatePhilox4_32_10_t *state, float mean, float stddev)
125
+ {
126
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
127
+ unsigned int x, y;
128
+ x = curand(state);
129
+ y = curand(state);
130
+ float2 v = _curand_box_muller(x, y);
131
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
132
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
133
+ return expf(mean + (stddev * v.x));
134
+ }
135
+ state->boxmuller_flag = 0;
136
+ return state->boxmuller_extra;
137
+ }
138
+
139
+ /**
140
+ * \brief Return two log-normally distributed floats from an XORWOW generator.
141
+ *
142
+ * Return two log-normally distributed floats derived from a normal
143
+ * distribution with mean \p mean and standard deviation \p stddev
144
+ * from the XORWOW generator in \p state,
145
+ * increment position of generator by two.
146
+ *
147
+ * The implementation uses a Box-Muller transform to generate two
148
+ * normally distributed results, then transforms them to log-normal.
149
+ *
150
+ * \param state - Pointer to state to update
151
+ * \param mean - Mean of the related normal distribution
152
+ * \param stddev - Standard deviation of the related normal distribution
153
+ *
154
+ * \return Log-normally distributed float2 where each element is from a
155
+ * distribution with mean \p mean and standard deviation \p stddev
156
+ */
157
+ QUALIFIERS float2 curand_log_normal2(curandStateXORWOW_t *state, float mean, float stddev)
158
+ {
159
+ float2 v = curand_box_muller(state);
160
+ v.x = expf(mean + (stddev * v.x));
161
+ v.y = expf(mean + (stddev * v.y));
162
+ return v;
163
+ }
164
+
165
+ /**
166
+ * \brief Return two log-normally distributed floats from a Philox4_32_10 generator.
167
+ *
168
+ * Return two log-normally distributed floats derived from a normal
169
+ * distribution with mean \p mean and standard deviation \p stddev
170
+ * from the Philox4_32_10 generator in \p state,
171
+ * increment position of generator by two.
172
+ *
173
+ * The implementation uses a Box-Muller transform to generate two
174
+ * normally distributed results, then transforms them to log-normal.
175
+ *
176
+ * \param state - Pointer to state to update
177
+ * \param mean - Mean of the related normal distribution
178
+ * \param stddev - Standard deviation of the related normal distribution
179
+ *
180
+ * \return Log-normally distributed float2 where each element is from a
181
+ * distribution with mean \p mean and standard deviation \p stddev
182
+ */
183
+ QUALIFIERS float2 curand_log_normal2(curandStatePhilox4_32_10_t *state, float mean, float stddev)
184
+ {
185
+ float2 v = curand_box_muller(state);
186
+ v.x = expf(mean + (stddev * v.x));
187
+ v.y = expf(mean + (stddev * v.y));
188
+ return v;
189
+ }
190
+ /**
191
+ * \brief Return four log-normally distributed floats from a Philox4_32_10 generator.
192
+ *
193
+ * Return four log-normally distributed floats derived from a normal
194
+ * distribution with mean \p mean and standard deviation \p stddev
195
+ * from the Philox4_32_10 generator in \p state,
196
+ * increment position of generator by four.
197
+ *
198
+ * The implementation uses a Box-Muller transform to generate four
199
+ * normally distributed results, then transforms them to log-normal.
200
+ *
201
+ * \param state - Pointer to state to update
202
+ * \param mean - Mean of the related normal distribution
203
+ * \param stddev - Standard deviation of the related normal distribution
204
+ *
205
+ * \return Log-normally distributed float4 where each element is from a
206
+ * distribution with mean \p mean and standard deviation \p stddev
207
+ */
208
+ QUALIFIERS float4 curand_log_normal4(curandStatePhilox4_32_10_t *state, float mean, float stddev)
209
+ {
210
+ float4 v = curand_box_muller4(state);
211
+ v.x = expf(mean + (stddev * v.x));
212
+ v.y = expf(mean + (stddev * v.y));
213
+ v.z = expf(mean + (stddev * v.z));
214
+ v.w = expf(mean + (stddev * v.w));
215
+ return v;
216
+ }
217
+
218
+ /**
219
+ * \brief Return a log-normally distributed float from an MRG32k3a generator.
220
+ *
221
+ * Return a single log-normally distributed float derived from a normal
222
+ * distribution with mean \p mean and standard deviation \p stddev
223
+ * from the MRG32k3a generator in \p state,
224
+ * increment position of generator by one.
225
+ *
226
+ * The implementation uses a Box-Muller transform to generate two
227
+ * normally distributed results, transforms them to log-normal distribution,
228
+ * then returns them one at a time.
229
+ * See ::curand_log_normal2() for a more efficient version that returns
230
+ * both results at once.
231
+ *
232
+ * \param state - Pointer to state to update
233
+ * \param mean - Mean of the related normal distribution
234
+ * \param stddev - Standard deviation of the related normal distribution
235
+ *
236
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
237
+ */
238
+ QUALIFIERS float curand_log_normal(curandStateMRG32k3a_t *state, float mean, float stddev)
239
+ {
240
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
241
+ float2 v = curand_box_muller_mrg(state);
242
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
243
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
244
+ return expf(mean + (stddev * v.x));
245
+ }
246
+ state->boxmuller_flag = 0;
247
+ return state->boxmuller_extra;
248
+ }
249
+
250
+ /**
251
+ * \brief Return two log-normally distributed floats from an MRG32k3a generator.
252
+ *
253
+ * Return two log-normally distributed floats derived from a normal
254
+ * distribution with mean \p mean and standard deviation \p stddev
255
+ * from the MRG32k3a generator in \p state,
256
+ * increment position of generator by two.
257
+ *
258
+ * The implementation uses a Box-Muller transform to generate two
259
+ * normally distributed results, then transforms them to log-normal.
260
+ *
261
+ * \param state - Pointer to state to update
262
+ * \param mean - Mean of the related normal distribution
263
+ * \param stddev - Standard deviation of the related normal distribution
264
+ *
265
+ * \return Log-normally distributed float2 where each element is from a
266
+ * distribution with mean \p mean and standard deviation \p stddev
267
+ */
268
+ QUALIFIERS float2 curand_log_normal2(curandStateMRG32k3a_t *state, float mean, float stddev)
269
+ {
270
+ float2 v = curand_box_muller_mrg(state);
271
+ v.x = expf(mean + (stddev * v.x));
272
+ v.y = expf(mean + (stddev * v.y));
273
+ return v;
274
+ }
275
+
276
+ /**
277
+ * \brief Return a log-normally distributed float from an MTGP32 generator.
278
+ *
279
+ * Return a single log-normally distributed float derived from a normal
280
+ * distribution with mean \p mean and standard deviation \p stddev
281
+ * from the MTGP32 generator in \p state,
282
+ * increment position of generator.
283
+ *
284
+ * The implementation uses the inverse cumulative distribution function
285
+ * to generate a normally distributed result, then transforms the result
286
+ * to log-normal.
287
+ *
288
+ * \param state - Pointer to state to update
289
+ * \param mean - Mean of the related normal distribution
290
+ * \param stddev - Standard deviation of the related normal distribution
291
+ *
292
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
293
+ */
294
+ QUALIFIERS float curand_log_normal(curandStateMtgp32_t *state, float mean, float stddev)
295
+ {
296
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
297
+ }
298
+
299
+ /**
300
+ * \brief Return a log-normally distributed float from a Sobol32 generator.
301
+ *
302
+ * Return a single log-normally distributed float derived from a normal
303
+ * distribution with mean \p mean and standard deviation \p stddev
304
+ * from the Sobol32 generator in \p state,
305
+ * increment position of generator by one.
306
+ *
307
+ * The implementation uses the inverse cumulative distribution function
308
+ * to generate a normally distributed result, then transforms the result
309
+ * to log-normal.
310
+ *
311
+ * \param state - Pointer to state to update
312
+ * \param mean - Mean of the related normal distribution
313
+ * \param stddev - Standard deviation of the related normal distribution
314
+ *
315
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
316
+ */
317
+ QUALIFIERS float curand_log_normal(curandStateSobol32_t *state, float mean, float stddev)
318
+ {
319
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
320
+ }
321
+ /**
322
+ * \brief Return a log-normally distributed float from a scrambled Sobol32 generator.
323
+ *
324
+ * Return a single log-normally distributed float derived from a normal
325
+ * distribution with mean \p mean and standard deviation \p stddev
326
+ * from the scrambled Sobol32 generator in \p state,
327
+ * increment position of generator by one.
328
+ *
329
+ * The implementation uses the inverse cumulative distribution function
330
+ * to generate a normally distributed result, then transforms the result
331
+ * to log-normal.
332
+ *
333
+ * \param state - Pointer to state to update
334
+ * \param mean - Mean of the related normal distribution
335
+ * \param stddev - Standard deviation of the related normal distribution
336
+ *
337
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
338
+ */
339
+ QUALIFIERS float curand_log_normal(curandStateScrambledSobol32_t *state, float mean, float stddev)
340
+ {
341
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
342
+ }
343
+
344
+ /**
345
+ * \brief Return a log-normally distributed float from a Sobol64 generator.
346
+ *
347
+ * Return a single log-normally distributed float derived from a normal
348
+ * distribution with mean \p mean and standard deviation \p stddev
349
+ * from the Sobol64 generator in \p state,
350
+ * increment position of generator by one.
351
+ *
352
+ * The implementation uses the inverse cumulative distribution function
353
+ * to generate normally distributed results, then converts to log-normal
354
+ * distribution.
355
+ *
356
+ * \param state - Pointer to state to update
357
+ * \param mean - Mean of the related normal distribution
358
+ * \param stddev - Standard deviation of the related normal distribution
359
+ *
360
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
361
+ */
362
+ QUALIFIERS float curand_log_normal(curandStateSobol64_t *state, float mean, float stddev)
363
+ {
364
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
365
+ }
366
+
367
+ /**
368
+ * \brief Return a log-normally distributed float from a scrambled Sobol64 generator.
369
+ *
370
+ * Return a single log-normally distributed float derived from a normal
371
+ * distribution with mean \p mean and standard deviation \p stddev
372
+ * from the scrambled Sobol64 generator in \p state,
373
+ * increment position of generator by one.
374
+ *
375
+ * The implementation uses the inverse cumulative distribution function
376
+ * to generate normally distributed results, then converts to log-normal
377
+ * distribution.
378
+ *
379
+ * \param state - Pointer to state to update
380
+ * \param mean - Mean of the related normal distribution
381
+ * \param stddev - Standard deviation of the related normal distribution
382
+ *
383
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
384
+ */
385
+ QUALIFIERS float curand_log_normal(curandStateScrambledSobol64_t *state, float mean, float stddev)
386
+ {
387
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
388
+ }
389
+
390
+ /**
391
+ * \brief Return a log-normally distributed double from an XORWOW generator.
392
+ *
393
+ * Return a single log-normally distributed double derived from a normal
394
+ * distribution with mean \p mean and standard deviation \p stddev
395
+ * from the XORWOW generator in \p state,
396
+ * increment position of generator.
397
+ *
398
+ * The implementation uses a Box-Muller transform to generate two
399
+ * normally distributed results, transforms them to log-normal distribution,
400
+ * then returns them one at a time.
401
+ * See ::curand_log_normal2_double() for a more efficient version that returns
402
+ * both results at once.
403
+ *
404
+ * \param state - Pointer to state to update
405
+ * \param mean - Mean of the related normal distribution
406
+ * \param stddev - Standard deviation of the related normal distribution
407
+ *
408
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
409
+ */
410
+
411
+ QUALIFIERS double curand_log_normal_double(curandStateXORWOW_t *state, double mean, double stddev)
412
+ {
413
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
414
+ unsigned int x0, x1, y0, y1;
415
+ x0 = curand(state);
416
+ x1 = curand(state);
417
+ y0 = curand(state);
418
+ y1 = curand(state);
419
+ double2 v = _curand_box_muller_double(x0, x1, y0, y1);
420
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
421
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
422
+ return exp(mean + (stddev * v.x));
423
+ }
424
+ state->boxmuller_flag_double = 0;
425
+ return state->boxmuller_extra_double;
426
+ }
427
+
428
+ /**
429
+ * \brief Return a log-normally distributed double from a Philox4_32_10 generator.
430
+ *
431
+ * Return a single log-normally distributed double derived from a normal
432
+ * distribution with mean \p mean and standard deviation \p stddev
433
+ * from the Philox4_32_10 generator in \p state,
434
+ * increment position of generator.
435
+ *
436
+ * The implementation uses a Box-Muller transform to generate two
437
+ * normally distributed results, transforms them to log-normal distribution,
438
+ * then returns them one at a time.
439
+ * See ::curand_log_normal2_double() for a more efficient version that returns
440
+ * both results at once.
441
+ *
442
+ * \param state - Pointer to state to update
443
+ * \param mean - Mean of the related normal distribution
444
+ * \param stddev - Standard deviation of the related normal distribution
445
+ *
446
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
447
+ */
448
+
449
+ QUALIFIERS double curand_log_normal_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
450
+ {
451
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
452
+ uint4 _x;
453
+ _x = curand4(state);
454
+ double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
455
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
456
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
457
+ return exp(mean + (stddev * v.x));
458
+ }
459
+ state->boxmuller_flag_double = 0;
460
+ return state->boxmuller_extra_double;
461
+ }
462
+
463
+
464
+ /**
465
+ * \brief Return two log-normally distributed doubles from an XORWOW generator.
466
+ *
467
+ * Return two log-normally distributed doubles derived from a normal
468
+ * distribution with mean \p mean and standard deviation \p stddev
469
+ * from the XORWOW generator in \p state,
470
+ * increment position of generator by two.
471
+ *
472
+ * The implementation uses a Box-Muller transform to generate two
473
+ * normally distributed results, and transforms them to log-normal distribution.
474
+ *
475
+ * \param state - Pointer to state to update
476
+ * \param mean - Mean of the related normal distribution
477
+ * \param stddev - Standard deviation of the related normal distribution
478
+ *
479
+ * \return Log-normally distributed double2 where each element is from a
480
+ * distribution with mean \p mean and standard deviation \p stddev
481
+ */
482
+ QUALIFIERS double2 curand_log_normal2_double(curandStateXORWOW_t *state, double mean, double stddev)
483
+ {
484
+ double2 v = curand_box_muller_double(state);
485
+ v.x = exp(mean + (stddev * v.x));
486
+ v.y = exp(mean + (stddev * v.y));
487
+ return v;
488
+ }
489
+
490
+ /**
491
+ * \brief Return two log-normally distributed doubles from a Philox4_32_10 generator.
492
+ *
493
+ * Return two log-normally distributed doubles derived from a normal
494
+ * distribution with mean \p mean and standard deviation \p stddev
495
+ * from the Philox4_32_10 generator in \p state,
496
+ * increment position of generator by four.
497
+ *
498
+ * The implementation uses a Box-Muller transform to generate two
499
+ * normally distributed results, and transforms them to log-normal distribution.
500
+ *
501
+ * \param state - Pointer to state to update
502
+ * \param mean - Mean of the related normal distribution
503
+ * \param stddev - Standard deviation of the related normal distribution
504
+ *
505
+ * \return Log-normally distributed double2 where each element is from a
506
+ * distribution with mean \p mean and standard deviation \p stddev
507
+ */
508
+ QUALIFIERS double2 curand_log_normal2_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
509
+ {
510
+ double2 v = curand_box_muller2_double(state);
511
+ v.x = exp(mean + (stddev * v.x));
512
+ v.y = exp(mean + (stddev * v.y));
513
+ return v;
514
+ }
515
+ // not part of API
516
+ QUALIFIERS double4 curand_log_normal4_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
517
+ {
518
+ double4 v = curand_box_muller4_double(state);
519
+ v.x = exp(mean + (stddev * v.x));
520
+ v.y = exp(mean + (stddev * v.y));
521
+ v.z = exp(mean + (stddev * v.z));
522
+ v.w = exp(mean + (stddev * v.w));
523
+ return v;
524
+ }
525
+
526
+ /**
527
+ * \brief Return a log-normally distributed double from an MRG32k3a generator.
528
+ *
529
+ * Return a single log-normally distributed double derived from a normal
530
+ * distribution with mean \p mean and standard deviation \p stddev
531
+ * from the MRG32k3a generator in \p state,
532
+ * increment position of generator.
533
+ *
534
+ * The implementation uses a Box-Muller transform to generate two
535
+ * normally distributed results, transforms them to log-normal distribution,
536
+ * then returns them one at a time.
537
+ * See ::curand_log_normal2_double() for a more efficient version that returns
538
+ * both results at once.
539
+ *
540
+ * \param state - Pointer to state to update
541
+ * \param mean - Mean of the related normal distribution
542
+ * \param stddev - Standard deviation of the related normal distribution
543
+ *
544
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
545
+ */
546
+ QUALIFIERS double curand_log_normal_double(curandStateMRG32k3a_t *state, double mean, double stddev)
547
+ {
548
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
549
+ double2 v = curand_box_muller_mrg_double(state);
550
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
551
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
552
+ return exp(mean + (stddev * v.x));
553
+ }
554
+ state->boxmuller_flag_double = 0;
555
+ return state->boxmuller_extra_double;
556
+ }
557
+
558
+ /**
559
+ * \brief Return two log-normally distributed doubles from an MRG32k3a generator.
560
+ *
561
+ * Return two log-normally distributed doubles derived from a normal
562
+ * distribution with mean \p mean and standard deviation \p stddev
563
+ * from the MRG32k3a generator in \p state,
564
+ * increment position of generator by two.
565
+ *
566
+ * The implementation uses a Box-Muller transform to generate two
567
+ * normally distributed results, and transforms them to log-normal distribution.
568
+ *
569
+ * \param state - Pointer to state to update
570
+ * \param mean - Mean of the related normal distribution
571
+ * \param stddev - Standard deviation of the related normal distribution
572
+ *
573
+ * \return Log-normally distributed double2 where each element is from a
574
+ * distribution with mean \p mean and standard deviation \p stddev
575
+ */
576
+ QUALIFIERS double2 curand_log_normal2_double(curandStateMRG32k3a_t *state, double mean, double stddev)
577
+ {
578
+ double2 v = curand_box_muller_mrg_double(state);
579
+ v.x = exp(mean + (stddev * v.x));
580
+ v.y = exp(mean + (stddev * v.y));
581
+ return v;
582
+ }
583
+
584
+ /**
585
+ * \brief Return a log-normally distributed double from an MTGP32 generator.
586
+ *
587
+ * Return a single log-normally distributed double derived from a normal
588
+ * distribution with mean \p mean and standard deviation \p stddev
589
+ * from the MTGP32 generator in \p state,
590
+ * increment position of generator.
591
+ *
592
+ * The implementation uses the inverse cumulative distribution function
593
+ * to generate normally distributed results, and transforms them into
594
+ * log-normal distribution.
595
+ *
596
+ * \param state - Pointer to state to update
597
+ * \param mean - Mean of the related normal distribution
598
+ * \param stddev - Standard deviation of the related normal distribution
599
+ *
600
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
601
+ */
602
+ QUALIFIERS double curand_log_normal_double(curandStateMtgp32_t *state, double mean, double stddev)
603
+ {
604
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
605
+ }
606
+
607
+ /**
608
+ * \brief Return a log-normally distributed double from a Sobol32 generator.
609
+ *
610
+ * Return a single log-normally distributed double derived from a normal
611
+ * distribution with mean \p mean and standard deviation \p stddev
612
+ * from the Sobol32 generator in \p state,
613
+ * increment position of generator by one.
614
+ *
615
+ * The implementation uses the inverse cumulative distribution function
616
+ * to generate normally distributed results, and transforms them into
617
+ * log-normal distribution.
618
+ *
619
+ * \param state - Pointer to state to update
620
+ * \param mean - Mean of the related normal distribution
621
+ * \param stddev - Standard deviation of the related normal distribution
622
+ *
623
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
624
+ */
625
+ QUALIFIERS double curand_log_normal_double(curandStateSobol32_t *state, double mean, double stddev)
626
+ {
627
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
628
+ }
629
+
630
+ /**
631
+ * \brief Return a log-normally distributed double from a scrambled Sobol32 generator.
632
+ *
633
+ * Return a single log-normally distributed double derived from a normal
634
+ * distribution with mean \p mean and standard deviation \p stddev
635
+ * from the scrambled Sobol32 generator in \p state,
636
+ * increment position of generator by one.
637
+ *
638
+ * The implementation uses the inverse cumulative distribution function
639
+ * to generate normally distributed results, and transforms them into
640
+ * log-normal distribution.
641
+ *
642
+ * \param state - Pointer to state to update
643
+ * \param mean - Mean of the related normal distribution
644
+ * \param stddev - Standard deviation of the related normal distribution
645
+ *
646
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
647
+ */
648
+ QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol32_t *state, double mean, double stddev)
649
+ {
650
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
651
+ }
652
+
653
+ /**
654
+ * \brief Return a log-normally distributed double from a Sobol64 generator.
655
+ *
656
+ * Return a single log-normally distributed double derived from a normal
657
+ * distribution with mean \p mean and standard deviation \p stddev
658
+ * from the Sobol64 generator in \p state,
659
+ * increment position of generator by one.
660
+ *
661
+ * The implementation uses the inverse cumulative distribution function
662
+ * to generate normally distributed results, and transforms them into log-normal distribution.
663
+ *
664
+ * \param state - Pointer to state to update
665
+ * \param mean - Mean of the related normal distribution
666
+ * \param stddev - Standard deviation of the related normal distribution
667
+ *
668
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
669
+ */
670
+ QUALIFIERS double curand_log_normal_double(curandStateSobol64_t *state, double mean, double stddev)
671
+ {
672
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
673
+ }
674
+
675
+ /**
676
+ * \brief Return a log-normally distributed double from a scrambled Sobol64 generator.
677
+ *
678
+ * Return a single log-normally distributed double derived from a normal
679
+ * distribution with mean \p mean and standard deviation \p stddev
680
+ * from the scrambled Sobol64 generator in \p state,
681
+ * increment position of generator by one.
682
+ *
683
+ * The implementation uses the inverse cumulative distribution function
684
+ * to generate normally distributed results, and transforms them into log-normal distribution.
685
+ *
686
+ * \param state - Pointer to state to update
687
+ * \param mean - Mean of the related normal distribution
688
+ * \param stddev - Standard deviation of the related normal distribution
689
+ *
690
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
691
+ */
692
+ QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol64_t *state, double mean, double stddev)
693
+ {
694
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
695
+ }
696
+
697
+ #endif // !defined(CURAND_LOGNORMAL_H_)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ #ifndef CURAND_NORMAL_STATIC_H
49
+ #define CURAND_NORMAL_STATIC_H
50
+
51
+ #define QUALIFIERS_STATIC __host__ __device__ __forceinline__
52
+
53
+ QUALIFIERS_STATIC float _curand_normal_icdf(unsigned int x)
54
+ {
55
+ #if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
56
+ float s = CURAND_SQRT2;
57
+ // Mirror to avoid loss of precision
58
+ if(x > 0x80000000UL) {
59
+ x = 0xffffffffUL - x;
60
+ s = -s;
61
+ }
62
+ float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
63
+ // p is in (0, 0.5], 2p is in (0, 1]
64
+ return s * erfcinvf(2.0f * p);
65
+ #else
66
+ x++; //suppress warnings
67
+ return 0.0f;
68
+ #endif
69
+ }
70
+
71
+ QUALIFIERS_STATIC float _curand_normal_icdf(unsigned long long x)
72
+ {
73
+ #if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
74
+ unsigned int t = (unsigned int)(x >> 32);
75
+ float s = CURAND_SQRT2;
76
+ // Mirror to avoid loss of precision
77
+ if(t > 0x80000000UL) {
78
+ t = 0xffffffffUL - t;
79
+ s = -s;
80
+ }
81
+ float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
82
+ // p is in (0, 0.5], 2p is in (0, 1]
83
+ return s * erfcinvf(2.0f * p);
84
+ #else
85
+ x++;
86
+ return 0.0f;
87
+ #endif
88
+ }
89
+
90
+ QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned int x)
91
+ {
92
+ #if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
93
+ double s = CURAND_SQRT2_DOUBLE;
94
+ // Mirror to avoid loss of precision
95
+ if(x > 0x80000000UL) {
96
+ x = 0xffffffffUL - x;
97
+ s = -s;
98
+ }
99
+ double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
100
+ // p is in (0, 0.5], 2p is in (0, 1]
101
+ return s * erfcinv(2.0 * p);
102
+ #else
103
+ x++;
104
+ return 0.0;
105
+ #endif
106
+ }
107
+
108
+ QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned long long x)
109
+ {
110
+ #if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
111
+ double s = CURAND_SQRT2_DOUBLE;
112
+ x >>= 11;
113
+ // Mirror to avoid loss of precision
114
+ if(x > 0x10000000000000UL) {
115
+ x = 0x1fffffffffffffUL - x;
116
+ s = -s;
117
+ }
118
+ double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
119
+ // p is in (0, 0.5], 2p is in (0, 1]
120
+ return s * erfcinv(2.0 * p);
121
+ #else
122
+ x++;
123
+ return 0.0;
124
+ #endif
125
+ }
126
+ #undef QUALIFIERS_STATIC
127
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ /*
49
+ Copyright 2010-2011, D. E. Shaw Research.
50
+ All rights reserved.
51
+
52
+ Redistribution and use in source and binary forms, with or without
53
+ modification, are permitted provided that the following conditions are
54
+ met:
55
+
56
+ * Redistributions of source code must retain the above copyright
57
+ notice, this list of conditions, and the following disclaimer.
58
+
59
+ * Redistributions in binary form must reproduce the above copyright
60
+ notice, this list of conditions, and the following disclaimer in the
61
+ documentation and/or other materials provided with the distribution.
62
+
63
+ * Neither the name of D. E. Shaw Research nor the names of its
64
+ contributors may be used to endorse or promote products derived from
65
+ this software without specific prior written permission.
66
+
67
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
68
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
69
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
70
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
71
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
72
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
73
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
74
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
75
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
76
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
77
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
78
+ */
79
+
80
+ #ifndef CURAND_PHILOX4X32_X__H_
81
+ #define CURAND_PHILOX4X32_X__H_
82
+
83
+ #if !defined(QUALIFIERS)
84
+ #define QUALIFIERS static __forceinline__ __device__
85
+ #endif
86
+
87
+ #define PHILOX_W32_0 (0x9E3779B9)
88
+ #define PHILOX_W32_1 (0xBB67AE85)
89
+ #define PHILOX_M4x32_0 (0xD2511F53)
90
+ #define PHILOX_M4x32_1 (0xCD9E8D57)
91
+
92
+ struct curandStatePhilox4_32_10 {
93
+ uint4 ctr;
94
+ uint4 output;
95
+ uint2 key;
96
+ unsigned int STATE;
97
+ int boxmuller_flag;
98
+ int boxmuller_flag_double;
99
+ float boxmuller_extra;
100
+ double boxmuller_extra_double;
101
+ };
102
+
103
+ typedef struct curandStatePhilox4_32_10 curandStatePhilox4_32_10_t;
104
+
105
+
106
+ QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s, unsigned long long n)
107
+ {
108
+ unsigned int nlo = (unsigned int)(n);
109
+ unsigned int nhi = (unsigned int)(n>>32);
110
+
111
+ s->ctr.x += nlo;
112
+ if( s->ctr.x < nlo )
113
+ nhi++;
114
+
115
+ s->ctr.y += nhi;
116
+ if(nhi <= s->ctr.y)
117
+ return;
118
+ if(++s->ctr.z) return;
119
+ ++s->ctr.w;
120
+ }
121
+
122
+ QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_t* s, unsigned long long n)
123
+ {
124
+ unsigned int nlo = (unsigned int)(n);
125
+ unsigned int nhi = (unsigned int)(n>>32);
126
+
127
+ s->ctr.z += nlo;
128
+ if( s->ctr.z < nlo )
129
+ nhi++;
130
+
131
+ s->ctr.w += nhi;
132
+ }
133
+
134
+
135
+
136
+ QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s)
137
+ {
138
+ if(++s->ctr.x) return;
139
+ if(++s->ctr.y) return;
140
+ if(++s->ctr.z) return;
141
+ ++s->ctr.w;
142
+ }
143
+
144
+
145
+ QUALIFIERS unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* hip)
146
+ {
147
+ #ifndef __CUDA_ARCH__
148
+ // host code
149
+ unsigned long long product = ((unsigned long long)a) * ((unsigned long long)b);
150
+ *hip = product >> 32;
151
+ return (unsigned int)product;
152
+ #else
153
+ // device code
154
+ *hip = __umulhi(a,b);
155
+ return a*b;
156
+ #endif
157
+ }
158
+
159
+ QUALIFIERS uint4 _philox4x32round(uint4 ctr, uint2 key)
160
+ {
161
+ unsigned int hi0;
162
+ unsigned int hi1;
163
+ unsigned int lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);
164
+ unsigned int lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);
165
+
166
+ uint4 ret = {hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0};
167
+ return ret;
168
+ }
169
+
170
+ QUALIFIERS uint4 curand_Philox4x32_10( uint4 c, uint2 k)
171
+ {
172
+ c = _philox4x32round(c, k); // 1
173
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
174
+ c = _philox4x32round(c, k); // 2
175
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
176
+ c = _philox4x32round(c, k); // 3
177
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
178
+ c = _philox4x32round(c, k); // 4
179
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
180
+ c = _philox4x32round(c, k); // 5
181
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
182
+ c = _philox4x32round(c, k); // 6
183
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
184
+ c = _philox4x32round(c, k); // 7
185
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
186
+ c = _philox4x32round(c, k); // 8
187
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
188
+ c = _philox4x32round(c, k); // 9
189
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
190
+ return _philox4x32round(c, k); // 10
191
+ }
192
+
193
+
194
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_CUDA_H_
39
+ #define NVTOOLSEXT_CUDA_H_
40
+
41
+ #include "cuda.h"
42
+
43
+ #include "nvToolsExt.h"
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for CUDA Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate CUDA resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_CUDA 4
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for CUDA
71
+ */
72
+ typedef enum nvtxResourceCUDAType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
75
+ NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
76
+ NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
77
+ NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4) /* CUevent */
78
+ } nvtxResourceCUDAType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The handle of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA context.
97
+ *
98
+ * Allows the user to associate a CUDA context with a user-provided name.
99
+ *
100
+ * \param context - The handle of the CUDA context to name.
101
+ * \param name - The name of the CUDA context.
102
+ *
103
+ * \par Example:
104
+ * \code
105
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
106
+ * if ( CUDA_SUCCESS != status )
107
+ * goto Error;
108
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
109
+ * \endcode
110
+ *
111
+ * \version \NVTX_VERSION_1
112
+ * @{ */
113
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
114
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
115
+ /** @} */
116
+
117
+ /* ------------------------------------------------------------------------- */
118
+ /** \brief Annotates a CUDA stream.
119
+ *
120
+ * Allows the user to associate a CUDA stream with a user-provided name.
121
+ *
122
+ * \param stream - The handle of the CUDA stream to name.
123
+ * \param name - The name of the CUDA stream.
124
+ *
125
+ * \version \NVTX_VERSION_1
126
+ * @{ */
127
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
128
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
129
+ /** @} */
130
+
131
+ /* ------------------------------------------------------------------------- */
132
+ /** \brief Annotates a CUDA event.
133
+ *
134
+ * Allows the user to associate a CUDA event with a user-provided name.
135
+ *
136
+ * \param event - The handle of the CUDA event to name.
137
+ * \param name - The name of the CUDA event.
138
+ *
139
+ * \version \NVTX_VERSION_1
140
+ * @{ */
141
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
142
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
143
+ /** @} */
144
+
145
+ /** @} */ /* END RESOURCE_NAMING */
146
+
147
+ /* ========================================================================= */
148
+ #ifdef UNICODE
149
+ #define nvtxNameCuDevice nvtxNameCuDeviceW
150
+ #define nvtxNameCuContext nvtxNameCuContextW
151
+ #define nvtxNameCuStream nvtxNameCuStreamW
152
+ #define nvtxNameCuEvent nvtxNameCuEventW
153
+ #else
154
+ #define nvtxNameCuDevice nvtxNameCuDeviceA
155
+ #define nvtxNameCuContext nvtxNameCuContextA
156
+ #define nvtxNameCuStream nvtxNameCuStreamA
157
+ #define nvtxNameCuEvent nvtxNameCuEventA
158
+ #endif
159
+
160
+ #ifdef __cplusplus
161
+ }
162
+ #endif /* __cplusplus */
163
+
164
+ #endif /* NVTOOLSEXT_CUDA_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_OPENCL_H_
39
+ #define NVTOOLSEXT_OPENCL_H_
40
+
41
+ #include <CL/cl.h>
42
+
43
+ #include "nvToolsExt.h"
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for OpenCL Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate OpenCL resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_OPENCL 6
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for OpenCL
71
+ */
72
+ typedef enum nvtxResourceOpenCLType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
75
+ NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
76
+ NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
77
+ NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
78
+ NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
79
+ NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
80
+ NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7)
81
+ } nvtxResourceOpenCLType_t;
82
+
83
+
84
+ /* ------------------------------------------------------------------------- */
85
+ /** \brief Annotates an OpenCL device.
86
+ *
87
+ * Allows to associate an OpenCL device with a user-provided name.
88
+ *
89
+ * \param device - The handle of the OpenCL device to name.
90
+ * \param name - The name of the OpenCL device.
91
+ *
92
+ * \version \NVTX_VERSION_1
93
+ * @{ */
94
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
95
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
96
+ /** @} */
97
+
98
+ /* ------------------------------------------------------------------------- */
99
+ /** \brief Annotates an OpenCL context.
100
+ *
101
+ * Allows to associate an OpenCL context with a user-provided name.
102
+ *
103
+ * \param context - The handle of the OpenCL context to name.
104
+ * \param name - The name of the OpenCL context.
105
+ *
106
+ * \version \NVTX_VERSION_1
107
+ * @{ */
108
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
109
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
110
+ /** @} */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /** \brief Annotates an OpenCL command queue.
114
+ *
115
+ * Allows to associate an OpenCL command queue with a user-provided name.
116
+ *
117
+ * \param command_queue - The handle of the OpenCL command queue to name.
118
+ * \param name - The name of the OpenCL command queue.
119
+ *
120
+ * \version \NVTX_VERSION_1
121
+ * @{ */
122
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
123
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
124
+ /** @} */
125
+
126
+ /* ------------------------------------------------------------------------- */
127
+ /** \brief Annotates an OpenCL memory object.
128
+ *
129
+ * Allows to associate an OpenCL memory object with a user-provided name.
130
+ *
131
+ * \param memobj - The handle of the OpenCL memory object to name.
132
+ * \param name - The name of the OpenCL memory object.
133
+ *
134
+ * \version \NVTX_VERSION_1
135
+ * @{ */
136
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
137
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
138
+ /** @} */
139
+
140
+ /* ------------------------------------------------------------------------- */
141
+ /** \brief Annotates an OpenCL sampler.
142
+ *
143
+ * Allows to associate an OpenCL sampler with a user-provided name.
144
+ *
145
+ * \param sampler - The handle of the OpenCL sampler to name.
146
+ * \param name - The name of the OpenCL sampler.
147
+ *
148
+ * \version \NVTX_VERSION_1
149
+ * @{ */
150
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
151
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
152
+ /** @} */
153
+
154
+ /* ------------------------------------------------------------------------- */
155
+ /** \brief Annotates an OpenCL program.
156
+ *
157
+ * Allows to associate an OpenCL program with a user-provided name.
158
+ *
159
+ * \param program - The handle of the OpenCL program to name.
160
+ * \param name - The name of the OpenCL program.
161
+ *
162
+ * \code
163
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
164
+ * (const char **) &cSourceCL, &program_length, &ciErrNum);
165
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
166
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
167
+ * \endcode
168
+ *
169
+ * \version \NVTX_VERSION_1
170
+ * @{ */
171
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
172
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
173
+ /** @} */
174
+
175
+ /* ------------------------------------------------------------------------- */
176
+ /** \brief Annotates an OpenCL event.
177
+ *
178
+ * Allows to associate an OpenCL event with a user-provided name.
179
+ *
180
+ * \param evnt - The handle of the OpenCL event to name.
181
+ * \param name - The name of the OpenCL event.
182
+ *
183
+ * \version \NVTX_VERSION_1
184
+ * @{ */
185
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
186
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
187
+ /** @} */
188
+
189
+ /** @} */ /* END RESOURCE_NAMING */
190
+
191
+ /* ========================================================================= */
192
+ #ifdef UNICODE
193
+ #define nvtxNameClDevice nvtxNameClDeviceW
194
+ #define nvtxNameClContext nvtxNameClContextW
195
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueW
196
+ #define nvtxNameClMemObject nvtxNameClMemObjectW
197
+ #define nvtxNameClSampler nvtxNameClSamplerW
198
+ #define nvtxNameClProgram nvtxNameClProgramW
199
+ #define nvtxNameClEvent nvtxNameClEventW
200
+ #else
201
+ #define nvtxNameClDevice nvtxNameClDeviceA
202
+ #define nvtxNameClContext nvtxNameClContextA
203
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueA
204
+ #define nvtxNameClMemObject nvtxNameClMemObjectA
205
+ #define nvtxNameClSampler nvtxNameClSamplerA
206
+ #define nvtxNameClProgram nvtxNameClProgramA
207
+ #define nvtxNameClEvent nvtxNameClEventA
208
+ #endif
209
+
210
+ #ifdef __cplusplus
211
+ }
212
+ #endif /* __cplusplus */
213
+
214
+ #endif /* NVTOOLSEXT_OPENCL_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #include <CL/cl.h>
41
+
42
+ #ifndef NVTOOLSEXT_OPENCL_V3
43
+ #define NVTOOLSEXT_OPENCL_V3
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for OpenCL Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate OpenCL resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_OPENCL 6
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for OpenCL
71
+ */
72
+ typedef enum nvtxResourceOpenCLType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
75
+ NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
76
+ NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
77
+ NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
78
+ NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
79
+ NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
80
+ NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
81
+ } nvtxResourceOpenCLType_t;
82
+
83
+
84
+ /* ------------------------------------------------------------------------- */
85
+ /** \brief Annotates an OpenCL device.
86
+ *
87
+ * Allows to associate an OpenCL device with a user-provided name.
88
+ *
89
+ * \param device - The handle of the OpenCL device to name.
90
+ * \param name - The name of the OpenCL device.
91
+ *
92
+ * \version \NVTX_VERSION_1
93
+ * @{ */
94
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
95
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
96
+ /** @} */
97
+
98
+ /* ------------------------------------------------------------------------- */
99
+ /** \brief Annotates an OpenCL context.
100
+ *
101
+ * Allows to associate an OpenCL context with a user-provided name.
102
+ *
103
+ * \param context - The handle of the OpenCL context to name.
104
+ * \param name - The name of the OpenCL context.
105
+ *
106
+ * \version \NVTX_VERSION_1
107
+ * @{ */
108
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
109
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
110
+ /** @} */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /** \brief Annotates an OpenCL command queue.
114
+ *
115
+ * Allows to associate an OpenCL command queue with a user-provided name.
116
+ *
117
+ * \param command_queue - The handle of the OpenCL command queue to name.
118
+ * \param name - The name of the OpenCL command queue.
119
+ *
120
+ * \version \NVTX_VERSION_1
121
+ * @{ */
122
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
123
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
124
+ /** @} */
125
+
126
+ /* ------------------------------------------------------------------------- */
127
+ /** \brief Annotates an OpenCL memory object.
128
+ *
129
+ * Allows to associate an OpenCL memory object with a user-provided name.
130
+ *
131
+ * \param memobj - The handle of the OpenCL memory object to name.
132
+ * \param name - The name of the OpenCL memory object.
133
+ *
134
+ * \version \NVTX_VERSION_1
135
+ * @{ */
136
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
137
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
138
+ /** @} */
139
+
140
+ /* ------------------------------------------------------------------------- */
141
+ /** \brief Annotates an OpenCL sampler.
142
+ *
143
+ * Allows to associate an OpenCL sampler with a user-provided name.
144
+ *
145
+ * \param sampler - The handle of the OpenCL sampler to name.
146
+ * \param name - The name of the OpenCL sampler.
147
+ *
148
+ * \version \NVTX_VERSION_1
149
+ * @{ */
150
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
151
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
152
+ /** @} */
153
+
154
+ /* ------------------------------------------------------------------------- */
155
+ /** \brief Annotates an OpenCL program.
156
+ *
157
+ * Allows to associate an OpenCL program with a user-provided name.
158
+ *
159
+ * \param program - The handle of the OpenCL program to name.
160
+ * \param name - The name of the OpenCL program.
161
+ *
162
+ * \code
163
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
164
+ * (const char **) &cSourceCL, &program_length, &ciErrNum);
165
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
166
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
167
+ * \endcode
168
+ *
169
+ * \version \NVTX_VERSION_1
170
+ * @{ */
171
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
172
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
173
+ /** @} */
174
+
175
+ /* ------------------------------------------------------------------------- */
176
+ /** \brief Annotates an OpenCL event.
177
+ *
178
+ * Allows to associate an OpenCL event with a user-provided name.
179
+ *
180
+ * \param evnt - The handle of the OpenCL event to name.
181
+ * \param name - The name of the OpenCL event.
182
+ *
183
+ * \version \NVTX_VERSION_1
184
+ * @{ */
185
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
186
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
187
+ /** @} */
188
+
189
+ /** @} */ /* END RESOURCE_NAMING */
190
+
191
+ /* ========================================================================= */
192
+ #ifdef UNICODE
193
+ #define nvtxNameClDevice nvtxNameClDeviceW
194
+ #define nvtxNameClContext nvtxNameClContextW
195
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueW
196
+ #define nvtxNameClMemObject nvtxNameClMemObjectW
197
+ #define nvtxNameClSampler nvtxNameClSamplerW
198
+ #define nvtxNameClProgram nvtxNameClProgramW
199
+ #define nvtxNameClEvent nvtxNameClEventW
200
+ #else
201
+ #define nvtxNameClDevice nvtxNameClDeviceA
202
+ #define nvtxNameClContext nvtxNameClContextA
203
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueA
204
+ #define nvtxNameClMemObject nvtxNameClMemObjectA
205
+ #define nvtxNameClSampler nvtxNameClSamplerA
206
+ #define nvtxNameClProgram nvtxNameClProgramA
207
+ #define nvtxNameClEvent nvtxNameClEventA
208
+ #endif
209
+
210
+ #ifdef __cplusplus
211
+ }
212
+ #endif /* __cplusplus */
213
+
214
+ #ifndef NVTX_NO_IMPL
215
+ #define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
216
+ #include "nvtxDetail/nvtxImplOpenCL_v3.h"
217
+ #undef NVTX_IMPL_GUARD_OPENCL
218
+ #endif /*NVTX_NO_IMPL*/
219
+
220
+ #endif /* NVTOOLSEXT_OPENCL_V3 */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (74.1.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ pybind11
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .pyximport import *
2
+
3
+ # replicate docstring
4
+ from .pyximport import __doc__
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport2.cpython-311.pyc ADDED
Binary file (28.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyxbuild.cpython-311.pyc ADDED
Binary file (7.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/_pyximport3.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Import hooks; when installed with the install() function, these hooks
3
+ allow importing .pyx files as if they were Python modules.
4
+
5
+ If you want the hook installed every time you run Python
6
+ you can add it to your Python version by adding these lines to
7
+ sitecustomize.py (which you can create from scratch in site-packages
8
+ if it doesn't exist there or somewhere else on your python path)::
9
+
10
+ import pyximport
11
+ pyximport.install()
12
+
13
+ For instance on the Mac with a non-system Python 2.3, you could create
14
+ sitecustomize.py with only those two lines at
15
+ /usr/local/lib/python2.3/site-packages/sitecustomize.py .
16
+
17
+ A custom distutils.core.Extension instance and setup() args
18
+ (Distribution) for the build can be defined by a <modulename>.pyxbld
19
+ file like:
20
+
21
+ # examplemod.pyxbld
22
+ def make_ext(modname, pyxfilename):
23
+ from distutils.extension import Extension
24
+ return Extension(name = modname,
25
+ sources=[pyxfilename, 'hello.c'],
26
+ include_dirs=['/myinclude'] )
27
+ def make_setup_args():
28
+ return dict(script_args=["--compiler=mingw32"])
29
+
30
+ Extra dependencies can be defined by a <modulename>.pyxdep .
31
+ See README.
32
+
33
+ Since Cython 0.11, the :mod:`pyximport` module also has experimental
34
+ compilation support for normal Python modules. This allows you to
35
+ automatically run Cython on every .pyx and .py module that Python
36
+ imports, including parts of the standard library and installed
37
+ packages. Cython will still fail to compile a lot of Python modules,
38
+ in which case the import mechanism will fall back to loading the
39
+ Python source modules instead. The .py import mechanism is installed
40
+ like this::
41
+
42
+ pyximport.install(pyimport = True)
43
+
44
+ Running this module as a top-level script will run a test and then print
45
+ the documentation.
46
+ """
47
+
48
+ import glob
49
+ import importlib
50
+ import os
51
+ import sys
52
+ from importlib.abc import MetaPathFinder
53
+ from importlib.machinery import ExtensionFileLoader, SourceFileLoader
54
+ from importlib.util import spec_from_file_location
55
+
56
+ mod_name = "pyximport"
57
+
58
+ PY_EXT = ".py"
59
+ PYX_EXT = ".pyx"
60
+ PYXDEP_EXT = ".pyxdep"
61
+ PYXBLD_EXT = ".pyxbld"
62
+
63
+ DEBUG_IMPORT = False
64
+
65
+
66
def _print(message, args):
    """Print ``message``, %-formatting it with ``args`` only when ``args`` is non-empty.

    Skipping the formatting step for empty ``args`` lets callers pass
    pre-formatted text that may itself contain ``%`` characters.
    """
    text = message % args if args else message
    print(text)
70
+
71
+
72
def _debug(message, *args):
    """Forward ``message`` and ``args`` to ``_print`` while ``DEBUG_IMPORT`` is truthy; otherwise do nothing."""
    if not DEBUG_IMPORT:
        return
    _print(message, args)
75
+
76
+
77
def _info(message, *args):
    """Unconditionally forward ``message`` and its %-format ``args`` to ``_print``."""
    _print(message, args)
79
+
80
+
81
def load_source(file_path):
    """Execute the Python source file at ``file_path`` and return the module object.

    The module is created under the placeholder name ``"XXXX"``.  An explicit
    ``SourceFileLoader`` is supplied so the file is treated as Python source
    whatever its extension is.
    """
    import importlib.util
    from importlib.machinery import SourceFileLoader

    placeholder_name = "XXXX"
    source_loader = SourceFileLoader(placeholder_name, file_path)
    module_spec = importlib.util.spec_from_file_location(
        placeholder_name, file_path, loader=source_loader)
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
88
+
89
+
90
def get_distutils_extension(modname, pyxfilename, language_level=None):
    """Return ``(extension, setup_args)`` describing how to build ``pyxfilename``.

    ``handle_special_build`` is consulted first; if it yields an extension,
    that result is returned untouched.  Otherwise a default
    ``distutils.extension.Extension`` named ``modname`` with ``pyxfilename``
    as its only source is created, and ``language_level`` (when not ``None``)
    is attached to it as a ``cython_directives`` entry.
    """
    extension_mod, setup_args = handle_special_build(modname, pyxfilename)
    if extension_mod:
        return extension_mod, setup_args

    if not isinstance(pyxfilename, str):
        # Legacy (Py2-era) coercion: distutils wanted exactly 'str', so
        # anything else is encoded with the filesystem encoding.
        pyxfilename = pyxfilename.encode(sys.getfilesystemencoding())

    from distutils.extension import Extension
    extension_mod = Extension(name=modname, sources=[pyxfilename])
    if language_level is not None:
        extension_mod.cython_directives = {'language_level': language_level}
    return extension_mod, setup_args
108
+
109
+
110
def handle_special_build(modname, pyxfilename):
    """Load build customisation from the ``.pyxbld`` file next to ``pyxfilename``.

    The companion file (same path with the extension replaced by
    ``PYXBLD_EXT``) may define either or both of:

    * ``make_ext(modname, pyxfilename)`` -- returns the Extension to build;
    * ``make_setup_args()`` -- returns a dict of extra setup arguments.

    Returns:
        ``(ext, setup_args)``.  ``ext`` is ``None`` when there is no
        ``.pyxbld`` file or it does not define ``make_ext``; ``setup_args``
        is ``{}`` when no ``make_setup_args`` is provided.

    Raises:
        AssertionError: if ``make_ext`` returns something without sources,
            ``make_setup_args`` returns a non-dict, or the file provides
            neither hook.
    """
    special_build = os.path.splitext(pyxfilename)[0] + PYXBLD_EXT
    ext = None
    setup_args = {}
    if os.path.exists(special_build):
        mod = load_source(special_build)
        make_ext = getattr(mod, 'make_ext', None)
        if make_ext:
            ext = make_ext(modname, pyxfilename)
            assert ext and ext.sources, "make_ext in %s did not return Extension" % special_build
        make_setup_args = getattr(mod, 'make_setup_args', None)
        if make_setup_args:
            setup_args = make_setup_args()
            assert isinstance(setup_args, dict), ("make_setup_args in %s did not return a dict"
                                                  % special_build)
        assert ext or setup_args, ("neither make_ext nor make_setup_args %s"
                                   % special_build)
        # A .pyxbld that only supplies make_setup_args leaves ext as None;
        # only rewrite the source paths when an Extension was actually built.
        if ext is not None:
            # Sources are resolved relative to the .pyxbld file's directory.
            build_dir = os.path.dirname(special_build)
            ext.sources = [os.path.join(build_dir, source)
                           for source in ext.sources]
    return ext, setup_args
134
+
135
+
136
def handle_dependencies(pyxfilename):
    """Force a rebuild of ``pyxfilename`` when one of its declared dependencies is newer.

    Dependencies are glob patterns, one per line, listed in the companion
    file with extension ``PYXDEP_EXT``; patterns are resolved relative to
    that file's directory, and the dependency file itself also counts as a
    dependency.  For every dependency newer than the ``.pyx`` file, the
    ``.pyx`` file's timestamps are set to the dependency's mtime so that
    distutils decides to rebuild.

    If a module-level ``_test_files`` list exists, it is reset and then
    filled with the dependencies that triggered a touch (unit-test hook).
    """
    testing = '_test_files' in globals()
    dependfile = os.path.splitext(pyxfilename)[0] + PYXDEP_EXT

    # Without a dependency file, distutils alone decides whether to rebuild.
    if not os.path.exists(dependfile):
        return

    with open(dependfile) as fid:
        patterns = [line.strip() for line in fid.readlines()]

    # Collect every file matched by the patterns; the dependency file
    # itself is always the first entry.
    base_dir = os.path.dirname(dependfile)
    files = [dependfile]
    for pattern in patterns:
        files.extend(glob.glob(os.path.join(base_dir, pattern)))

    # only for unit testing to see we did the right thing
    if testing:
        _test_files[:] = []  #$pycheck_no

    from distutils.dep_util import newer
    for dep in files:
        if not newer(dep, pyxfilename):
            continue
        # 'touch' the pyx file so distutils is tricked into rebuilding it.
        _debug("Rebuilding %s because of %s", pyxfilename, dep)
        filetime = os.path.getmtime(dep)
        os.utime(pyxfilename, (filetime, filetime))
        if testing:
            _test_files.append(dep)
173
+
174
+
175
def build_module(name, pyxfilename, pyxbuild_dir=None, inplace=False, language_level=None):
    """Compile ``pyxfilename`` into an extension module and return the path of the built file.

    Steps: apply ``.pyxdep`` dependency handling, obtain the Extension and
    setup arguments, merge them over the global ``pyxargs`` settings, run
    ``pyxbuild.pyx_to_dll`` and finally delete sibling files matching
    ``<name>_*`` in the output directory (other than the result itself).

    Raises:
        AssertionError: if ``pyxfilename`` or the produced file does not exist.
    """
    assert os.path.exists(pyxfilename), "Path does not exist: %s" % pyxfilename
    handle_dependencies(pyxfilename)

    extension_mod, setup_args = get_distutils_extension(name, pyxfilename, language_level)
    # Per-module setup args override the globally installed ones; a
    # 'build_in_temp' key in them overrides the global flag and is removed.
    sargs = pyxargs.setup_args.copy()
    sargs.update(setup_args)
    build_in_temp = sargs.pop('build_in_temp', pyxargs.build_in_temp)

    from . import pyxbuild
    olddir = os.getcwd()
    common = ''
    if pyxbuild_dir and sys.platform == 'win32':
        # Windows concatenates the pyxbuild_dir to the pyxfilename when
        # compiling, and then complains that the filename is too long
        common = os.path.commonprefix([pyxbuild_dir, pyxfilename])
    if len(common) > 30:
        # Build from the shared prefix using relative paths to shorten them.
        pyxfilename = os.path.relpath(pyxfilename, common)
        pyxbuild_dir = os.path.relpath(pyxbuild_dir, common)
        os.chdir(common)
    try:
        so_path = pyxbuild.pyx_to_dll(
            pyxfilename,
            extension_mod,
            build_in_temp=build_in_temp,
            pyxbuild_dir=pyxbuild_dir,
            setup_args=sargs,
            inplace=inplace,
            reload_support=pyxargs.reload_support,
        )
    finally:
        # Always restore the working directory, even if the build failed.
        os.chdir(olddir)
    so_path = os.path.join(common, so_path)
    assert os.path.exists(so_path), "Cannot find: %s" % so_path

    # NOTE: the original author flags this cleanup as dangerous with
    # inplace builds, since it removes every "<name>_*" sibling file.
    junk_pattern = os.path.join(os.path.dirname(so_path), name + "_*")
    for junk in glob.glob(junk_pattern):
        if junk == so_path:
            continue
        try:
            os.remove(junk)
        except IOError:
            _info("Couldn't remove %s", junk)

    return so_path
218
+
219
+
220
+ # import hooks
221
+
222
class PyxImportMetaFinder(MetaPathFinder):
    """Meta-path finder that resolves imports to files ending in ``extension``.

    Matching files are handed to ``PyxImportLoader`` together with the
    build settings stored on this finder.
    """

    def __init__(self, extension=PYX_EXT, pyxbuild_dir=None, inplace=False, language_level=None):
        self.extension = extension
        self.pyxbuild_dir = pyxbuild_dir
        self.inplace = inplace
        self.language_level = language_level

    def find_spec(self, fullname, path, target=None):
        """Return a spec for ``fullname`` if a matching source file exists, else ``None``.

        Only the last dotted component of ``fullname`` is looked up in each
        ``path`` entry; an empty ``path`` means a top-level import and the
        current working directory is searched instead.
        """
        search_path = path if path else [os.getcwd()]  # top level import --
        name = fullname.rpartition(".")[2]

        for entry in search_path:
            candidate_dir = os.path.join(entry, name)
            if os.path.isdir(candidate_dir):
                # this module has child modules
                filename = os.path.join(candidate_dir, "__init__" + self.extension)
                submodule_locations = [candidate_dir]
            else:
                filename = os.path.join(entry, name + self.extension)
                submodule_locations = None

            if os.path.exists(filename):
                loader = PyxImportLoader(
                    filename, self.pyxbuild_dir, self.inplace, self.language_level)
                return spec_from_file_location(
                    fullname, filename,
                    loader=loader,
                    submodule_search_locations=submodule_locations)

        return None  # we don't know how to import this
254
+
255
+
256
class PyImportMetaFinder(MetaPathFinder):
    """Meta-path finder that routes plain ``.py`` modules through ``PyxImportLoader``.

    ``blocked_modules`` and ``blocked_packages`` name modules this finder
    never handles.  ``blocked_modules`` also serves as a re-entrancy guard:
    the module being searched for is appended for the duration of the
    search and popped again afterwards.
    """

    def __init__(self, extension=PY_EXT, pyxbuild_dir=None, inplace=False, language_level=None):
        self.extension = extension
        self.pyxbuild_dir = pyxbuild_dir
        self.inplace = inplace
        self.language_level = language_level
        self.uncompilable_modules = {}
        self.blocked_modules = ['Cython', 'pyxbuild', 'pyximport.pyxbuild',
                                'distutils', 'cython']
        self.blocked_packages = ['Cython.', 'distutils.']

    def find_spec(self, fullname, path, target=None):
        """Return a spec for ``fullname`` if a matching ``.py`` file exists, else ``None``.

        Already-imported modules, blocked packages and blocked modules are
        skipped (the latter prevents infinite recursion).  The full dotted
        name is used as the file/directory name in each ``path`` entry; an
        empty ``path`` falls back to the current working directory.
        """
        if (
            fullname in sys.modules
            or any(fullname.startswith(pkg) for pkg in self.blocked_packages)
            or fullname in self.blocked_modules  # prevent infinite recursion
        ):
            return None

        self.blocked_modules.append(fullname)
        name = fullname
        if not path:
            path = [os.getcwd()]  # top level import --
        try:
            for entry in path:
                candidate_dir = os.path.join(entry, name)
                if os.path.isdir(candidate_dir):
                    # this module has child modules
                    filename = os.path.join(candidate_dir, "__init__" + self.extension)
                    submodule_locations = [candidate_dir]
                else:
                    filename = os.path.join(entry, name + self.extension)
                    submodule_locations = None

                if os.path.exists(filename):
                    loader = PyxImportLoader(
                        filename, self.pyxbuild_dir, self.inplace, self.language_level)
                    return spec_from_file_location(
                        fullname, filename,
                        loader=loader,
                        submodule_search_locations=submodule_locations)
        finally:
            self.blocked_modules.pop()

        return None  # we don't know how to import this
301
+
302
+
303
class PyxImportLoader(ExtensionFileLoader):
    """Extension loader that first builds the source file via ``build_module``.

    The loader's module name is the base name of ``filename`` without its
    extension; the build settings are kept for use in ``create_module``.
    """

    def __init__(self, filename, pyxbuild_dir, inplace, language_level):
        stem, _ext = os.path.splitext(os.path.basename(filename))
        super().__init__(stem, filename)
        self._pyxbuild_dir = pyxbuild_dir
        self._inplace = inplace
        self._language_level = language_level

    def create_module(self, spec):
        """Build ``spec.origin`` and create the extension module from the result.

        On any failure: if ``pyxargs.load_py_module_on_import_failure`` is
        set and ``spec.origin`` ends with ``PY_EXT``, a plain source module
        is created instead; otherwise an ``ImportError`` carrying the
        original traceback is raised.
        """
        try:
            so_path = build_module(
                spec.name,
                pyxfilename=spec.origin,
                pyxbuild_dir=self._pyxbuild_dir,
                inplace=self._inplace,
                language_level=self._language_level,
            )
            # Point both the loader and the spec at the compiled artefact.
            self.path = so_path
            spec.origin = so_path
            return super().create_module(spec)
        except Exception as failure_exc:
            _debug("Failed to load extension module: %r" % failure_exc)
            fall_back = (pyxargs.load_py_module_on_import_failure
                         and spec.origin.endswith(PY_EXT))
            if not fall_back:
                import traceback
                details = traceback.format_exception_only(type(failure_exc), failure_exc)
                exc = ImportError("Building module %s failed: %s" % (spec.name, details))
                raise exc.with_traceback(failure_exc.__traceback__)

            # Retry as an ordinary Python source module.
            source_loader = SourceFileLoader(spec.name, spec.origin)
            spec = importlib.util.spec_from_file_location(
                spec.name, spec.origin, loader=source_loader)
            mod = importlib.util.module_from_spec(spec)
            allowed_files = (spec.origin, spec.origin + 'c', spec.origin + 'o')
            assert mod.__file__ in allowed_files, (mod.__file__, spec.origin)
            return mod

    def exec_module(self, module):
        """Execute ``module``; any failure is re-raised as ``ImportError``."""
        try:
            return super().exec_module(module)
        except Exception as failure_exc:
            import traceback
            _debug("Failed to load extension module: %r" % failure_exc)
            details = traceback.format_exception_only(type(failure_exc), failure_exc)
            raise ImportError("Executing module %s failed %s" % (module.__file__, details))
342
+
343
+
344
+ #install args
345
class PyxArgs:
    """Container for the settings passed to ``install()``.

    The class attributes below are defaults; ``install()`` creates an
    instance and overwrites them (and adds further attributes) per call.
    """

    build_dir = True
    build_in_temp = True
    setup_args = {}   # None
349
+
350
+
351
def _have_importers():
    """Report which pyximport finders are already present on ``sys.meta_path``.

    Returns:
        ``(has_py_importer, has_pyx_importer)`` -- whether a
        ``PyImportMetaFinder`` / ``PyxImportMetaFinder`` instance is
        installed, respectively.
    """
    has_py_importer = False
    has_pyx_importer = False
    for importer in sys.meta_path:
        # The two finder classes are independent MetaPathFinder subclasses,
        # so each must be tested on its own.  Checking the .py finder first
        # keeps the result correct even if it were a subclass of the .pyx one.
        if isinstance(importer, PyImportMetaFinder):
            has_py_importer = True
        elif isinstance(importer, PyxImportMetaFinder):
            has_pyx_importer = True

    return has_py_importer, has_pyx_importer
362
+
363
+
364
def install(pyximport=True, pyimport=False, build_dir=None, build_in_temp=True,
            setup_args=None, reload_support=False,
            load_py_module_on_import_failure=False, inplace=False,
            language_level=None):
    """Install the pyximport import hooks on ``sys.meta_path``.

    Call this to enable importing ``.pyx`` files for the current Python
    process.  To have it active in every process, call it from your
    ``sitecustomize`` module (see the module docstring).

    :param pyximport: If set to False, does not try to import ``.pyx`` files.

    :param pyimport: Pass ``pyimport=True`` to also install the ``.py``
        import hook.  It is rather experimental, will not work at all for
        some ``.py`` files and packages, and will heavily slow down your
        imports due to search and compilation.  Use at your own risk.

    :param build_dir: Directory for compiled modules.  When not given,
        a ``.pyxbld`` directory in the user's home directory is used.

    :param build_in_temp: If ``False``, will produce the C files locally.
        This makes working with complex dependencies and debugging easier,
        but can interfere with existing files of the same name.

    :param setup_args: Dict of arguments for Distribution.
        See ``distutils.core.setup()``.

    :param reload_support: Enables support for dynamic
        ``reload(my_module)``, e.g. after a change in the Cython code.
        Additional files ``<so_path>.reloadNN`` may arise on that account,
        when the previously loaded module file cannot be overwritten.

    :param load_py_module_on_import_failure: If the compilation of a ``.py``
        file succeeds, but the subsequent import fails for some reason,
        retry the import with the normal ``.py`` module instead of the
        compiled module.  Note that this may lead to unpredictable results
        for modules that change the system state during their import, as
        the second import will rerun these modifications in whatever state
        the system was left after the import of the compiled module failed.

    :param inplace: Install the compiled module
        (``.so`` for Linux and Mac / ``.pyd`` for Windows)
        next to the source file.

    :param language_level: The source language level to use: 2 or 3.
        The default is to use the language level of the current Python
        runtime for .py files and Py2 for ``.pyx`` files.

    :return: ``(py_importer, pyx_importer)`` -- the finders installed by
        this call; an entry is ``None`` when that hook was not requested or
        was already installed.  Pass both to ``uninstall()`` to remove them.
    """
    global pyxargs

    if setup_args is None:
        setup_args = {}
    if not build_dir:
        build_dir = os.path.join(os.path.expanduser('~'), '.pyxbld')

    # Publish the settings globally; build_module() and the loader read them.
    settings = PyxArgs()  #$pycheck_no
    settings.build_dir = build_dir
    settings.build_in_temp = build_in_temp
    settings.setup_args = (setup_args or {}).copy()
    settings.reload_support = reload_support
    settings.load_py_module_on_import_failure = load_py_module_on_import_failure
    pyxargs = settings

    has_py_importer, has_pyx_importer = _have_importers()
    py_importer = None
    pyx_importer = None

    if pyimport and not has_py_importer:
        py_importer = PyImportMetaFinder(pyxbuild_dir=build_dir, inplace=inplace,
                                         language_level=language_level)
        # make sure we import Cython before we install the import hook
        import Cython.Compiler.Main
        import Cython.Compiler.Pipeline
        import Cython.Compiler.Optimize
        # The .py hook goes first so it is consulted before the regular finders.
        sys.meta_path.insert(0, py_importer)

    if pyximport and not has_pyx_importer:
        pyx_importer = PyxImportMetaFinder(pyxbuild_dir=build_dir, inplace=inplace,
                                           language_level=language_level)
        sys.meta_path.append(pyx_importer)

    return py_importer, pyx_importer
446
+
447
+
448
def uninstall(py_importer, pyx_importer):
    """
    Uninstall an import hook.

    Removes each of the two given finders from ``sys.meta_path``; a finder
    that is not currently installed is silently ignored.
    """
    for importer in (py_importer, pyx_importer):
        try:
            sys.meta_path.remove(importer)
        except ValueError:
            # Not installed (or already removed): nothing to do.
            pass
461
+
462
+
463
+ # MAIN
464
+
465
def show_docs():
    """Display ``help()`` for ``__main__`` with it relabelled as ``mod_name``.

    ``__main__.__name__`` and, where assignable, the ``__module__`` of each
    of its attributes are set to ``mod_name`` before ``help`` is called.
    """
    import __main__
    __main__.__name__ = mod_name
    for attr_name in dir(__main__):
        member = getattr(__main__, attr_name)
        try:
            member.__module__ = mod_name
        except (AttributeError, TypeError):
            # Not every object allows its __module__ to be (re)assigned.
            pass
    help(__main__)
475
+
476
+
477
+ if __name__ == '__main__':
478
+ show_docs()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This makes the functions in torch._C._VariableFunctions available as
3
+ torch._VF.<funcname>
4
+ without mypy being able to find them.
5
+
6
+ A subset of those functions are mapped to ATen functions in
7
+ torch/jit/_builtins.py
8
+
9
+ See https://github.com/pytorch/pytorch/issues/21478 for the reason for
10
+ introducing torch._VF
11
+
12
+ """
13
+ import sys
14
+ import types
15
+
16
+ import torch
17
+
18
+
19
class VFModule(types.ModuleType):
    """Module object that forwards unknown attributes to ``torch._C._VariableFunctions``."""

    # Target of the attribute forwarding; set in __init__.
    vf: types.ModuleType

    def __init__(self, name):
        super().__init__(name)
        self.vf = torch._C._VariableFunctions

    def __getattr__(self, attr):
        # Invoked only when normal attribute lookup fails.
        target = self.vf
        return getattr(target, attr)
28
+
29
+
30
+ sys.modules[__name__] = VFModule(__name__)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_classes.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import types
2
+
3
+ import torch._C
4
+
5
+
6
class _ClassNamespace(types.ModuleType):
    """Module-like namespace whose attributes are custom classes looked up by name."""

    def __init__(self, name):
        super().__init__("torch.classes" + name)
        self.name = name

    def __getattr__(self, attr):
        """Return the Python wrapper for class ``attr`` in this namespace.

        Raises:
            RuntimeError: if the lookup yields ``None``.
        """
        proxy = torch._C._get_custom_class_python_wrapper(self.name, attr)
        if proxy is not None:
            return proxy
        raise RuntimeError(f"Class {self.name}.{attr} not registered!")
16
+
17
+
18
class _Classes(types.ModuleType):
    """The ``torch.classes`` module object; creates one ``_ClassNamespace`` per accessed name."""

    __file__ = "_classes.py"

    def __init__(self):
        super().__init__("torch.classes")

    def __getattr__(self, name):
        # Only reached when ``name`` is not yet an attribute: build the
        # namespace and cache it so later lookups bypass __getattr__.
        created = _ClassNamespace(name)
        setattr(self, name, created)
        return created

    @property
    def loaded_libraries(self):
        """The ``loaded_libraries`` attribute of ``torch.ops``."""
        return torch.ops.loaded_libraries

    def load_library(self, path):
        """
        Loads a shared library from the given path into the current process.

        The library being loaded may run global initialization code to register
        custom classes with the PyTorch JIT runtime. This allows dynamically
        loading custom classes. For this, you should compile your class
        and the static registration code into a shared library object, and then
        call ``torch.classes.load_library('path/to/libcustom.so')`` to load the
        shared object.

        After the library is loaded, it is added to the
        ``torch.classes.loaded_libraries`` attribute, a set that may be inspected
        for the paths of all libraries loaded using this function.

        Loading is delegated to ``torch.ops.load_library``.

        Args:
            path (str): A path to a shared library to load.
        """
        torch.ops.load_library(path)
52
+
53
+
54
+ # The classes "namespace"
55
+ classes = _Classes()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_deploy.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+
3
+ import torch
4
+ from torch.package import Importer, OrderedImporter, PackageImporter, sys_importer
5
+ from torch.package._package_pickler import create_pickler
6
+ from torch.package._package_unpickler import PackageUnpickler
7
+ from torch.serialization import _maybe_decode_ascii
8
+
9
+
10
def _save_storages(importer, obj):
    """Pickle ``obj``, pulling tensor storages out of the pickle stream.

    Storages encountered while pickling are appended to a side list and
    replaced in the stream by ``("storage", index)`` persistent ids.
    Objects exposing ``__reduce_deploy__`` are replaced by a
    ``("reduce_deploy", id, ...)`` persistent id that is cached per object
    in ``_serialized_reduces``.

    Returns:
        ``(pickle_bytes, storages, dtypes, zip_reader)`` where
        ``zip_reader`` is ``importer.zip_reader`` when ``importer`` is a
        ``PackageImporter`` and ``None`` otherwise.
    """
    serialized_storages = []
    serialized_dtypes = []

    # Anything that is not a PackageImporter is treated as "no importer".
    if not isinstance(importer, torch.package.PackageImporter):
        importer = None
    importers: Importer
    if importer is None:
        importers = sys_importer
    else:
        importers = OrderedImporter(importer, sys_importer)

    def persistent_id(obj):
        typed = isinstance(obj, torch.storage.TypedStorage)
        if torch.is_storage(obj) or typed:
            # TODO: Once we decide to break serialization FC, we can
            # remove the TypedStorage case
            # Storages that are not TypedStorage are recorded as uint8.
            dtype = obj.dtype if typed else torch.uint8
            serialized_storages.append(obj)
            serialized_dtypes.append(dtype)
            return ("storage", len(serialized_storages) - 1)

        if hasattr(obj, "__reduce_deploy__"):
            key = id(obj)
            if _serialized_reduces.get(key) is None:
                _serialized_reduces[key] = (
                    "reduce_deploy",
                    key,
                    *obj.__reduce_deploy__(importers),
                )
            return _serialized_reduces[key]

        return None

    # Write the pickle data for `obj`
    data_buf = io.BytesIO()
    pickler = create_pickler(data_buf, importers)
    pickler.persistent_id = persistent_id
    pickler.dump(obj)
    return (
        data_buf.getvalue(),
        serialized_storages,
        serialized_dtypes,
        importer.zip_reader if importer else None,
    )
59
+
60
+
61
def _load_storages(id, zip_reader, obj_bytes, serialized_storages, serialized_dtypes):
    """Unpickle ``obj_bytes``, resolving the persistent ids written by ``_save_storages``.

    ``("storage", index)`` ids are resolved against ``serialized_storages``
    and ``serialized_dtypes``; ``("reduce_deploy", ...)`` ids are rebuilt
    once and cached in ``_loaded_reduces``.  The loaded object is stored in
    ``_deploy_objects[id]`` and returned.
    """

    def persistent_load(saved_id):
        assert isinstance(saved_id, tuple)
        typename = _maybe_decode_ascii(saved_id[0])
        data = saved_id[1:]

        if typename == "storage":
            # TODO: Once we decide to break serialization FC, we can
            # stop wrapping with TypedStorage
            index = data[0]
            storage = serialized_storages[index]
            dtype = serialized_dtypes[index]
            return torch.storage.TypedStorage(
                wrap_storage=storage.untyped(), dtype=dtype
            )

        if typename == "reduce_deploy":
            reduce_id, func, args = data
            if reduce_id not in _loaded_reduces:
                _loaded_reduces[reduce_id] = func(_raw_packages[zip_reader], *args)
            return _loaded_reduces[reduce_id]

        return None

    # Use the package importer ahead of the system importer when a zip
    # reader is given; otherwise only the system importer.
    importer: Importer
    if zip_reader is None:
        importer = sys_importer
    else:
        importer = OrderedImporter(_get_package(zip_reader), sys_importer)

    unpickler = PackageUnpickler(importer, io.BytesIO(obj_bytes))
    unpickler.persistent_load = persistent_load  # type: ignore[method-assign]
    loaded = unpickler.load()
    _deploy_objects[id] = loaded
    return loaded
94
+
95
+
96
def _get_package(zip_reader):
    """Return the ``PackageImporter`` for ``zip_reader``, creating and caching it on first use."""
    if zip_reader in _raw_packages:
        return _raw_packages[zip_reader]
    _raw_packages[zip_reader] = PackageImporter(zip_reader)
    return _raw_packages[zip_reader]
100
+
101
+
102
+ _raw_packages: dict = {}
103
+ _deploy_objects: dict = {}
104
+ _serialized_reduces: dict = {}
105
+ _loaded_reduces: dict = {}
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_linalg_utils.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Various linear algebra utility methods for internal use.
2
+
3
+ """
4
+
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+ from torch import Tensor
9
+
10
+
11
def is_sparse(A):
    """Return whether tensor ``A`` uses the sparse COO layout.

    Raises:
        TypeError: if ``A`` is not a ``torch.Tensor``.  Outside of
            TorchScript the message also includes the offending type.
    """
    if not isinstance(A, torch.Tensor):
        error_str = "expected Tensor"
        if not torch.jit.is_scripting():
            error_str += f" but got {type(A)}"
        raise TypeError(error_str)

    return A.layout == torch.sparse_coo
20
+
21
+
22
def get_floating_dtype(A):
    """Return the floating point dtype to use for tensor ``A``.

    ``float16``, ``float32`` and ``float64`` are returned unchanged; every
    other dtype (e.g. the integer types) maps to ``float32``.
    """
    floating_dtypes = (torch.float16, torch.float32, torch.float64)
    if A.dtype not in floating_dtypes:
        return torch.float32
    return A.dtype
31
+
32
+
33
def matmul(A: Optional[Tensor], B: Tensor) -> Tensor:
    """Return the matrix product of ``A`` and ``B``.

    When ``A`` is None, ``B`` itself is returned.  A sparse ``A`` is
    multiplied with ``torch.sparse.mm``, anything else with
    ``torch.matmul``.  ``B`` is always dense.
    """
    if A is None:
        return B
    if not is_sparse(A):
        return torch.matmul(A, B)
    return torch.sparse.mm(A, B)
44
+
45
+
46
def conjugate(A):
    """Return the complex conjugate of tensor ``A``.

    .. note:: A tensor whose dtype is not complex is returned as is.
    """
    return A.conj() if A.is_complex() else A
54
+
55
+
56
def transpose(A):
    """Swap the last two dimensions of ``A`` (a matrix or a batch of matrices)."""
    last = len(A.shape) - 1
    return A.transpose(last, last - 1)
60
+
61
+
62
def transjugate(A):
    """Return the conjugate transpose of a matrix or a batch of matrices."""
    transposed = transpose(A)
    return conjugate(transposed)
65
+
66
+
67
def bform(X: Tensor, A: Optional[Tensor], Y: Tensor) -> Tensor:
    """Return the bilinear form :math:`X^T A Y` of the given matrices."""
    AY = matmul(A, Y)
    return matmul(transpose(X), AY)
70
+
71
+
72
def qform(A: Optional[Tensor], S: Tensor):
    """Return the quadratic form :math:`S^T A S`, i.e. ``bform`` with ``S`` on both sides."""
    return bform(X=S, A=A, Y=S)
75
+
76
+
77
def basis(A):
    """Return an orthogonal basis of the columns of ``A`` (the Q factor of its QR decomposition)."""
    decomposition = torch.linalg.qr(A)
    return decomposition.Q
80
+
81
+
82
def symeig(A: Tensor, largest: Optional[bool] = False) -> Tuple[Tensor, Tensor]:
    """Return the eigenpairs ``(E, Z)`` of ``A`` in the requested order.

    The decomposition is ``torch.linalg.eigh(A, UPLO="U")``.  With
    ``largest`` truthy, eigenvalues and eigenvector columns are reversed
    along the last dimension; ``None`` is treated as ``False``.
    """
    if largest is None:
        largest = False
    E, Z = torch.linalg.eigh(A, UPLO="U")
    # assuming that E is ordered
    if not largest:
        return E, Z
    return torch.flip(E, dims=(-1,)), torch.flip(Z, dims=(-1,))
92
+
93
+
94
+ # These functions were deprecated and removed
95
+ # This nice error message can be removed in version 1.13+
96
def matrix_rank(input, tol=None, symmetric=False, *, out=None) -> Tensor:
    """Stub for the removed function; always raises ``RuntimeError`` naming the replacement."""
    message = (
        "This function was deprecated since version 1.9 and is now removed.\n"
        "Please use the `torch.linalg.matrix_rank` function instead. "
        "The parameter 'symmetric' was renamed in `torch.linalg.matrix_rank()` to 'hermitian'."
    )
    raise RuntimeError(message)
102
+
103
+
104
def solve(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]:
    """Stub for the removed ``torch.solve``; always raises ``RuntimeError`` with migration advice."""
    message = (
        "This function was deprecated since version 1.9 and is now removed. "
        "`torch.solve` is deprecated in favor of `torch.linalg.solve`. "
        "`torch.linalg.solve` has its arguments reversed and does not return the LU factorization.\n\n"
        "To get the LU factorization see `torch.lu`, which can be used with `torch.lu_solve` or `torch.lu_unpack`.\n"
        "X = torch.solve(B, A).solution "
        "should be replaced with:\n"
        "X = torch.linalg.solve(A, B)"
    )
    raise RuntimeError(message)
114
+
115
+
116
def lstsq(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]:
    """Stub for the removed ``torch.lstsq``; always raises ``RuntimeError`` with migration advice."""
    message = (
        "This function was deprecated since version 1.9 and is now removed. "
        "`torch.lstsq` is deprecated in favor of `torch.linalg.lstsq`.\n"
        "`torch.linalg.lstsq` has reversed arguments and does not return the QR decomposition in "
        "the returned tuple (although it returns other information about the problem).\n\n"
        "To get the QR decomposition consider using `torch.linalg.qr`.\n\n"
        "The returned solution in `torch.lstsq` stored the residuals of the solution in the "
        "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, "
        "the residuals are in the field 'residuals' of the returned named tuple.\n\n"
        "The unpacking of the solution, as in\n"
        "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n"
        "should be replaced with:\n"
        "X = torch.linalg.lstsq(A, B).solution"
    )
    raise RuntimeError(message)
131
+
132
+
133
def _symeig(
    input, eigenvectors=False, upper=True, *, out=None
) -> Tuple[Tensor, Tensor]:
    """Stub for the removed ``torch.symeig``; always raises ``RuntimeError`` with migration advice."""
    message = (
        "This function was deprecated since version 1.9 and is now removed. "
        "The default behavior has changed from using the upper triangular portion of the matrix by default "
        "to using the lower triangular portion.\n\n"
        "L, _ = torch.symeig(A, upper=upper) "
        "should be replaced with:\n"
        "L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n\n"
        "and\n\n"
        "L, V = torch.symeig(A, eigenvectors=True) "
        "should be replaced with:\n"
        "L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')"
    )
    raise RuntimeError(message)
148
+
149
+
150
def eig(
    self: Tensor, eigenvectors: bool = False, *, e=None, v=None
) -> Tuple[Tensor, Tensor]:
    """Stub for the removed ``torch.eig``; always raises ``RuntimeError`` with migration advice."""
    message = (
        "This function was deprecated since version 1.9 and is now removed. "
        "`torch.linalg.eig` returns complex tensors of dtype `cfloat` or `cdouble` rather than real tensors "
        "mimicking complex tensors.\n\n"
        "L, _ = torch.eig(A) "
        "should be replaced with:\n"
        "L_complex = torch.linalg.eigvals(A)\n\n"
        "and\n\n"
        "L, V = torch.eig(A, eigenvectors=True) "
        "should be replaced with:\n"
        "L_complex, V_complex = torch.linalg.eig(A)"
    )
    raise RuntimeError(message)