koichi12 committed on
Commit
a034abf
·
verified ·
1 Parent(s): 95fc195

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc +0 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc +0 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py +30 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py +47 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py +16 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc +0 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc +0 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc +0 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc +0 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc +0 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc +0 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py +587 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py +591 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py +38 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc +0 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc +0 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py +8 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h +78 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h +282 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h +100 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp +85 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h +116 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp +316 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc +0 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc +0 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h +540 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h +600 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h +1183 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h +1183 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h +501 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h +322 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc +0 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc +0 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h +87 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h +253 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h +210 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h +385 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h +837 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py +0 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py +0 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc +0 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py +0 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h +469 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc ADDED
Binary file (863 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc ADDED
Binary file (11.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc ADDED
Binary file (2.71 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+
6
class Timeout(TimeoutError):  # noqa: N818
    """Error signalling that a file lock was not obtained within *timeout* seconds."""

    def __init__(self, lock_file: str) -> None:
        super().__init__()
        # Keep the path of the lock file that could not be acquired.
        self._lock_file = lock_file

    def __reduce__(self) -> str | tuple[Any, ...]:
        # Pickle support: the exception is rebuilt from its class and the lock file path.
        constructor_args = (self._lock_file,)
        return self.__class__, constructor_args

    def __str__(self) -> str:
        return f"The file lock '{self._lock_file}' could not be acquired."

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"{class_name}({self.lock_file!r})"

    @property
    def lock_file(self) -> str:
        """:return: Path of the lock file this error refers to."""
        return self._lock_file
26
+
27
+
28
+ __all__ = [
29
+ "Timeout",
30
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from contextlib import suppress
6
+ from errno import EACCES, EEXIST
7
+ from pathlib import Path
8
+
9
+ from ._api import BaseFileLock
10
+ from ._util import ensure_directory_exists, raise_on_not_writable_file
11
+
12
+
13
class SoftFileLock(BaseFileLock):
    """Lock that relies only on whether the lock file exists."""

    def _acquire(self) -> None:
        # These checks run before the open call: an existing read-only file would otherwise
        # be masked by os.open as a plain EEXIST.
        raise_on_not_writable_file(self.lock_file)
        ensure_directory_exists(self.lock_file)
        # O_WRONLY: open for writing only.
        # O_CREAT with O_EXCL: raise EEXIST if the file already exists.
        # O_TRUNC: truncate the file to zero bytes.
        open_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | os.O_TRUNC
        try:
            descriptor = os.open(self.lock_file, open_flags, self._context.mode)
        except OSError as exception:
            already_locked = exception.errno == EEXIST
            denied_on_windows = exception.errno == EACCES and sys.platform == "win32"
            # Anything other than the two expected failures is propagated to the caller.
            if not (already_locked or denied_on_windows):  # pragma: win32 no cover
                raise
            return
        # The open succeeded: record the descriptor on the lock context.
        self._context.lock_file_fd = descriptor

    def _release(self) -> None:
        descriptor = self._context.lock_file_fd
        assert descriptor is not None  # noqa: S101
        os.close(descriptor)
        self._context.lock_file_fd = None
        # An OSError here means the file is already deleted, which is the desired outcome.
        with suppress(OSError):
            Path(self.lock_file).unlink()
43
+
44
+
45
+ __all__ = [
46
+ "SoftFileLock",
47
+ ]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# file generated by setuptools_scm
# don't change, don't track in version control

# The flag is assigned False here, so at runtime the else-branch below runs and
# VERSION_TUPLE is the plain `object` placeholder; the typing import is not executed.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union
    VERSION_TUPLE = Tuple[Union[int, str], ...]
else:
    VERSION_TUPLE = object

# Annotations for the names assigned at the bottom of the module.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

# The same version is exposed as a string and as a tuple, each under two names.
__version__ = version = '3.13.1'
__version_tuple__ = version_tuple = (3, 13, 1)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.25 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc ADDED
Binary file (32.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (15 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc ADDED
Binary file (2.32 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc ADDED
Binary file (26.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc ADDED
Binary file (27.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from hashlib import md5
2
+ from itertools import product
3
+
4
+ import pytest
5
+
6
+ from fsspec.implementations.local import make_path_posix
7
+ from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
8
+
9
+
10
class AbstractGetTests:
    """Tests for ``fs.get`` that copy from the filesystem under test to a local target.

    Every test receives its collaborators as pytest fixtures: ``fs`` / ``fs_join`` address
    the source filesystem, ``local_fs`` / ``local_join`` / ``local_target`` address the
    local destination. Source paths are therefore always built with ``fs_join`` and
    destination paths with ``local_join``.
    """

    def test_get_file_to_existing_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get single files into an existing directory, with and without trailing slash."""
        # Copy scenario 1a
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)
        assert local_fs.isdir(target)

        target_file2 = local_join(target, "file2")
        target_subfile1 = local_join(target, "subfile1")

        # Copy from source directory
        fs.get(fs_join(source, "file2"), target)
        assert local_fs.isfile(target_file2)

        # Copy from sub directory
        fs.get(fs_join(source, "subdir", "subfile1"), target)
        assert local_fs.isfile(target_subfile1)

        # Remove copied files
        local_fs.rm([target_file2, target_subfile1])
        assert not local_fs.exists(target_file2)
        assert not local_fs.exists(target_subfile1)

        # Repeat with trailing slash on target
        fs.get(fs_join(source, "file2"), target + "/")
        assert local_fs.isdir(target)
        assert local_fs.isfile(target_file2)

        fs.get(fs_join(source, "subdir", "subfile1"), target + "/")
        assert local_fs.isfile(target_subfile1)

    def test_get_file_to_new_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a single file into a not-yet-existing directory named with a trailing slash."""
        # Copy scenario 1b
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        fs.get(
            fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/")
        )  # Note trailing slash

        assert local_fs.isdir(target)
        assert local_fs.isdir(local_join(target, "newdir"))
        assert local_fs.isfile(local_join(target, "newdir", "subfile1"))

    def test_get_file_to_file_in_existing_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a single file to an explicit new file name inside an existing directory."""
        # Copy scenario 1c
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile"))
        assert local_fs.isfile(local_join(target, "newfile"))

    def test_get_file_to_file_in_new_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a single file to an explicit file name inside a directory that does not exist."""
        # Copy scenario 1d
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        fs.get(
            fs_join(source, "subdir", "subfile1"),
            local_join(target, "newdir", "newfile"),
        )
        assert local_fs.isdir(local_join(target, "newdir"))
        assert local_fs.isfile(local_join(target, "newdir", "newfile"))

    def test_get_directory_to_existing_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a directory into an existing directory, plain, recursive and depth-limited."""
        # Copy scenario 1e
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)
        assert local_fs.isdir(target)

        # zip (not product): only the no-slash/no-slash and slash/slash pairs are exercised.
        for source_slash, target_slash in zip([False, True], [False, True]):
            s = fs_join(source, "subdir")
            if source_slash:
                s += "/"
            t = target + "/" if target_slash else target

            # Without recursive does nothing
            fs.get(s, t)
            assert local_fs.ls(target) == []

            # With recursive
            fs.get(s, t, recursive=True)
            if source_slash:
                # Trailing slash on the source: its contents land directly in target.
                assert local_fs.isfile(local_join(target, "subfile1"))
                assert local_fs.isfile(local_join(target, "subfile2"))
                assert local_fs.isdir(local_join(target, "nesteddir"))
                assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
                assert not local_fs.exists(local_join(target, "subdir"))

                local_fs.rm(
                    [
                        local_join(target, "subfile1"),
                        local_join(target, "subfile2"),
                        local_join(target, "nesteddir"),
                    ],
                    recursive=True,
                )
            else:
                # No trailing slash: the directory itself is created inside target.
                assert local_fs.isdir(local_join(target, "subdir"))
                assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
                assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
                assert local_fs.isdir(local_join(target, "subdir", "nesteddir"))
                assert local_fs.isfile(
                    local_join(target, "subdir", "nesteddir", "nestedfile")
                )

                local_fs.rm(local_join(target, "subdir"), recursive=True)
            assert local_fs.ls(target) == []

            # Limit recursive by maxdepth
            fs.get(s, t, recursive=True, maxdepth=1)
            if source_slash:
                assert local_fs.isfile(local_join(target, "subfile1"))
                assert local_fs.isfile(local_join(target, "subfile2"))
                assert not local_fs.exists(local_join(target, "nesteddir"))
                assert not local_fs.exists(local_join(target, "subdir"))

                local_fs.rm(
                    [
                        local_join(target, "subfile1"),
                        local_join(target, "subfile2"),
                    ],
                    recursive=True,
                )
            else:
                assert local_fs.isdir(local_join(target, "subdir"))
                assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
                assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
                assert not local_fs.exists(local_join(target, "subdir", "nesteddir"))

                local_fs.rm(local_join(target, "subdir"), recursive=True)
            assert local_fs.ls(target) == []

    def test_get_directory_to_new_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a directory into a directory that does not exist yet."""
        # Copy scenario 1f
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        # zip (not product): only the no-slash/no-slash and slash/slash pairs are exercised.
        for source_slash, target_slash in zip([False, True], [False, True]):
            s = fs_join(source, "subdir")
            if source_slash:
                s += "/"
            t = local_join(target, "newdir")
            if target_slash:
                t += "/"

            # Without recursive does nothing
            fs.get(s, t)
            assert local_fs.ls(target) == []

            # With recursive
            fs.get(s, t, recursive=True)
            assert local_fs.isdir(local_join(target, "newdir"))
            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
            assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
            assert local_fs.isfile(
                local_join(target, "newdir", "nesteddir", "nestedfile")
            )
            assert not local_fs.exists(local_join(target, "subdir"))

            local_fs.rm(local_join(target, "newdir"), recursive=True)
            assert local_fs.ls(target) == []

            # Limit recursive by maxdepth
            fs.get(s, t, recursive=True, maxdepth=1)
            assert local_fs.isdir(local_join(target, "newdir"))
            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
            assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
            assert not local_fs.exists(local_join(target, "subdir"))

            local_fs.rm(local_join(target, "newdir"), recursive=True)
            assert not local_fs.exists(local_join(target, "newdir"))

    def test_get_glob_to_existing_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a glob pattern into an existing directory."""
        # Copy scenario 1g
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        for target_slash in [False, True]:
            t = target + "/" if target_slash else target

            # Without recursive
            fs.get(fs_join(source, "subdir", "*"), t)
            assert local_fs.isfile(local_join(target, "subfile1"))
            assert local_fs.isfile(local_join(target, "subfile2"))
            assert not local_fs.isdir(local_join(target, "nesteddir"))
            assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile"))
            assert not local_fs.exists(local_join(target, "subdir"))

            local_fs.rm(
                [
                    local_join(target, "subfile1"),
                    local_join(target, "subfile2"),
                ],
                recursive=True,
            )
            assert local_fs.ls(target) == []

            # With recursive
            # Pairs are ("*", recursive=True) and ("**", recursive=False).
            for glob, recursive in zip(["*", "**"], [True, False]):
                fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
                assert local_fs.isfile(local_join(target, "subfile1"))
                assert local_fs.isfile(local_join(target, "subfile2"))
                assert local_fs.isdir(local_join(target, "nesteddir"))
                assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
                assert not local_fs.exists(local_join(target, "subdir"))

                local_fs.rm(
                    [
                        local_join(target, "subfile1"),
                        local_join(target, "subfile2"),
                        local_join(target, "nesteddir"),
                    ],
                    recursive=True,
                )
                assert local_fs.ls(target) == []

                # Limit recursive by maxdepth
                fs.get(
                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
                )
                assert local_fs.isfile(local_join(target, "subfile1"))
                assert local_fs.isfile(local_join(target, "subfile2"))
                assert not local_fs.exists(local_join(target, "nesteddir"))
                assert not local_fs.exists(local_join(target, "subdir"))

                local_fs.rm(
                    [
                        local_join(target, "subfile1"),
                        local_join(target, "subfile2"),
                    ],
                    recursive=True,
                )
                assert local_fs.ls(target) == []

    def test_get_glob_to_new_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a glob pattern into a directory that does not exist yet."""
        # Copy scenario 1h
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        for target_slash in [False, True]:
            # target is a local path, so it is joined with local_join like every other
            # destination in this class (fs_join is the joiner of the source filesystem).
            t = local_join(target, "newdir")
            if target_slash:
                t += "/"

            # Without recursive
            fs.get(fs_join(source, "subdir", "*"), t)
            assert local_fs.isdir(local_join(target, "newdir"))
            assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
            assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
            assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
            assert not local_fs.exists(
                local_join(target, "newdir", "nesteddir", "nestedfile")
            )
            assert not local_fs.exists(local_join(target, "subdir"))
            assert not local_fs.exists(local_join(target, "newdir", "subdir"))

            local_fs.rm(local_join(target, "newdir"), recursive=True)
            assert local_fs.ls(target) == []

            # With recursive
            # Pairs are ("*", recursive=True) and ("**", recursive=False).
            for glob, recursive in zip(["*", "**"], [True, False]):
                fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
                assert local_fs.isdir(local_join(target, "newdir"))
                assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
                assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
                assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
                assert local_fs.isfile(
                    local_join(target, "newdir", "nesteddir", "nestedfile")
                )
                assert not local_fs.exists(local_join(target, "subdir"))
                assert not local_fs.exists(local_join(target, "newdir", "subdir"))

                local_fs.rm(local_join(target, "newdir"), recursive=True)
                assert not local_fs.exists(local_join(target, "newdir"))

                # Limit recursive by maxdepth
                fs.get(
                    fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
                )
                assert local_fs.isdir(local_join(target, "newdir"))
                assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
                assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
                assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
                assert not local_fs.exists(local_join(target, "subdir"))
                assert not local_fs.exists(local_join(target, "newdir", "subdir"))

                local_fs.rm(local_fs.ls(target, detail=False), recursive=True)
                assert not local_fs.exists(local_join(target, "newdir"))

    @pytest.mark.parametrize(
        GLOB_EDGE_CASES_TESTS["argnames"],
        GLOB_EDGE_CASES_TESTS["argvalues"],
    )
    def test_get_glob_edge_cases(
        self,
        path,
        recursive,
        maxdepth,
        expected,
        fs,
        fs_join,
        fs_glob_edge_cases_files,
        local_fs,
        local_join,
        local_target,
    ):
        """Get parametrized glob patterns and compare the files found locally with ``expected``.

        Each case is run four times: into the existing target and into a new ``newdir``
        below it, each with and without a trailing slash on the destination.
        """
        # Copy scenarios 1g and 1h (glob into an existing and into a new directory)
        source = fs_glob_edge_cases_files

        target = local_target

        for new_dir, target_slash in product([True, False], [True, False]):
            local_fs.mkdir(target)

            t = local_join(target, "newdir") if new_dir else target
            t = t + "/" if target_slash else t

            fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)

            output = local_fs.find(target)
            if new_dir:
                prefixed_expected = [
                    make_path_posix(local_join(target, "newdir", p)) for p in expected
                ]
            else:
                prefixed_expected = [
                    make_path_posix(local_join(target, p)) for p in expected
                ]
            assert sorted(output) == sorted(prefixed_expected)

            # Reset for the next combination; a missing target is tolerated.
            try:
                local_fs.rm(target, recursive=True)
            except FileNotFoundError:
                pass

    def test_get_list_of_files_to_existing_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a list of source files into an existing directory; all land flat in target."""
        # Copy scenario 2a
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        source_files = [
            fs_join(source, "file1"),
            fs_join(source, "file2"),
            fs_join(source, "subdir", "subfile1"),
        ]

        for target_slash in [False, True]:
            t = target + "/" if target_slash else target

            fs.get(source_files, t)
            assert local_fs.isfile(local_join(target, "file1"))
            assert local_fs.isfile(local_join(target, "file2"))
            assert local_fs.isfile(local_join(target, "subfile1"))

            local_fs.rm(
                [
                    local_join(target, "file1"),
                    local_join(target, "file2"),
                    local_join(target, "subfile1"),
                ],
                recursive=True,
            )
            assert local_fs.ls(target) == []

    def test_get_list_of_files_to_new_directory(
        self,
        fs,
        fs_join,
        fs_bulk_operations_scenario_0,
        local_fs,
        local_join,
        local_target,
    ):
        """Get a list of source files into a new directory named with a trailing slash."""
        # Copy scenario 2b
        source = fs_bulk_operations_scenario_0

        target = local_target
        local_fs.mkdir(target)

        source_files = [
            fs_join(source, "file1"),
            fs_join(source, "file2"),
            fs_join(source, "subdir", "subfile1"),
        ]

        fs.get(source_files, local_join(target, "newdir") + "/")  # Note trailing slash
        assert local_fs.isdir(local_join(target, "newdir"))
        assert local_fs.isfile(local_join(target, "newdir", "file1"))
        assert local_fs.isfile(local_join(target, "newdir", "file2"))
        assert local_fs.isfile(local_join(target, "newdir", "subfile1"))

    def test_get_directory_recursive(
        self, fs, fs_join, fs_path, local_fs, local_join, local_target
    ):
        """Recursively get a directory twice, with and without a trailing slash on the source."""
        # https://github.com/fsspec/filesystem_spec/issues/1062
        # Recursive cp/get/put of source directory into non-existent target directory.
        src = fs_join(fs_path, "src")
        src_file = fs_join(src, "file")
        fs.mkdir(src)
        fs.touch(src_file)

        target = local_target

        # get without slash
        assert not local_fs.exists(target)
        for loop in range(2):
            fs.get(src, target, recursive=True)
            assert local_fs.isdir(target)

            if loop == 0:
                # First pass: target did not exist, so it receives the contents of src.
                assert local_fs.isfile(local_join(target, "file"))
                assert not local_fs.exists(local_join(target, "src"))
            else:
                # Second pass: target now exists, so src is placed inside it.
                assert local_fs.isfile(local_join(target, "file"))
                assert local_fs.isdir(local_join(target, "src"))
                assert local_fs.isfile(local_join(target, "src", "file"))

        local_fs.rm(target, recursive=True)

        # get with slash
        assert not local_fs.exists(target)
        for loop in range(2):
            fs.get(src + "/", target, recursive=True)
            assert local_fs.isdir(target)
            assert local_fs.isfile(local_join(target, "file"))
            assert not local_fs.exists(local_join(target, "src"))

    def test_get_directory_without_files_with_same_name_prefix(
        self,
        fs,
        fs_join,
        local_fs,
        local_join,
        local_target,
        fs_dir_and_file_with_same_name_prefix,
    ):
        """Getting ``subdir`` must not also fetch the sibling file ``subdir.txt``."""
        # Create the test dirs
        source = fs_dir_and_file_with_same_name_prefix
        target = local_target

        # Test without glob
        fs.get(fs_join(source, "subdir"), target, recursive=True)

        assert local_fs.isfile(local_join(target, "subfile.txt"))
        assert not local_fs.isfile(local_join(target, "subdir.txt"))

        local_fs.rm([local_join(target, "subfile.txt")])
        assert local_fs.ls(target) == []

        # Test with glob
        fs.get(fs_join(source, "subdir*"), target, recursive=True)

        assert local_fs.isdir(local_join(target, "subdir"))
        assert local_fs.isfile(local_join(target, "subdir", "subfile.txt"))
        assert local_fs.isfile(local_join(target, "subdir.txt"))

    def test_get_with_source_and_destination_as_list(
        self,
        fs,
        fs_join,
        local_fs,
        local_join,
        local_target,
        fs_10_files_with_hashed_names,
    ):
        """Get with parallel source and destination lists and check the pairing is kept."""
        # Create the test dir
        source = fs_10_files_with_hashed_names
        target = local_target

        # Create list of files for source and destination
        source_files = []
        destination_files = []
        for i in range(10):
            # File names are the md5 hex digest of the index.
            hashed_i = md5(str(i).encode("utf-8")).hexdigest()
            source_files.append(fs_join(source, f"{hashed_i}.txt"))
            destination_files.append(
                make_path_posix(local_join(target, f"{hashed_i}.txt"))
            )

        # Copy and assert order was kept
        fs.get(rpath=source_files, lpath=destination_files)

        for i in range(10):
            # Each destination file is expected to contain its own index as text.
            file_content = local_fs.cat(destination_files[i]).decode("utf-8")
            assert file_content == str(i)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from hashlib import md5
2
+ from itertools import product
3
+
4
+ import pytest
5
+
6
+ from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
7
+
8
+
9
+ class AbstractPutTests:
10
+ def test_put_file_to_existing_directory(
11
+ self,
12
+ fs,
13
+ fs_join,
14
+ fs_target,
15
+ local_join,
16
+ local_bulk_operations_scenario_0,
17
+ supports_empty_directories,
18
+ ):
19
+ # Copy scenario 1a
20
+ source = local_bulk_operations_scenario_0
21
+
22
+ target = fs_target
23
+ fs.mkdir(target)
24
+ if not supports_empty_directories:
25
+ # Force target directory to exist by adding a dummy file
26
+ fs.touch(fs_join(target, "dummy"))
27
+ assert fs.isdir(target)
28
+
29
+ target_file2 = fs_join(target, "file2")
30
+ target_subfile1 = fs_join(target, "subfile1")
31
+
32
+ # Copy from source directory
33
+ fs.put(local_join(source, "file2"), target)
34
+ assert fs.isfile(target_file2)
35
+
36
+ # Copy from sub directory
37
+ fs.put(local_join(source, "subdir", "subfile1"), target)
38
+ assert fs.isfile(target_subfile1)
39
+
40
+ # Remove copied files
41
+ fs.rm([target_file2, target_subfile1])
42
+ assert not fs.exists(target_file2)
43
+ assert not fs.exists(target_subfile1)
44
+
45
+ # Repeat with trailing slash on target
46
+ fs.put(local_join(source, "file2"), target + "/")
47
+ assert fs.isdir(target)
48
+ assert fs.isfile(target_file2)
49
+
50
+ fs.put(local_join(source, "subdir", "subfile1"), target + "/")
51
+ assert fs.isfile(target_subfile1)
52
+
53
+ def test_put_file_to_new_directory(
54
+ self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
55
+ ):
56
+ # Copy scenario 1b
57
+ source = local_bulk_operations_scenario_0
58
+
59
+ target = fs_target
60
+ fs.mkdir(target)
61
+
62
+ fs.put(
63
+ local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
64
+ ) # Note trailing slash
65
+ assert fs.isdir(target)
66
+ assert fs.isdir(fs_join(target, "newdir"))
67
+ assert fs.isfile(fs_join(target, "newdir", "subfile1"))
68
+
69
+ def test_put_file_to_file_in_existing_directory(
70
+ self,
71
+ fs,
72
+ fs_join,
73
+ fs_target,
74
+ local_join,
75
+ supports_empty_directories,
76
+ local_bulk_operations_scenario_0,
77
+ ):
78
+ # Copy scenario 1c
79
+ source = local_bulk_operations_scenario_0
80
+
81
+ target = fs_target
82
+ fs.mkdir(target)
83
+ if not supports_empty_directories:
84
+ # Force target directory to exist by adding a dummy file
85
+ fs.touch(fs_join(target, "dummy"))
86
+ assert fs.isdir(target)
87
+
88
+ fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
89
+ assert fs.isfile(fs_join(target, "newfile"))
90
+
91
+ def test_put_file_to_file_in_new_directory(
92
+ self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
93
+ ):
94
+ # Copy scenario 1d
95
+ source = local_bulk_operations_scenario_0
96
+
97
+ target = fs_target
98
+ fs.mkdir(target)
99
+
100
+ fs.put(
101
+ local_join(source, "subdir", "subfile1"),
102
+ fs_join(target, "newdir", "newfile"),
103
+ )
104
+ assert fs.isdir(fs_join(target, "newdir"))
105
+ assert fs.isfile(fs_join(target, "newdir", "newfile"))
106
+
107
+ def test_put_directory_to_existing_directory(
108
+ self,
109
+ fs,
110
+ fs_join,
111
+ fs_target,
112
+ local_bulk_operations_scenario_0,
113
+ supports_empty_directories,
114
+ ):
115
+ # Copy scenario 1e
116
+ source = local_bulk_operations_scenario_0
117
+
118
+ target = fs_target
119
+ fs.mkdir(target)
120
+ if not supports_empty_directories:
121
+ # Force target directory to exist by adding a dummy file
122
+ dummy = fs_join(target, "dummy")
123
+ fs.touch(dummy)
124
+ assert fs.isdir(target)
125
+
126
+ for source_slash, target_slash in zip([False, True], [False, True]):
127
+ s = fs_join(source, "subdir")
128
+ if source_slash:
129
+ s += "/"
130
+ t = target + "/" if target_slash else target
131
+
132
+ # Without recursive does nothing
133
+ fs.put(s, t)
134
+ assert fs.ls(target, detail=False) == (
135
+ [] if supports_empty_directories else [dummy]
136
+ )
137
+
138
+ # With recursive
139
+ fs.put(s, t, recursive=True)
140
+ if source_slash:
141
+ assert fs.isfile(fs_join(target, "subfile1"))
142
+ assert fs.isfile(fs_join(target, "subfile2"))
143
+ assert fs.isdir(fs_join(target, "nesteddir"))
144
+ assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
145
+ assert not fs.exists(fs_join(target, "subdir"))
146
+
147
+ fs.rm(
148
+ [
149
+ fs_join(target, "subfile1"),
150
+ fs_join(target, "subfile2"),
151
+ fs_join(target, "nesteddir"),
152
+ ],
153
+ recursive=True,
154
+ )
155
+ else:
156
+ assert fs.isdir(fs_join(target, "subdir"))
157
+ assert fs.isfile(fs_join(target, "subdir", "subfile1"))
158
+ assert fs.isfile(fs_join(target, "subdir", "subfile2"))
159
+ assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
160
+ assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
161
+
162
+ fs.rm(fs_join(target, "subdir"), recursive=True)
163
+ assert fs.ls(target, detail=False) == (
164
+ [] if supports_empty_directories else [dummy]
165
+ )
166
+
167
+ # Limit recursive by maxdepth
168
+ fs.put(s, t, recursive=True, maxdepth=1)
169
+ if source_slash:
170
+ assert fs.isfile(fs_join(target, "subfile1"))
171
+ assert fs.isfile(fs_join(target, "subfile2"))
172
+ assert not fs.exists(fs_join(target, "nesteddir"))
173
+ assert not fs.exists(fs_join(target, "subdir"))
174
+
175
+ fs.rm(
176
+ [
177
+ fs_join(target, "subfile1"),
178
+ fs_join(target, "subfile2"),
179
+ ],
180
+ recursive=True,
181
+ )
182
+ else:
183
+ assert fs.isdir(fs_join(target, "subdir"))
184
+ assert fs.isfile(fs_join(target, "subdir", "subfile1"))
185
+ assert fs.isfile(fs_join(target, "subdir", "subfile2"))
186
+ assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
187
+
188
+ fs.rm(fs_join(target, "subdir"), recursive=True)
189
+ assert fs.ls(target, detail=False) == (
190
+ [] if supports_empty_directories else [dummy]
191
+ )
192
+
193
+ def test_put_directory_to_new_directory(
194
+ self,
195
+ fs,
196
+ fs_join,
197
+ fs_target,
198
+ local_bulk_operations_scenario_0,
199
+ supports_empty_directories,
200
+ ):
201
+ # Copy scenario 1f
202
+ source = local_bulk_operations_scenario_0
203
+
204
+ target = fs_target
205
+ fs.mkdir(target)
206
+
207
+ for source_slash, target_slash in zip([False, True], [False, True]):
208
+ s = fs_join(source, "subdir")
209
+ if source_slash:
210
+ s += "/"
211
+ t = fs_join(target, "newdir")
212
+ if target_slash:
213
+ t += "/"
214
+
215
+ # Without recursive does nothing
216
+ fs.put(s, t)
217
+ if supports_empty_directories:
218
+ assert fs.ls(target) == []
219
+ else:
220
+ with pytest.raises(FileNotFoundError):
221
+ fs.ls(target)
222
+
223
+ # With recursive
224
+ fs.put(s, t, recursive=True)
225
+ assert fs.isdir(fs_join(target, "newdir"))
226
+ assert fs.isfile(fs_join(target, "newdir", "subfile1"))
227
+ assert fs.isfile(fs_join(target, "newdir", "subfile2"))
228
+ assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
229
+ assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
230
+ assert not fs.exists(fs_join(target, "subdir"))
231
+
232
+ fs.rm(fs_join(target, "newdir"), recursive=True)
233
+ assert not fs.exists(fs_join(target, "newdir"))
234
+
235
+ # Limit recursive by maxdepth
236
+ fs.put(s, t, recursive=True, maxdepth=1)
237
+ assert fs.isdir(fs_join(target, "newdir"))
238
+ assert fs.isfile(fs_join(target, "newdir", "subfile1"))
239
+ assert fs.isfile(fs_join(target, "newdir", "subfile2"))
240
+ assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
241
+ assert not fs.exists(fs_join(target, "subdir"))
242
+
243
+ fs.rm(fs_join(target, "newdir"), recursive=True)
244
+ assert not fs.exists(fs_join(target, "newdir"))
245
+
246
+ def test_put_glob_to_existing_directory(
247
+ self,
248
+ fs,
249
+ fs_join,
250
+ fs_target,
251
+ local_join,
252
+ supports_empty_directories,
253
+ local_bulk_operations_scenario_0,
254
+ ):
255
+ # Copy scenario 1g
256
+ source = local_bulk_operations_scenario_0
257
+
258
+ target = fs_target
259
+ fs.mkdir(target)
260
+ if not supports_empty_directories:
261
+ # Force target directory to exist by adding a dummy file
262
+ dummy = fs_join(target, "dummy")
263
+ fs.touch(dummy)
264
+ assert fs.isdir(target)
265
+
266
+ for target_slash in [False, True]:
267
+ t = target + "/" if target_slash else target
268
+
269
+ # Without recursive
270
+ fs.put(local_join(source, "subdir", "*"), t)
271
+ assert fs.isfile(fs_join(target, "subfile1"))
272
+ assert fs.isfile(fs_join(target, "subfile2"))
273
+ assert not fs.isdir(fs_join(target, "nesteddir"))
274
+ assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
275
+ assert not fs.exists(fs_join(target, "subdir"))
276
+
277
+ fs.rm(
278
+ [
279
+ fs_join(target, "subfile1"),
280
+ fs_join(target, "subfile2"),
281
+ ],
282
+ recursive=True,
283
+ )
284
+ assert fs.ls(target, detail=False) == (
285
+ [] if supports_empty_directories else [dummy]
286
+ )
287
+
288
+ # With recursive
289
+ for glob, recursive in zip(["*", "**"], [True, False]):
290
+ fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
291
+ assert fs.isfile(fs_join(target, "subfile1"))
292
+ assert fs.isfile(fs_join(target, "subfile2"))
293
+ assert fs.isdir(fs_join(target, "nesteddir"))
294
+ assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
295
+ assert not fs.exists(fs_join(target, "subdir"))
296
+
297
+ fs.rm(
298
+ [
299
+ fs_join(target, "subfile1"),
300
+ fs_join(target, "subfile2"),
301
+ fs_join(target, "nesteddir"),
302
+ ],
303
+ recursive=True,
304
+ )
305
+ assert fs.ls(target, detail=False) == (
306
+ [] if supports_empty_directories else [dummy]
307
+ )
308
+
309
+ # Limit recursive by maxdepth
310
+ fs.put(
311
+ local_join(source, "subdir", glob),
312
+ t,
313
+ recursive=recursive,
314
+ maxdepth=1,
315
+ )
316
+ assert fs.isfile(fs_join(target, "subfile1"))
317
+ assert fs.isfile(fs_join(target, "subfile2"))
318
+ assert not fs.exists(fs_join(target, "nesteddir"))
319
+ assert not fs.exists(fs_join(target, "subdir"))
320
+
321
+ fs.rm(
322
+ [
323
+ fs_join(target, "subfile1"),
324
+ fs_join(target, "subfile2"),
325
+ ],
326
+ recursive=True,
327
+ )
328
+ assert fs.ls(target, detail=False) == (
329
+ [] if supports_empty_directories else [dummy]
330
+ )
331
+
332
+ def test_put_glob_to_new_directory(
333
+ self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
334
+ ):
335
+ # Copy scenario 1h
336
+ source = local_bulk_operations_scenario_0
337
+
338
+ target = fs_target
339
+ fs.mkdir(target)
340
+
341
+ for target_slash in [False, True]:
342
+ t = fs_join(target, "newdir")
343
+ if target_slash:
344
+ t += "/"
345
+
346
+ # Without recursive
347
+ fs.put(local_join(source, "subdir", "*"), t)
348
+ assert fs.isdir(fs_join(target, "newdir"))
349
+ assert fs.isfile(fs_join(target, "newdir", "subfile1"))
350
+ assert fs.isfile(fs_join(target, "newdir", "subfile2"))
351
+ assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
352
+ assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
353
+ assert not fs.exists(fs_join(target, "subdir"))
354
+ assert not fs.exists(fs_join(target, "newdir", "subdir"))
355
+
356
+ fs.rm(fs_join(target, "newdir"), recursive=True)
357
+ assert not fs.exists(fs_join(target, "newdir"))
358
+
359
+ # With recursive
360
+ for glob, recursive in zip(["*", "**"], [True, False]):
361
+ fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
362
+ assert fs.isdir(fs_join(target, "newdir"))
363
+ assert fs.isfile(fs_join(target, "newdir", "subfile1"))
364
+ assert fs.isfile(fs_join(target, "newdir", "subfile2"))
365
+ assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
366
+ assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
367
+ assert not fs.exists(fs_join(target, "subdir"))
368
+ assert not fs.exists(fs_join(target, "newdir", "subdir"))
369
+
370
+ fs.rm(fs_join(target, "newdir"), recursive=True)
371
+ assert not fs.exists(fs_join(target, "newdir"))
372
+
373
+ # Limit recursive by maxdepth
374
+ fs.put(
375
+ local_join(source, "subdir", glob),
376
+ t,
377
+ recursive=recursive,
378
+ maxdepth=1,
379
+ )
380
+ assert fs.isdir(fs_join(target, "newdir"))
381
+ assert fs.isfile(fs_join(target, "newdir", "subfile1"))
382
+ assert fs.isfile(fs_join(target, "newdir", "subfile2"))
383
+ assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
384
+ assert not fs.exists(fs_join(target, "subdir"))
385
+ assert not fs.exists(fs_join(target, "newdir", "subdir"))
386
+
387
+ fs.rm(fs_join(target, "newdir"), recursive=True)
388
+ assert not fs.exists(fs_join(target, "newdir"))
389
+
390
+ @pytest.mark.parametrize(
391
+ GLOB_EDGE_CASES_TESTS["argnames"],
392
+ GLOB_EDGE_CASES_TESTS["argvalues"],
393
+ )
394
+ def test_put_glob_edge_cases(
395
+ self,
396
+ path,
397
+ recursive,
398
+ maxdepth,
399
+ expected,
400
+ fs,
401
+ fs_join,
402
+ fs_target,
403
+ local_glob_edge_cases_files,
404
+ local_join,
405
+ fs_sanitize_path,
406
+ ):
407
+ # Copy scenario 1g
408
+ source = local_glob_edge_cases_files
409
+
410
+ target = fs_target
411
+
412
+ for new_dir, target_slash in product([True, False], [True, False]):
413
+ fs.mkdir(target)
414
+
415
+ t = fs_join(target, "newdir") if new_dir else target
416
+ t = t + "/" if target_slash else t
417
+
418
+ fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
419
+
420
+ output = fs.find(target)
421
+ if new_dir:
422
+ prefixed_expected = [
423
+ fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
424
+ ]
425
+ else:
426
+ prefixed_expected = [
427
+ fs_sanitize_path(fs_join(target, p)) for p in expected
428
+ ]
429
+ assert sorted(output) == sorted(prefixed_expected)
430
+
431
+ try:
432
+ fs.rm(target, recursive=True)
433
+ except FileNotFoundError:
434
+ pass
435
+
436
+ def test_put_list_of_files_to_existing_directory(
437
+ self,
438
+ fs,
439
+ fs_join,
440
+ fs_target,
441
+ local_join,
442
+ local_bulk_operations_scenario_0,
443
+ supports_empty_directories,
444
+ ):
445
+ # Copy scenario 2a
446
+ source = local_bulk_operations_scenario_0
447
+
448
+ target = fs_target
449
+ fs.mkdir(target)
450
+ if not supports_empty_directories:
451
+ # Force target directory to exist by adding a dummy file
452
+ dummy = fs_join(target, "dummy")
453
+ fs.touch(dummy)
454
+ assert fs.isdir(target)
455
+
456
+ source_files = [
457
+ local_join(source, "file1"),
458
+ local_join(source, "file2"),
459
+ local_join(source, "subdir", "subfile1"),
460
+ ]
461
+
462
+ for target_slash in [False, True]:
463
+ t = target + "/" if target_slash else target
464
+
465
+ fs.put(source_files, t)
466
+ assert fs.isfile(fs_join(target, "file1"))
467
+ assert fs.isfile(fs_join(target, "file2"))
468
+ assert fs.isfile(fs_join(target, "subfile1"))
469
+
470
+ fs.rm(
471
+ [
472
+ fs_join(target, "file1"),
473
+ fs_join(target, "file2"),
474
+ fs_join(target, "subfile1"),
475
+ ],
476
+ recursive=True,
477
+ )
478
+ assert fs.ls(target, detail=False) == (
479
+ [] if supports_empty_directories else [dummy]
480
+ )
481
+
482
+ def test_put_list_of_files_to_new_directory(
483
+ self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
484
+ ):
485
+ # Copy scenario 2b
486
+ source = local_bulk_operations_scenario_0
487
+
488
+ target = fs_target
489
+ fs.mkdir(target)
490
+
491
+ source_files = [
492
+ local_join(source, "file1"),
493
+ local_join(source, "file2"),
494
+ local_join(source, "subdir", "subfile1"),
495
+ ]
496
+
497
+ fs.put(source_files, fs_join(target, "newdir") + "/") # Note trailing slash
498
+ assert fs.isdir(fs_join(target, "newdir"))
499
+ assert fs.isfile(fs_join(target, "newdir", "file1"))
500
+ assert fs.isfile(fs_join(target, "newdir", "file2"))
501
+ assert fs.isfile(fs_join(target, "newdir", "subfile1"))
502
+
503
+ def test_put_directory_recursive(
504
+ self, fs, fs_join, fs_target, local_fs, local_join, local_path
505
+ ):
506
+ # https://github.com/fsspec/filesystem_spec/issues/1062
507
+ # Recursive cp/get/put of source directory into non-existent target directory.
508
+ src = local_join(local_path, "src")
509
+ src_file = local_join(src, "file")
510
+ local_fs.mkdir(src)
511
+ local_fs.touch(src_file)
512
+
513
+ target = fs_target
514
+
515
+ # put without slash
516
+ assert not fs.exists(target)
517
+ for loop in range(2):
518
+ fs.put(src, target, recursive=True)
519
+ assert fs.isdir(target)
520
+
521
+ if loop == 0:
522
+ assert fs.isfile(fs_join(target, "file"))
523
+ assert not fs.exists(fs_join(target, "src"))
524
+ else:
525
+ assert fs.isfile(fs_join(target, "file"))
526
+ assert fs.isdir(fs_join(target, "src"))
527
+ assert fs.isfile(fs_join(target, "src", "file"))
528
+
529
+ fs.rm(target, recursive=True)
530
+
531
+ # put with slash
532
+ assert not fs.exists(target)
533
+ for loop in range(2):
534
+ fs.put(src + "/", target, recursive=True)
535
+ assert fs.isdir(target)
536
+ assert fs.isfile(fs_join(target, "file"))
537
+ assert not fs.exists(fs_join(target, "src"))
538
+
539
+ def test_put_directory_without_files_with_same_name_prefix(
540
+ self,
541
+ fs,
542
+ fs_join,
543
+ fs_target,
544
+ local_join,
545
+ local_dir_and_file_with_same_name_prefix,
546
+ supports_empty_directories,
547
+ ):
548
+ # Create the test dirs
549
+ source = local_dir_and_file_with_same_name_prefix
550
+ target = fs_target
551
+
552
+ # Test without glob
553
+ fs.put(local_join(source, "subdir"), fs_target, recursive=True)
554
+
555
+ assert fs.isfile(fs_join(fs_target, "subfile.txt"))
556
+ assert not fs.isfile(fs_join(fs_target, "subdir.txt"))
557
+
558
+ fs.rm([fs_join(target, "subfile.txt")])
559
+ if supports_empty_directories:
560
+ assert fs.ls(target) == []
561
+ else:
562
+ assert not fs.exists(target)
563
+
564
+ # Test with glob
565
+ fs.put(local_join(source, "subdir*"), fs_target, recursive=True)
566
+
567
+ assert fs.isdir(fs_join(fs_target, "subdir"))
568
+ assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt"))
569
+ assert fs.isfile(fs_join(fs_target, "subdir.txt"))
570
+
571
+ def test_copy_with_source_and_destination_as_list(
572
+ self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names
573
+ ):
574
+ # Create the test dir
575
+ source = local_10_files_with_hashed_names
576
+ target = fs_target
577
+
578
+ # Create list of files for source and destination
579
+ source_files = []
580
+ destination_files = []
581
+ for i in range(10):
582
+ hashed_i = md5(str(i).encode("utf-8")).hexdigest()
583
+ source_files.append(local_join(source, f"{hashed_i}.txt"))
584
+ destination_files.append(fs_join(target, f"{hashed_i}.txt"))
585
+
586
+ # Copy and assert order was kept
587
+ fs.put(lpath=source_files, rpath=destination_files)
588
+
589
+ for i in range(10):
590
+ file_content = fs.cat(destination_files[i]).decode("utf-8")
591
+ assert file_content == str(i)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ import torch
7
+
8
+ from torch._functorch.deprecated import (
9
+ combine_state_for_ensemble,
10
+ functionalize,
11
+ grad,
12
+ grad_and_value,
13
+ hessian,
14
+ jacfwd,
15
+ jacrev,
16
+ jvp,
17
+ make_functional,
18
+ make_functional_with_buffers,
19
+ vjp,
20
+ vmap,
21
+ )
22
+
23
+ # utilities. Maybe these should go in their own namespace in the future?
24
+ from torch._functorch.make_functional import (
25
+ FunctionalModule,
26
+ FunctionalModuleWithBuffers,
27
+ )
28
+
29
+ # Top-level APIs. Please think carefully before adding something to the
30
+ # top-level namespace:
31
+ # - private helper functions should go into torch._functorch
32
+ # - very experimental things should go into functorch.experimental
33
+ # - compilation related things should go into functorch.compile
34
+
35
+ # Was never documented
36
+ from torch._functorch.python_key import make_fx
37
+
38
+ __version__ = torch.__version__
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc ADDED
Binary file (534 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc ADDED
Binary file (286 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from torch import cond # noqa: F401
2
+ from torch._higher_order_ops.cond import UnsupportedAliasMutationException # noqa: F401
3
+
4
+ from torch._higher_order_ops.map import ( # noqa: F401
5
+ _stack_pytree,
6
+ _unstack_pytree,
7
+ map,
8
+ )
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAPROFILERTYPEDEFS_H
51
+ #define CUDAPROFILERTYPEDEFS_H
52
+
53
+ #include <cudaProfiler.h>
54
+
55
+ #ifdef __cplusplus
56
+ extern "C" {
57
+ #endif // __cplusplus
58
+
59
+ /*
60
+ * Macros for the latest version for each driver function in cudaProfiler.h
61
+ */
62
+ #define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000
63
+ #define PFN_cuProfilerStart PFN_cuProfilerStart_v4000
64
+ #define PFN_cuProfilerStop PFN_cuProfilerStop_v4000
65
+
66
+
67
+ /**
68
+ * Type definitions for functions defined in cudaProfiler.h
69
+ */
70
+ typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
71
+ typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
72
+ typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
73
+
74
+ #ifdef __cplusplus
75
+ }
76
+ #endif // __cplusplus
77
+
78
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAVDPAU_H
51
+ #define CUDAVDPAU_H
52
+
53
+ #ifdef CUDA_FORCE_API_VERSION
54
+ #error "CUDA_FORCE_API_VERSION is no longer supported."
55
+ #endif
56
+
57
+ #define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2
58
+
59
+ #ifdef __cplusplus
60
+ extern "C" {
61
+ #endif
62
+
63
+ /**
64
+ * \defgroup CUDA_VDPAU VDPAU Interoperability
65
+ * \ingroup CUDA_DRIVER
66
+ *
67
+ * ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver
68
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
69
+ *
70
+ * This section describes the VDPAU interoperability functions of the
71
+ * low-level CUDA driver application programming interface.
72
+ *
73
+ * @{
74
+ */
75
+
76
+ /**
77
+ * \brief Gets the CUDA device associated with a VDPAU device
78
+ *
79
+ * Returns in \p *pDevice the CUDA device associated with a \p vdpDevice, if
80
+ * applicable.
81
+ *
82
+ * \param pDevice - Device associated with vdpDevice
83
+ * \param vdpDevice - A VdpDevice handle
84
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
85
+ *
86
+ * \return
87
+ * ::CUDA_SUCCESS,
88
+ * ::CUDA_ERROR_DEINITIALIZED,
89
+ * ::CUDA_ERROR_NOT_INITIALIZED,
90
+ * ::CUDA_ERROR_INVALID_CONTEXT,
91
+ * ::CUDA_ERROR_INVALID_VALUE
92
+ * \notefnerr
93
+ *
94
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
95
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
96
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
97
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
98
+ * ::cudaVDPAUGetDevice
99
+ */
100
+ CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
101
+
102
+ /**
103
+ * \brief Create a CUDA context for interoperability with VDPAU
104
+ *
105
+ * Creates a new CUDA context, initializes VDPAU interoperability, and
106
+ * associates the CUDA context with the calling thread. It must be called
107
+ * before performing any other VDPAU interoperability operations. It may fail
108
+ * if the needed VDPAU driver facilities are not available. For usage of the
109
+ * \p flags parameter, see ::cuCtxCreate().
110
+ *
111
+ * \param pCtx - Returned CUDA context
112
+ * \param flags - Options for CUDA context creation
113
+ * \param device - Device on which to create the context
114
+ * \param vdpDevice - The VdpDevice to interop with
115
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
116
+ *
117
+ * \return
118
+ * ::CUDA_SUCCESS,
119
+ * ::CUDA_ERROR_DEINITIALIZED,
120
+ * ::CUDA_ERROR_NOT_INITIALIZED,
121
+ * ::CUDA_ERROR_INVALID_CONTEXT,
122
+ * ::CUDA_ERROR_INVALID_VALUE,
123
+ * ::CUDA_ERROR_OUT_OF_MEMORY
124
+ * \notefnerr
125
+ *
126
+ * \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
127
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
128
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
129
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
130
+ * ::cuVDPAUGetDevice
131
+ */
132
+ CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
133
+
134
+ /**
135
+ * \brief Registers a VDPAU VdpVideoSurface object
136
+ *
137
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by
138
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
139
+ * The surface's intended usage is specified using \p flags, as follows:
140
+ *
141
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
142
+ * resource will be used. It is therefore assumed that this resource will be
143
+ * read from and written to by CUDA. This is the default value.
144
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
145
+ * will not write to this resource.
146
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
147
+ * CUDA will not read from this resource and will write over the
148
+ * entire contents of the resource, so none of the data previously
149
+ * stored in the resource will be preserved.
150
+ *
151
+ * The VdpVideoSurface is presented as an array of subresources that may be
152
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
153
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
154
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
155
+ *
156
+ * \htmlonly
157
+ * <table>
158
+ * <tr><th>VdpChromaType </th><th>arrayIndex</th><th>Size </th><th>Format</th><th>Content </th></tr>
159
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
160
+ * <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
161
+ * <tr> <td>2 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Top-field chroma </td></tr>
162
+ * <tr> <td>3 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
163
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
164
+ * <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
165
+ * <tr> <td>2 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Top-field chroma </td></tr>
166
+ * <tr> <td>3 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
167
+ * </table>
168
+ * \endhtmlonly
169
+ *
170
+ * \latexonly
171
+ * \begin{tabular}{|l|l|l|l|l|}
172
+ * \hline
173
+ * VdpChromaType & arrayIndex & Size & Format & Content \\
174
+ * \hline
175
+ * VDP\_CHROMA\_TYPE\_420 & 0 & w x h/2 & R8 & Top-field luma \\
176
+ * & 1 & w x h/2 & R8 & Bottom-field luma \\
177
+ * & 2 & w/2 x h/4 & R8G8 & Top-field chroma \\
178
+ * & 3 & w/2 x h/4 & R8G8 & Bottom-field chroma \\
179
+ * \hline
180
+ * VDP\_CHROMA\_TYPE\_422 & 0 & w x h/2 & R8 & Top-field luma \\
181
+ * & 1 & w x h/2 & R8 & Bottom-field luma \\
182
+ * & 2 & w/2 x h/2 & R8G8 & Top-field chroma \\
183
+ * & 3 & w/2 x h/2 & R8G8 & Bottom-field chroma \\
184
+ * \hline
185
+ * \end{tabular}
186
+ * \endlatexonly
187
+ *
188
+ * \param pCudaResource - Pointer to the returned object handle
189
+ * \param vdpSurface - The VdpVideoSurface to be registered
190
+ * \param flags - Map flags
191
+ *
192
+ * \return
193
+ * ::CUDA_SUCCESS,
194
+ * ::CUDA_ERROR_INVALID_HANDLE,
195
+ * ::CUDA_ERROR_ALREADY_MAPPED,
196
+ * ::CUDA_ERROR_INVALID_CONTEXT
197
+ * \notefnerr
198
+ *
199
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
200
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
201
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
202
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
203
+ * ::cuVDPAUGetDevice,
204
+ * ::cudaGraphicsVDPAURegisterVideoSurface
205
+ */
206
+ CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
207
+
208
+ /**
209
+ * \brief Registers a VDPAU VdpOutputSurface object
210
+ *
211
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by
212
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
213
+ * The surface's intended usage is specified using \p flags, as follows:
214
+ *
215
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
216
+ * resource will be used. It is therefore assumed that this resource will be
217
+ * read from and written to by CUDA. This is the default value.
218
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
219
+ * will not write to this resource.
220
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
221
+ * CUDA will not read from this resource and will write over the
222
+ * entire contents of the resource, so none of the data previously
223
+ * stored in the resource will be preserved.
224
+ *
225
+ * The VdpOutputSurface is presented as an array of subresources that may be
226
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
227
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
228
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
229
+ *
230
+ * \htmlonly
231
+ * <table>
232
+ * <tr><th>VdpRGBAFormat </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content </th></tr>
233
+ * <tr><td>VDP_RGBA_FORMAT_B8G8R8A8 </td><td>0 </td><td>w x h</td><td>ARGB8 </td><td>Entire surface</td></tr>
234
+ * <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0 </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
235
+ * </table>
236
+ * \endhtmlonly
237
+ *
238
+ * \latexonly
239
+ * \begin{tabular}{|l|l|l|l|l|}
240
+ * \hline
241
+ * VdpRGBAFormat & arrayIndex & Size & Format & Content \\
242
+ * \hline
243
+ * VDP\_RGBA\_FORMAT\_B8G8R8A8 & 0 & w x h & ARGB8 & Entire surface \\
244
+ * VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0 & w x h & A2BGR10 & Entire surface \\
245
+ * \hline
246
+ * \end{tabular}
247
+ * \endlatexonly
248
+ *
249
+ * \param pCudaResource - Pointer to the returned object handle
250
+ * \param vdpSurface - The VdpOutputSurface to be registered
251
+ * \param flags - Map flags
252
+ *
253
+ * \return
254
+ * ::CUDA_SUCCESS,
255
+ * ::CUDA_ERROR_INVALID_HANDLE,
256
+ * ::CUDA_ERROR_ALREADY_MAPPED,
257
+ * ::CUDA_ERROR_INVALID_CONTEXT
258
+ * \notefnerr
259
+ *
260
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
261
+ * ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
262
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
263
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
264
+ * ::cuVDPAUGetDevice,
265
+ * ::cudaGraphicsVDPAURegisterOutputSurface
266
+ */
267
+ CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
268
+
269
+ /** @} */ /* END CUDA_VDPAU */
270
+
271
+
272
+ #if defined(__CUDA_API_VERSION_INTERNAL)
273
+ #undef cuVDPAUCtxCreate
274
+
275
+ CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
276
+ #endif /* __CUDA_API_VERSION_INTERNAL */
277
+
278
+ #ifdef __cplusplus
279
+ };
280
+ #endif
281
+
282
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_ATOMIC_FUNCTIONS_H__)
51
+ #define __SM_20_ATOMIC_FUNCTIONS_H__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ #ifndef __CUDA_ARCH__
70
+ #define __DEF_IF_HOST { }
71
+ #else /* !__CUDA_ARCH__ */
72
+ #define __DEF_IF_HOST ;
73
+ #endif /* __CUDA_ARCH__ */
74
+
75
+
76
+ #ifdef __CUDA_ARCH__
77
+ extern "C"
78
+ {
79
+ extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
80
+ }
81
+ #endif /* __CUDA_ARCH__ */
82
+
83
+ /*******************************************************************************
84
+ * *
85
+ * *
86
+ * *
87
+ *******************************************************************************/
88
+
89
+ __SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST
90
+
91
+ #endif /* __cplusplus && __CUDACC__ */
92
+
93
+ #undef __DEF_IF_HOST
94
+ #undef __SM_20_ATOMIC_FUNCTIONS_DECL__
95
+
96
+ #if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
97
+ #include "sm_20_atomic_functions.hpp"
98
+ #endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
99
+
100
+ #endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __SM_20_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ __SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
76
+ {
77
+ return __fAtomicAdd(address, val);
78
+ }
79
+
80
+ #endif /* __cplusplus && __CUDACC__ */
81
+
82
+ #undef __SM_20_ATOMIC_FUNCTIONS_DECL__
83
+
84
+ #endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
85
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
4
+
5
+ *
6
+
7
+ * NOTICE TO LICENSEE:
8
+
9
+ *
10
+
11
+ * This source code and/or documentation ("Licensed Deliverables") are
12
+
13
+ * subject to NVIDIA intellectual property rights under U.S. and
14
+
15
+ * international Copyright laws.
16
+
17
+ *
18
+
19
+ * These Licensed Deliverables contained herein is PROPRIETARY and
20
+
21
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
22
+
23
+ * conditions of a form of NVIDIA software license agreement by and
24
+
25
+ * between NVIDIA and Licensee ("License Agreement") or electronically
26
+
27
+ * accepted by Licensee. Notwithstanding any terms or conditions to
28
+
29
+ * the contrary in the License Agreement, reproduction or disclosure
30
+
31
+ * of the Licensed Deliverables to any third party without the express
32
+
33
+ * written consent of NVIDIA is prohibited.
34
+
35
+ *
36
+
37
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
38
+
39
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
40
+
41
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
42
+
43
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
44
+
45
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
46
+
47
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
48
+
49
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
50
+
51
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
52
+
53
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
54
+
55
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
56
+
57
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
58
+
59
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
60
+
61
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
62
+
63
+ * OF THESE LICENSED DELIVERABLES.
64
+
65
+ *
66
+
67
+ * U.S. Government End Users. These Licensed Deliverables are a
68
+
69
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
70
+
71
+ * 1995), consisting of "commercial computer software" and "commercial
72
+
73
+ * computer software documentation" as such terms are used in 48
74
+
75
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
76
+
77
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
78
+
79
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
80
+
81
+ * U.S. Government End Users acquire the Licensed Deliverables with
82
+
83
+ * only those rights set forth herein.
84
+
85
+ *
86
+
87
+ * Any use of the Licensed Deliverables in individual and commercial
88
+
89
+ * software must include, in the user documentation and internal
90
+
91
+ * comments to the code, the above Disclaimer and U.S. Government End
92
+
93
+ * Users Notice.
94
+
95
+ */
96
+
97
+
98
+
99
+ #if !defined(__SM_35_INTRINSICS_H__)
100
+
101
+ #define __SM_35_INTRINSICS_H__
102
+
103
+
104
+
105
+ /**********************************************************************************
106
+
107
+ * All sm_35 intrinsics are supported by sm_32 so simply include its header file *
108
+
109
+ **********************************************************************************/
110
+
111
+ #include "sm_32_intrinsics.h"
112
+
113
+
114
+
115
+ #endif /* !__SM_35_INTRINSICS_H__ */
116
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__VECTOR_FUNCTIONS_HPP__)
51
+ #define __VECTOR_FUNCTIONS_HPP__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "cuda_runtime_api.h"
60
+
61
+ #if defined(__CUDACC_RTC__)
62
+ #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
63
+ #else /* !__CUDACC_RTC__ */
64
+ #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
65
+ #endif /* __CUDACC_RTC__ */
66
+
67
+ /*******************************************************************************
68
+ * *
69
+ * *
70
+ * *
71
+ *******************************************************************************/
72
+
73
+ __VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
74
+ {
75
+ char1 t; t.x = x; return t;
76
+ }
77
+
78
+ __VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
79
+ {
80
+ uchar1 t; t.x = x; return t;
81
+ }
82
+
83
+ __VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
84
+ {
85
+ char2 t; t.x = x; t.y = y; return t;
86
+ }
87
+
88
+ __VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
89
+ {
90
+ uchar2 t; t.x = x; t.y = y; return t;
91
+ }
92
+
93
+ __VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
94
+ {
95
+ char3 t; t.x = x; t.y = y; t.z = z; return t;
96
+ }
97
+
98
+ __VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
99
+ {
100
+ uchar3 t; t.x = x; t.y = y; t.z = z; return t;
101
+ }
102
+
103
+ __VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
104
+ {
105
+ char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
106
+ }
107
+
108
+ __VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
109
+ {
110
+ uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
111
+ }
112
+
113
+ __VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
114
+ {
115
+ short1 t; t.x = x; return t;
116
+ }
117
+
118
+ __VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
119
+ {
120
+ ushort1 t; t.x = x; return t;
121
+ }
122
+
123
+ __VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
124
+ {
125
+ short2 t; t.x = x; t.y = y; return t;
126
+ }
127
+
128
+ __VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
129
+ {
130
+ ushort2 t; t.x = x; t.y = y; return t;
131
+ }
132
+
133
+ __VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
134
+ {
135
+ short3 t; t.x = x; t.y = y; t.z = z; return t;
136
+ }
137
+
138
+ __VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
139
+ {
140
+ ushort3 t; t.x = x; t.y = y; t.z = z; return t;
141
+ }
142
+
143
+ __VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
144
+ {
145
+ short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
146
+ }
147
+
148
+ __VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
149
+ {
150
+ ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
151
+ }
152
+
153
+ __VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
154
+ {
155
+ int1 t; t.x = x; return t;
156
+ }
157
+
158
+ __VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
159
+ {
160
+ uint1 t; t.x = x; return t;
161
+ }
162
+
163
+ __VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
164
+ {
165
+ int2 t; t.x = x; t.y = y; return t;
166
+ }
167
+
168
+ __VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
169
+ {
170
+ uint2 t; t.x = x; t.y = y; return t;
171
+ }
172
+
173
+ __VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
174
+ {
175
+ int3 t; t.x = x; t.y = y; t.z = z; return t;
176
+ }
177
+
178
+ __VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
179
+ {
180
+ uint3 t; t.x = x; t.y = y; t.z = z; return t;
181
+ }
182
+
183
+ __VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
184
+ {
185
+ int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
186
+ }
187
+
188
+ __VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
189
+ {
190
+ uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
191
+ }
192
+
193
+ __VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
194
+ {
195
+ long1 t; t.x = x; return t;
196
+ }
197
+
198
+ __VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
199
+ {
200
+ ulong1 t; t.x = x; return t;
201
+ }
202
+
203
+ __VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
204
+ {
205
+ long2 t; t.x = x; t.y = y; return t;
206
+ }
207
+
208
+ __VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
209
+ {
210
+ ulong2 t; t.x = x; t.y = y; return t;
211
+ }
212
+
213
+ __VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
214
+ {
215
+ long3 t; t.x = x; t.y = y; t.z = z; return t;
216
+ }
217
+
218
+ __VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
219
+ {
220
+ ulong3 t; t.x = x; t.y = y; t.z = z; return t;
221
+ }
222
+
223
+ __VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
224
+ {
225
+ long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
226
+ }
227
+
228
+ __VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
229
+ {
230
+ ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
231
+ }
232
+
233
+ __VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
234
+ {
235
+ float1 t; t.x = x; return t;
236
+ }
237
+
238
+ __VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
239
+ {
240
+ float2 t; t.x = x; t.y = y; return t;
241
+ }
242
+
243
+ __VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
244
+ {
245
+ float3 t; t.x = x; t.y = y; t.z = z; return t;
246
+ }
247
+
248
+ __VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
249
+ {
250
+ float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
251
+ }
252
+
253
+ __VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
254
+ {
255
+ longlong1 t; t.x = x; return t;
256
+ }
257
+
258
+ __VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
259
+ {
260
+ ulonglong1 t; t.x = x; return t;
261
+ }
262
+
263
+ __VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
264
+ {
265
+ longlong2 t; t.x = x; t.y = y; return t;
266
+ }
267
+
268
+ __VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
269
+ {
270
+ ulonglong2 t; t.x = x; t.y = y; return t;
271
+ }
272
+
273
+ __VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
274
+ {
275
+ longlong3 t; t.x = x; t.y = y; t.z = z; return t;
276
+ }
277
+
278
+ __VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
279
+ {
280
+ ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
281
+ }
282
+
283
+ __VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
284
+ {
285
+ longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
286
+ }
287
+
288
+ __VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
289
+ {
290
+ ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
291
+ }
292
+
293
+ __VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
294
+ {
295
+ double1 t; t.x = x; return t;
296
+ }
297
+
298
+ __VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
299
+ {
300
+ double2 t; t.x = x; t.y = y; return t;
301
+ }
302
+
303
+ __VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
304
+ {
305
+ double3 t; t.x = x; t.y = y; t.z = z; return t;
306
+ }
307
+
308
+ __VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
309
+ {
310
+ double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
311
+ }
312
+
313
+ #undef __VECTOR_FUNCTIONS_DECL__
314
+
315
+ #endif /* !__VECTOR_FUNCTIONS_HPP__ */
316
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (213 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (221 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv_train : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_TRAIN_H_)
55
+ #define CUDNN_ADV_TRAIN_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+ #include "cudnn_ops_train.h"
63
+ #include "cudnn_adv_infer.h"
64
+
65
+ /* These version numbers are autogenerated, do not edit manually. */
66
+ #define CUDNN_ADV_TRAIN_MAJOR 8
67
+ #define CUDNN_ADV_TRAIN_MINOR 7
68
+ #define CUDNN_ADV_TRAIN_PATCH 0
69
+
70
+ #if (CUDNN_ADV_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_TRAIN_MINOR != CUDNN_MINOR) || \
71
+ (CUDNN_ADV_TRAIN_PATCH != CUDNN_PATCHLEVEL)
72
+ #error Version mismatch in cuDNN ADV TRAIN!!!
73
+ #endif
74
+
75
+ #if defined(__cplusplus)
76
+ extern "C" {
77
+ #endif
78
+
79
/* Selects how partial gradients are combined with the wgrad output buffers. */
typedef enum {
    /* partial gradients are added to the wgrad output buffers */
    CUDNN_WGRAD_MODE_ADD = 0,
    /* partial gradients are written to the wgrad output buffers */
    CUDNN_WGRAD_MODE_SET = 1
} cudnnWgradMode_t;
83
+
84
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
85
+ cudnnRNNForwardTraining(cudnnHandle_t handle,
86
+ const cudnnRNNDescriptor_t rnnDesc,
87
+ const int seqLength,
88
+ const cudnnTensorDescriptor_t *xDesc,
89
+ const void *x,
90
+ const cudnnTensorDescriptor_t hxDesc,
91
+ const void *hx,
92
+ const cudnnTensorDescriptor_t cxDesc,
93
+ const void *cx,
94
+ const cudnnFilterDescriptor_t wDesc,
95
+ const void *w,
96
+ const cudnnTensorDescriptor_t *yDesc,
97
+ void *y,
98
+ const cudnnTensorDescriptor_t hyDesc,
99
+ void *hy,
100
+ const cudnnTensorDescriptor_t cyDesc,
101
+ void *cy,
102
+ void *workSpace,
103
+ size_t workSpaceSizeInBytes,
104
+ void *reserveSpace,
105
+ size_t reserveSpaceSizeInBytes);
106
+
107
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
108
+ cudnnRNNBackwardData(cudnnHandle_t handle,
109
+ const cudnnRNNDescriptor_t rnnDesc,
110
+ const int seqLength,
111
+ const cudnnTensorDescriptor_t *yDesc,
112
+ const void *y,
113
+ const cudnnTensorDescriptor_t *dyDesc,
114
+ const void *dy,
115
+ const cudnnTensorDescriptor_t dhyDesc,
116
+ const void *dhy,
117
+ const cudnnTensorDescriptor_t dcyDesc,
118
+ const void *dcy,
119
+ const cudnnFilterDescriptor_t wDesc,
120
+ const void *w,
121
+ const cudnnTensorDescriptor_t hxDesc,
122
+ const void *hx,
123
+ const cudnnTensorDescriptor_t cxDesc,
124
+ const void *cx,
125
+ const cudnnTensorDescriptor_t *dxDesc,
126
+ void *dx,
127
+ const cudnnTensorDescriptor_t dhxDesc,
128
+ void *dhx,
129
+ const cudnnTensorDescriptor_t dcxDesc,
130
+ void *dcx,
131
+ void *workSpace,
132
+ size_t workSpaceSizeInBytes,
133
+ void *reserveSpace,
134
+ size_t reserveSpaceSizeInBytes);
135
+
136
+ cudnnStatus_t CUDNNWINAPI
137
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
138
+ cudnnRNNDescriptor_t rnnDesc,
139
+ const int32_t devSeqLengths[],
140
+ cudnnRNNDataDescriptor_t yDesc,
141
+ const void *y,
142
+ const void *dy,
143
+ cudnnRNNDataDescriptor_t xDesc,
144
+ void *dx,
145
+ cudnnTensorDescriptor_t hDesc,
146
+ const void *hx,
147
+ const void *dhy,
148
+ void *dhx,
149
+ cudnnTensorDescriptor_t cDesc,
150
+ const void *cx,
151
+ const void *dcy,
152
+ void *dcx,
153
+ size_t weightSpaceSize,
154
+ const void *weightSpace,
155
+ size_t workSpaceSize,
156
+ void *workSpace,
157
+ size_t reserveSpaceSize,
158
+ void *reserveSpace);
159
+
160
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
161
+ cudnnRNNBackwardWeights(cudnnHandle_t handle,
162
+ const cudnnRNNDescriptor_t rnnDesc,
163
+ const int seqLength,
164
+ const cudnnTensorDescriptor_t *xDesc,
165
+ const void *x,
166
+ const cudnnTensorDescriptor_t hxDesc,
167
+ const void *hx,
168
+ const cudnnTensorDescriptor_t *yDesc,
169
+ const void *y,
170
+ const void *workSpace,
171
+ size_t workSpaceSizeInBytes,
172
+ const cudnnFilterDescriptor_t dwDesc,
173
+ void *dw,
174
+ const void *reserveSpace,
175
+ size_t reserveSpaceSizeInBytes);
176
+
177
+ cudnnStatus_t CUDNNWINAPI
178
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
179
+ cudnnRNNDescriptor_t rnnDesc,
180
+ cudnnWgradMode_t addGrad,
181
+ const int32_t devSeqLengths[],
182
+ cudnnRNNDataDescriptor_t xDesc,
183
+ const void *x,
184
+ cudnnTensorDescriptor_t hDesc,
185
+ const void *hx,
186
+ cudnnRNNDataDescriptor_t yDesc,
187
+ const void *y,
188
+ size_t weightSpaceSize,
189
+ void *dweightSpace,
190
+ size_t workSpaceSize,
191
+ void *workSpace,
192
+ size_t reserveSpaceSize,
193
+ void *reserveSpace);
194
+
195
+ /* RNN EX API */
196
+
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnRNNForwardTrainingEx(cudnnHandle_t handle,
199
+ const cudnnRNNDescriptor_t rnnDesc,
200
+ const cudnnRNNDataDescriptor_t xDesc,
201
+ const void *x,
202
+ const cudnnTensorDescriptor_t hxDesc,
203
+ const void *hx,
204
+ const cudnnTensorDescriptor_t cxDesc,
205
+ const void *cx,
206
+ const cudnnFilterDescriptor_t wDesc,
207
+ const void *w,
208
+ const cudnnRNNDataDescriptor_t yDesc,
209
+ void *y,
210
+ const cudnnTensorDescriptor_t hyDesc,
211
+ void *hy,
212
+ const cudnnTensorDescriptor_t cyDesc,
213
+ void *cy,
214
+ const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
215
+ const void *keys, /* reserved, should pass NULL */
216
+ const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
217
+ void *cAttn, /* reserved, should pass NULL */
218
+ const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
219
+ void *iAttn, /* reserved, should pass NULL */
220
+ const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
221
+ void *queries, /* reserved, should pass NULL */
222
+ void *workSpace,
223
+ size_t workSpaceSizeInBytes,
224
+ void *reserveSpace,
225
+ size_t reserveSpaceSizeInBytes);
226
+
227
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
228
+ cudnnRNNBackwardDataEx(cudnnHandle_t handle,
229
+ const cudnnRNNDescriptor_t rnnDesc,
230
+ const cudnnRNNDataDescriptor_t yDesc,
231
+ const void *y,
232
+ const cudnnRNNDataDescriptor_t dyDesc,
233
+ const void *dy,
234
+ const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
235
+ const void *dcAttn, /* reserved, should pass NULL */
236
+ const cudnnTensorDescriptor_t dhyDesc,
237
+ const void *dhy,
238
+ const cudnnTensorDescriptor_t dcyDesc,
239
+ const void *dcy,
240
+ const cudnnFilterDescriptor_t wDesc,
241
+ const void *w,
242
+ const cudnnTensorDescriptor_t hxDesc,
243
+ const void *hx,
244
+ const cudnnTensorDescriptor_t cxDesc,
245
+ const void *cx,
246
+ const cudnnRNNDataDescriptor_t dxDesc,
247
+ void *dx,
248
+ const cudnnTensorDescriptor_t dhxDesc,
249
+ void *dhx,
250
+ const cudnnTensorDescriptor_t dcxDesc,
251
+ void *dcx,
252
+ const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
253
+ void *dkeys, /* reserved, should pass NULL */
254
+ void *workSpace,
255
+ size_t workSpaceSizeInBytes,
256
+ void *reserveSpace,
257
+ size_t reserveSpaceSizeInBytes);
258
+
259
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
260
+ cudnnRNNBackwardWeightsEx(cudnnHandle_t handle,
261
+ const cudnnRNNDescriptor_t rnnDesc,
262
+ const cudnnRNNDataDescriptor_t xDesc,
263
+ const void *x,
264
+ const cudnnTensorDescriptor_t hxDesc,
265
+ const void *hx,
266
+ const cudnnRNNDataDescriptor_t yDesc,
267
+ const void *y,
268
+ void *workSpace,
269
+ size_t workSpaceSizeInBytes,
270
+ const cudnnFilterDescriptor_t dwDesc,
271
+ void *dw,
272
+ void *reserveSpace,
273
+ size_t reserveSpaceSizeInBytes);
274
+
275
+ /* RNN FIND API */
276
+
277
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
278
+ cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
279
+
280
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
281
+ cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle,
282
+ const cudnnRNNDescriptor_t rnnDesc,
283
+ const int seqLength,
284
+ const cudnnTensorDescriptor_t *xDesc,
285
+ const void *x,
286
+ const cudnnTensorDescriptor_t hxDesc,
287
+ const void *hx,
288
+ const cudnnTensorDescriptor_t cxDesc,
289
+ const void *cx,
290
+ const cudnnFilterDescriptor_t wDesc,
291
+ const void *w,
292
+ const cudnnTensorDescriptor_t *yDesc,
293
+ void *y,
294
+ const cudnnTensorDescriptor_t hyDesc,
295
+ void *hy,
296
+ const cudnnTensorDescriptor_t cyDesc,
297
+ void *cy,
298
+ const float findIntensity,
299
+ const int requestedAlgoCount,
300
+ int *returnedAlgoCount,
301
+ cudnnAlgorithmPerformance_t *perfResults,
302
+ void *workspace,
303
+ size_t workSpaceSizeInBytes,
304
+ void *reserveSpace,
305
+ size_t reserveSpaceSizeInBytes);
306
+
307
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
308
+ cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
309
+
310
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
311
+ cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle,
312
+ const cudnnRNNDescriptor_t rnnDesc,
313
+ const int seqLength,
314
+ const cudnnTensorDescriptor_t *yDesc,
315
+ const void *y,
316
+ const cudnnTensorDescriptor_t *dyDesc,
317
+ const void *dy,
318
+ const cudnnTensorDescriptor_t dhyDesc,
319
+ const void *dhy,
320
+ const cudnnTensorDescriptor_t dcyDesc,
321
+ const void *dcy,
322
+ const cudnnFilterDescriptor_t wDesc,
323
+ const void *w,
324
+ const cudnnTensorDescriptor_t hxDesc,
325
+ const void *hx,
326
+ const cudnnTensorDescriptor_t cxDesc,
327
+ const void *cx,
328
+ const cudnnTensorDescriptor_t *dxDesc,
329
+ void *dx,
330
+ const cudnnTensorDescriptor_t dhxDesc,
331
+ void *dhx,
332
+ const cudnnTensorDescriptor_t dcxDesc,
333
+ void *dcx,
334
+ const float findIntensity,
335
+ const int requestedAlgoCount,
336
+ int *returnedAlgoCount,
337
+ cudnnAlgorithmPerformance_t *perfResults,
338
+ void *workspace,
339
+ size_t workSpaceSizeInBytes,
340
+ void *reserveSpace,
341
+ size_t reserveSpaceSizeInBytes);
342
+
343
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
344
+ cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
345
+
346
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
347
+ cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle,
348
+ const cudnnRNNDescriptor_t rnnDesc,
349
+ const int seqLength,
350
+ const cudnnTensorDescriptor_t *xDesc,
351
+ const void *x,
352
+ const cudnnTensorDescriptor_t hxDesc,
353
+ const void *hx,
354
+ const cudnnTensorDescriptor_t *yDesc,
355
+ const void *y,
356
+ const float findIntensity,
357
+ const int requestedAlgoCount,
358
+ int *returnedAlgoCount,
359
+ cudnnAlgorithmPerformance_t *perfResults,
360
+ const void *workspace,
361
+ size_t workSpaceSizeInBytes,
362
+ const cudnnFilterDescriptor_t dwDesc,
363
+ void *dw,
364
+ const void *reserveSpace,
365
+ size_t reserveSpaceSizeInBytes);
366
+
367
+ cudnnStatus_t CUDNNWINAPI
368
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
369
+ const cudnnAttnDescriptor_t attnDesc,
370
+ const int loWinIdx[],
371
+ const int hiWinIdx[],
372
+ const int devSeqLengthsDQDO[],
373
+ const int devSeqLengthsDKDV[],
374
+ const cudnnSeqDataDescriptor_t doDesc,
375
+ const void *dout,
376
+ const cudnnSeqDataDescriptor_t dqDesc,
377
+ void *dqueries,
378
+ const void *queries,
379
+ const cudnnSeqDataDescriptor_t dkDesc,
380
+ void *dkeys,
381
+ const void *keys,
382
+ const cudnnSeqDataDescriptor_t dvDesc,
383
+ void *dvalues,
384
+ const void *values,
385
+ size_t weightSizeInBytes,
386
+ const void *weights,
387
+ size_t workSpaceSizeInBytes,
388
+ void *workSpace,
389
+ size_t reserveSpaceSizeInBytes,
390
+ void *reserveSpace);
391
+
392
+ cudnnStatus_t CUDNNWINAPI
393
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
394
+ const cudnnAttnDescriptor_t attnDesc,
395
+ cudnnWgradMode_t addGrad,
396
+ const cudnnSeqDataDescriptor_t qDesc,
397
+ const void *queries,
398
+ const cudnnSeqDataDescriptor_t kDesc,
399
+ const void *keys,
400
+ const cudnnSeqDataDescriptor_t vDesc,
401
+ const void *values,
402
+ const cudnnSeqDataDescriptor_t doDesc,
403
+ const void *dout,
404
+ size_t weightSizeInBytes,
405
+ const void *weights,
406
+ void *dweights,
407
+ size_t workSpaceSizeInBytes,
408
+ void *workSpace,
409
+ size_t reserveSpaceSizeInBytes,
410
+ void *reserveSpace);
411
+
412
+ /*
413
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destroy/set/get functions
414
+ */
415
+ /* Input normalization mode for loss function */
416
/* Normalization applied to the loss-function input: none (0) or softmax (1). */
typedef enum {
    CUDNN_LOSS_NORMALIZATION_NONE    = 0,
    CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1
} cudnnLossNormalizationMode_t;
420
+
421
+ cudnnStatus_t CUDNNWINAPI
422
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
423
+
424
+ cudnnStatus_t CUDNNWINAPI
425
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
426
+
427
+ cudnnStatus_t CUDNNWINAPI
428
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
429
+ cudnnDataType_t compType,
430
+ cudnnLossNormalizationMode_t normMode,
431
+ cudnnNanPropagation_t gradMode);
432
+
433
+ cudnnStatus_t CUDNNWINAPI
434
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
435
+ cudnnDataType_t compType,
436
+ cudnnLossNormalizationMode_t normMode,
437
+ cudnnNanPropagation_t gradMode,
438
+ int maxLabelLength);
439
+
440
+ cudnnStatus_t CUDNNWINAPI
441
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
442
+
443
+ cudnnStatus_t CUDNNWINAPI
444
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
445
+ cudnnDataType_t *compType,
446
+ cudnnLossNormalizationMode_t *normMode,
447
+ cudnnNanPropagation_t *gradMode);
448
+
449
+ cudnnStatus_t CUDNNWINAPI
450
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
451
+ cudnnDataType_t *compType,
452
+ cudnnLossNormalizationMode_t *normMode,
453
+ cudnnNanPropagation_t *gradMode,
454
+ int *maxLabelLength);
455
+
456
+ cudnnStatus_t CUDNNWINAPI
457
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
458
+
459
+ /* return the ctc costs and gradients, given the probabilities and labels */
460
+ cudnnStatus_t CUDNNWINAPI
461
+ cudnnCTCLoss(
462
+ cudnnHandle_t handle,
463
+ const cudnnTensorDescriptor_t
464
+ probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
465
+ mini batch size, A is the alphabet size) */
466
+ const void *probs, /* probabilities after softmax, in GPU memory */
467
+ const int hostLabels[], /* labels, in CPU memory */
468
+ const int hostLabelLengths[], /* the length of each label, in CPU memory */
469
+ const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
470
+ void *costs, /* the returned costs of CTC, in GPU memory */
471
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
472
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
473
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
474
+ cudnnCTCLossDescriptor_t ctcLossDesc,
475
+ void *workspace, /* pointer to the workspace, in GPU memory */
476
+ size_t workSpaceSizeInBytes); /* size of the workspace */
477
+
478
+ /* return the ctc costs and gradients, given the probabilities and labels */
479
+ cudnnStatus_t CUDNNWINAPI
480
+ cudnnCTCLoss_v8(
481
+ cudnnHandle_t handle,
482
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
483
+ cudnnCTCLossDescriptor_t ctcLossDesc,
484
+ const cudnnTensorDescriptor_t
485
+ probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
486
+ mini batch size, A is the alphabet size) */
487
+ const void *probs, /* probabilities after softmax, in GPU memory */
488
+ const int labels[], /* labels, in GPU memory */
489
+ const int labelLengths[], /* the length of each label, in GPU memory */
490
+ const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
491
+ void *costs, /* the returned costs of CTC, in GPU memory */
492
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
493
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
494
+ size_t workSpaceSizeInBytes, /* size of the workspace */
495
+ void *workspace); /* pointer to the workspace, in GPU memory */
496
+
497
+ /* return the workspace size needed for ctc */
498
+ cudnnStatus_t CUDNNWINAPI
499
+ cudnnGetCTCLossWorkspaceSize(
500
+ cudnnHandle_t handle,
501
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
502
+ timing steps, N is the mini batch size, A is the alphabet size) */
503
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
504
+ dimensions are T,N,A. To compute costs
505
+ only, set it to NULL */
506
+ const int *labels, /* labels, in CPU memory */
507
+ const int *labelLengths, /* the length of each label, in CPU memory */
508
+ const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
509
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
510
+ cudnnCTCLossDescriptor_t ctcLossDesc,
511
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
512
+
513
+ /* return the workspace size needed for ctc */
514
+ cudnnStatus_t CUDNNWINAPI
515
+ cudnnGetCTCLossWorkspaceSize_v8(
516
+ cudnnHandle_t handle,
517
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
518
+ cudnnCTCLossDescriptor_t ctcLossDesc,
519
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
520
+ timing steps, N is the mini batch size, A is the alphabet size) */
521
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
522
+ dimensions are T,N,A. To compute costs
523
+ only, set it to NULL */
524
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
525
+
526
+ /*
527
+ * \brief Cross-library version checker.
528
+ * This function is implemented differently in each sub-library. Each sublib
529
+ * checks whether its own version matches that of its dependencies.
530
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
531
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
532
+ */
533
+ cudnnStatus_t CUDNNWINAPI
534
+ cudnnAdvTrainVersionCheck(void);
535
+
536
+ #if defined(__cplusplus)
537
+ }
538
+ #endif
539
+
540
+ #endif /* CUDNN_ADV_TRAIN_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDNN_BACKEND_H_
51
+ #define _CUDNN_BACKEND_H_
52
+
53
+ /*
54
+ * The content in this header file is under development to be included in cudnn.h in the future
55
+ * Production code should have all includes of this header file removed.
56
+ */
57
+
58
+ #include "cudnn_ops_infer.h"
59
+ #include "cudnn_cnn_infer.h"
60
+
61
+ /* NOTE: definition in extern "C" to be copied later to public header */
62
+ #if defined(__cplusplus)
63
+ extern "C" {
64
+ #endif
65
+
66
+ typedef void *cudnnBackendDescriptor_t; /* backend descriptor handle, declared as an untyped (void *) pointer */
67
+
68
/* Fraction stored as a pair of signed 64-bit integers. */
struct cudnnFractionStruct {
    int64_t numerator;   /* top part of the fraction */
    int64_t denominator; /* bottom part of the fraction */
};
typedef struct cudnnFractionStruct cudnnFraction_t;
72
+
73
/*
 * Pointwise operation selector.
 * Enumerators are listed in ascending numeric order; every value is explicit.
 */
typedef enum {
    /* values 0-9 */
    CUDNN_POINTWISE_ADD        = 0,
    CUDNN_POINTWISE_MUL        = 1,
    CUDNN_POINTWISE_MIN        = 2,
    CUDNN_POINTWISE_MAX        = 3,
    CUDNN_POINTWISE_SQRT       = 4,
    CUDNN_POINTWISE_ADD_SQUARE = 5,
    CUDNN_POINTWISE_DIV        = 6,
    CUDNN_POINTWISE_MOD        = 7,
    CUDNN_POINTWISE_POW        = 8,
    CUDNN_POINTWISE_SUB        = 9,

    /* values 10-21 */
    CUDNN_POINTWISE_ABS      = 10,
    CUDNN_POINTWISE_CEIL     = 11,
    CUDNN_POINTWISE_COS      = 12,
    CUDNN_POINTWISE_EXP      = 13,
    CUDNN_POINTWISE_FLOOR    = 14,
    CUDNN_POINTWISE_LOG      = 15,
    CUDNN_POINTWISE_NEG      = 16,
    CUDNN_POINTWISE_RSQRT    = 17,
    CUDNN_POINTWISE_SIN      = 18,
    CUDNN_POINTWISE_TAN      = 19,
    CUDNN_POINTWISE_ERF      = 20,
    CUDNN_POINTWISE_IDENTITY = 21,

    /* values 100-107: *_FWD modes */
    CUDNN_POINTWISE_RELU_FWD             = 100,
    CUDNN_POINTWISE_TANH_FWD             = 101,
    CUDNN_POINTWISE_SIGMOID_FWD          = 102,
    CUDNN_POINTWISE_ELU_FWD              = 103,
    CUDNN_POINTWISE_GELU_FWD             = 104,
    CUDNN_POINTWISE_SOFTPLUS_FWD         = 105,
    CUDNN_POINTWISE_SWISH_FWD            = 106,
    CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,

    /* values 200-207: *_BWD modes */
    CUDNN_POINTWISE_RELU_BWD             = 200,
    CUDNN_POINTWISE_TANH_BWD             = 201,
    CUDNN_POINTWISE_SIGMOID_BWD          = 202,
    CUDNN_POINTWISE_ELU_BWD              = 203,
    CUDNN_POINTWISE_GELU_BWD             = 204,
    CUDNN_POINTWISE_SOFTPLUS_BWD         = 205,
    CUDNN_POINTWISE_SWISH_BWD            = 206,
    CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,

    /* values 300-305: CMP_* modes */
    CUDNN_POINTWISE_CMP_EQ  = 300,
    CUDNN_POINTWISE_CMP_NEQ = 301,
    CUDNN_POINTWISE_CMP_GT  = 302,
    CUDNN_POINTWISE_CMP_GE  = 303,
    CUDNN_POINTWISE_CMP_LT  = 304,
    CUDNN_POINTWISE_CMP_LE  = 305,

    /* values 400-402: LOGICAL_* modes */
    CUDNN_POINTWISE_LOGICAL_AND = 400,
    CUDNN_POINTWISE_LOGICAL_OR  = 401,
    CUDNN_POINTWISE_LOGICAL_NOT = 402,

    /* value 501 */
    CUDNN_POINTWISE_GEN_INDEX = 501,

    /* value 601 */
    CUDNN_POINTWISE_BINARY_SELECT = 601
} cudnnPointwiseMode_t;
131
+
132
/*
 * Resample mode selector, listed in ascending numeric order.
 * CUDNN_RESAMPLE_AVGPOOL and CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING share
 * the value 2.
 */
typedef enum {
    CUDNN_RESAMPLE_NEAREST                 = 0,
    CUDNN_RESAMPLE_BILINEAR                = 1,
    CUDNN_RESAMPLE_AVGPOOL                 = 2,
    CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
    CUDNN_RESAMPLE_MAXPOOL                 = 3,
    CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4
} cudnnResampleMode_t;
140
+
141
/* Signal mode selector: SET is 0, WAIT is 1. */
typedef enum {
    CUDNN_SIGNAL_SET  = 0,
    CUDNN_SIGNAL_WAIT = 1
} cudnnSignalMode_t;
145
+
146
/* GenStats mode selector; SUM_SQSUM (0) is the only enumerator defined here. */
typedef enum {
    CUDNN_GENSTATS_SUM_SQSUM = 0
} cudnnGenStatsMode_t;
149
+
150
/* BN-finalize statistics mode selector: TRAINING is 0, INFERENCE is 1. */
typedef enum {
    CUDNN_BN_FINALIZE_STATISTICS_TRAINING  = 0,
    CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1
} cudnnBnFinalizeStatsMode_t;
154
+
155
/*
 * RNG distribution selector. The values are spelled out explicitly and match
 * the implicit 0, 1, 2 numbering of the enumerators in declaration order.
 */
typedef enum {
    CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0,
    CUDNN_RNG_DISTRIBUTION_UNIFORM   = 1,
    CUDNN_RNG_DISTRIBUTION_NORMAL    = 2
} cudnnRngDistribution_t;
160
+
161
+ typedef enum {
162
+ CUDNN_ATTR_POINTWISE_MODE = 0,
163
+ CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
164
+ CUDNN_ATTR_POINTWISE_NAN_PROPAGATION = 2,
165
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
166
+ CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
167
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
168
+ CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
169
+ CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
170
+ CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
171
+ CUDNN_ATTR_POINTWISE_AXIS = 9,
172
+
173
+ CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
174
+ CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
175
+ CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
176
+ CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
177
+ CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
178
+ CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
179
+ CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
180
+
181
+ CUDNN_ATTR_ENGINEHEUR_MODE = 200,
182
+ CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
183
+ CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
184
+
185
+ CUDNN_ATTR_ENGINECFG_ENGINE = 300,
186
+ CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
187
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
188
+
189
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400,
190
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
191
+ CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
192
+ CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
193
+ CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
194
+ CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
195
+
196
+ CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
197
+ CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
198
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
199
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
200
+
201
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
202
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
203
+
204
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
205
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
206
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
207
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
208
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
209
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
210
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
211
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
212
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
213
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
214
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
215
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
216
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
217
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
218
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
219
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
220
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
221
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
222
+
223
+ CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
224
+ CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
225
+ CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
226
+ CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
227
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
228
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
229
+ CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
230
+ CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
231
+ CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
232
+
233
+ CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
234
+ CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
235
+ CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
236
+ CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
237
+ CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
238
+
239
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
240
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
241
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
242
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
243
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
244
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
245
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
246
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
247
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
248
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
249
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
250
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
251
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
252
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
253
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
254
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
255
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
256
+
257
+ CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800,
258
+ CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
259
+ CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
260
+
261
+ CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
262
+ CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
263
+ CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
264
+ CUDNN_ATTR_TENSOR_STRIDES = 903,
265
+ CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
266
+ CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
267
+ CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
268
+ CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
269
+ CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
270
+ CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
271
+
272
+ CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
273
+ CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
274
+ CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
275
+ CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
276
+
277
+ CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
278
+ CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
279
+
280
+ CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
281
+ CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
282
+ CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
283
+ CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
284
+
285
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
286
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
287
+ CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
288
+ CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
289
+ CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
290
+ CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
291
+
292
+ CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
293
+
294
+ CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
295
+ CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
296
+ CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
297
+ CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
298
+ CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524,
299
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
300
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
301
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
302
+
303
+ CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
304
+ CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
305
+
306
+ CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
307
+ CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
308
+ CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
309
+
310
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
311
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
312
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
313
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
314
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
315
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
316
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
317
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
318
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
319
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
320
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
321
+
322
+ CUDNN_ATTR_RESAMPLE_MODE = 1700,
323
+ CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
324
+ CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
325
+ CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
326
+ CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
327
+ CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
328
+ CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
329
+ CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
330
+ CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
331
+
332
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
333
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
334
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
335
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA = 1713,
336
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA = 1714,
337
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
338
+
339
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
340
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
341
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
342
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA = 1723,
343
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA = 1724,
344
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
345
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
346
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
347
+
348
+ CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
349
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
350
+ CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
351
+ CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
352
+
353
+ CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
354
+ CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
355
+ CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
356
+ CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
357
+ CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
358
+
359
+ CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
360
+ CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
361
+ CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
362
+ CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
363
+ CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
364
+ CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
365
+ CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
366
+ CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
367
+ CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
368
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
369
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
370
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
371
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
372
+ CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
373
+ CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
374
+
375
+ CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
376
+ CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
377
+ CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
378
+ CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
379
+ CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
380
+ CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
381
+ CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
382
+ CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
383
+ CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
384
+ CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
385
+ CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
386
+
387
+ CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
388
+ CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
389
+
390
+ CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
391
+ CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
392
+ CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
393
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
394
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
395
+ CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
396
+
397
+ CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
398
+ CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
399
+ CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
400
+
401
+ } cudnnBackendAttributeName_t;
402
+
403
+ typedef enum { /* Enumeration used for the attributeType parameter of cudnnBackendSetAttribute() and cudnnBackendGetAttribute() declared later in this header. */
404
+ CUDNN_TYPE_HANDLE = 0, /* only enumerator with an explicit value; the rest are numbered consecutively after it */
405
+ CUDNN_TYPE_DATA_TYPE,
406
+ CUDNN_TYPE_BOOLEAN,
407
+ CUDNN_TYPE_INT64,
408
+ CUDNN_TYPE_FLOAT,
409
+ CUDNN_TYPE_DOUBLE,
410
+ CUDNN_TYPE_VOID_PTR,
411
+ CUDNN_TYPE_CONVOLUTION_MODE,
412
+ CUDNN_TYPE_HEUR_MODE,
413
+ CUDNN_TYPE_KNOB_TYPE,
414
+ CUDNN_TYPE_NAN_PROPOGATION, /* NOTE(review): "PROPOGATION" looks like a misspelling of "PROPAGATION", but it is the declared public name; renaming it would break callers */
415
+ CUDNN_TYPE_NUMERICAL_NOTE,
416
+ CUDNN_TYPE_LAYOUT_TYPE,
417
+ CUDNN_TYPE_ATTRIB_NAME,
418
+ CUDNN_TYPE_POINTWISE_MODE,
419
+ CUDNN_TYPE_BACKEND_DESCRIPTOR,
420
+ CUDNN_TYPE_GENSTATS_MODE,
421
+ CUDNN_TYPE_BN_FINALIZE_STATS_MODE,
422
+ CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,
423
+ CUDNN_TYPE_BEHAVIOR_NOTE,
424
+ CUDNN_TYPE_TENSOR_REORDERING_MODE,
425
+ CUDNN_TYPE_RESAMPLE_MODE,
426
+ CUDNN_TYPE_PADDING_MODE,
427
+ CUDNN_TYPE_INT32,
428
+ CUDNN_TYPE_CHAR,
429
+ CUDNN_TYPE_SIGNAL_MODE,
430
+ CUDNN_TYPE_FRACTION,
431
+ CUDNN_TYPE_NORM_MODE,
432
+ CUDNN_TYPE_NORM_FWD_PHASE,
433
+ CUDNN_TYPE_RNG_DISTRIBUTION /* last enumerator; no trailing count sentinel in this enum */
434
+ } cudnnBackendAttributeType_t;
435
+
436
+ typedef enum { /* Enumeration used for the descriptorType parameter of cudnnBackendCreateDescriptor() declared later in this header. */
437
+ CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, /* only enumerator with an explicit value; the rest are numbered consecutively, so inserting or reordering entries changes every later value */
438
+ CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR,
439
+ CUDNN_BACKEND_ENGINE_DESCRIPTOR,
440
+ CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
441
+ CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
442
+ CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
443
+ CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR,
444
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR,
445
+ CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR,
446
+ CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR,
447
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
448
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
449
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
450
+ CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR,
451
+ CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR,
452
+ CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
453
+ CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR,
454
+ CUDNN_BACKEND_TENSOR_DESCRIPTOR,
455
+ CUDNN_BACKEND_MATMUL_DESCRIPTOR,
456
+ CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
457
+ CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR,
458
+ CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
459
+ CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
460
+ CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR,
461
+ CUDNN_BACKEND_RESAMPLE_DESCRIPTOR,
462
+ CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR,
463
+ CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR,
464
+ CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR,
465
+ CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR,
466
+ CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR,
467
+ CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR,
468
+ CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR,
469
+ CUDNN_BACKEND_RNG_DESCRIPTOR,
470
+ CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR /* last enumerator; no trailing count sentinel in this enum */
471
+ } cudnnBackendDescriptorType_t;
472
+
473
+ typedef enum { /* Numerical-note identifiers; presumably the value type behind CUDNN_TYPE_NUMERICAL_NOTE / CUDNN_ATTR_ENGINE_NUMERICAL_NOTE - confirm against the cuDNN backend reference. */
474
+ CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, /* only explicit value; later enumerators are numbered consecutively */
475
+ CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS,
476
+ CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION,
477
+ CUDNN_NUMERICAL_NOTE_FFT,
478
+ CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC,
479
+ CUDNN_NUMERICAL_NOTE_WINOGRAD,
480
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4,
481
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6,
482
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13,
483
+ CUDNN_NUMERICAL_NOTE_TYPE_COUNT, /* trailing sentinel: implicit value 9, equal to the number of notes listed above */
484
+ } cudnnBackendNumericalNote_t;
485
+
486
+ typedef enum { /* Behavior-note identifiers; presumably the value type behind CUDNN_TYPE_BEHAVIOR_NOTE / CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE - confirm against the cuDNN backend reference. */
487
+ CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
488
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
489
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
490
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, /* trailing sentinel: implicit value 3, equal to the number of notes listed above */
491
+ } cudnnBackendBehaviorNote_t;
492
+
493
+ typedef enum { /* Knob identifiers with explicit values 0..28; presumably the value type behind CUDNN_TYPE_KNOB_TYPE - confirm against the cuDNN backend reference. */
494
+ CUDNN_KNOB_TYPE_SPLIT_K = 0,
495
+ CUDNN_KNOB_TYPE_SWIZZLE = 1,
496
+ CUDNN_KNOB_TYPE_TILE_SIZE = 2,
497
+ CUDNN_KNOB_TYPE_USE_TEX = 3,
498
+ CUDNN_KNOB_TYPE_EDGE = 4,
499
+ CUDNN_KNOB_TYPE_KBLOCK = 5,
500
+ CUDNN_KNOB_TYPE_LDGA = 6,
501
+ CUDNN_KNOB_TYPE_LDGB = 7,
502
+ CUDNN_KNOB_TYPE_CHUNK_K = 8,
503
+ CUDNN_KNOB_TYPE_SPLIT_H = 9,
504
+ CUDNN_KNOB_TYPE_WINO_TILE = 10,
505
+ CUDNN_KNOB_TYPE_MULTIPLY = 11,
506
+ CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
507
+ CUDNN_KNOB_TYPE_TILEK = 13,
508
+ CUDNN_KNOB_TYPE_STAGES = 14,
509
+ CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
510
+ CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16,
511
+ CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
512
+ CUDNN_KNOB_TYPE_IDX_MODE = 18,
513
+ CUDNN_KNOB_TYPE_SLICED = 19,
514
+ CUDNN_KNOB_TYPE_SPLIT_RS = 20,
515
+ CUDNN_KNOB_TYPE_SINGLEBUFFER = 21,
516
+ CUDNN_KNOB_TYPE_LDGC = 22,
517
+ CUDNN_KNOB_TYPE_SPECFILT = 23,
518
+ CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
519
+ CUDNN_KNOB_TYPE_WORKSPACE = 25,
520
+ CUDNN_KNOB_TYPE_TILE_CGA = 26,
521
+ CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
522
+ CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
523
+
524
+ CUDNN_KNOB_TYPE_COUNTS, /* trailing sentinel: implicit value 29 (one past TILE_CGA_N), i.e. the number of knob types above */
525
+ } cudnnBackendKnobType_t;
526
+
527
+ typedef enum { /* Layout-type identifiers; presumably the value type behind CUDNN_TYPE_LAYOUT_TYPE / CUDNN_ATTR_LAYOUT_INFO_TYPES - confirm against the cuDNN backend reference. */
528
+ CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
529
+ CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
530
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
531
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
532
+ CUDNN_LAYOUT_TYPE_COUNT = 4, /* sentinel written explicitly here; matches the four layout types above */
533
+ } cudnnBackendLayoutType_t;
534
+
535
+ typedef enum { /* Heuristic-mode identifiers; presumably the value type behind CUDNN_TYPE_HEUR_MODE / CUDNN_ATTR_ENGINEHEUR_MODE - confirm against the cuDNN backend reference. */
536
+ CUDNN_HEUR_MODE_INSTANT = 0,
537
+ CUDNN_HEUR_MODE_B = 1,
538
+ CUDNN_HEUR_MODE_FALLBACK = 2,
539
+ CUDNN_HEUR_MODE_A = 3, /* note: MODE_A is numbered after MODE_B and FALLBACK, so the values do not follow the A/B naming order */
540
+ CUDNN_HEUR_MODES_COUNT = 4, /* sentinel written explicitly; matches the four modes above (name uses plural "MODES") */
541
+ } cudnnBackendHeurMode_t;
542
+
543
+ typedef enum { /* Tensor reordering modes; presumably the value type behind CUDNN_TYPE_TENSOR_REORDERING_MODE / CUDNN_ATTR_TENSOR_REORDERING_MODE - confirm. */
544
+ CUDNN_TENSOR_REORDERING_NONE = 0,
545
+ CUDNN_TENSOR_REORDERING_INT8x32 = 1,
546
+ } cudnnBackendTensorReordering_t; /* two explicit values, no count sentinel */
547
+
548
+ typedef enum { /* Padding modes; presumably the value type behind CUDNN_TYPE_PADDING_MODE / CUDNN_ATTR_RESAMPLE_PADDING_MODE - confirm against the cuDNN backend reference. */
549
+ CUDNN_ZERO_PAD = 0,
550
+ CUDNN_NEG_INF_PAD = 1,
551
+ CUDNN_EDGE_VAL_PAD = 2,
552
+ } cudnnPaddingMode_t; /* three explicit values, no count sentinel */
553
+
554
+ typedef enum { /* Normalization modes; presumably the value type behind CUDNN_TYPE_NORM_MODE and the CUDNN_ATTR_OPERATION_NORM_*_MODE attributes - confirm. */
555
+ CUDNN_LAYER_NORM = 0,
556
+ CUDNN_INSTANCE_NORM = 1,
557
+ CUDNN_BATCH_NORM = 2,
558
+ CUDNN_GROUP_NORM = 3,
559
+ } cudnnBackendNormMode_t; /* four explicit values, no count sentinel */
560
+
561
+ typedef enum { /* Forward-normalization phase; presumably the value type behind CUDNN_TYPE_NORM_FWD_PHASE / CUDNN_ATTR_OPERATION_NORM_FWD_PHASE - confirm. */
562
+ CUDNN_NORM_FWD_INFERENCE = 0,
563
+ CUDNN_NORM_FWD_TRAINING = 1,
564
+ } cudnnBackendNormFwdPhase_t; /* two explicit values, no count sentinel */
565
+
566
+ cudnnStatus_t CUDNNWINAPI /* descriptor lifecycle prototypes: all four return a cudnnStatus_t */
567
+ cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor); /* descriptor is passed by pointer - presumably receives the new handle (confirm) */
568
+
569
+ cudnnStatus_t CUDNNWINAPI
570
+ cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor); /* takes the descriptor handle by value */
571
+
572
+ cudnnStatus_t CUDNNWINAPI
573
+ cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor); /* takes the descriptor handle by value */
574
+
575
+ cudnnStatus_t CUDNNWINAPI
576
+ cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor); /* takes the descriptor handle by value; NOTE(review): ordering relative to SetAttribute/GetAttribute is not stated in this header - see the cuDNN backend reference */
577
+
578
+ cudnnStatus_t CUDNNWINAPI /* sets one named attribute on a backend descriptor; returns a cudnnStatus_t */
579
+ cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
580
+ cudnnBackendAttributeName_t attributeName, /* which attribute (a CUDNN_ATTR_* enumerator) */
581
+ cudnnBackendAttributeType_t attributeType, /* a CUDNN_TYPE_* enumerator; presumably the element type of arrayOfElements (confirm) */
582
+ int64_t elementCount, /* presumably the number of elements in arrayOfElements (confirm) */
583
+ const void *arrayOfElements); /* read-only, untyped input buffer */
584
+
585
+ cudnnStatus_t CUDNNWINAPI /* reads one named attribute from a backend descriptor; returns a cudnnStatus_t */
586
+ cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
587
+ cudnnBackendAttributeName_t attributeName, /* which attribute (a CUDNN_ATTR_* enumerator) */
588
+ cudnnBackendAttributeType_t attributeType, /* a CUDNN_TYPE_* enumerator; presumably the element type of arrayOfElements (confirm) */
589
+ int64_t requestedElementCount, /* presumably the capacity of arrayOfElements in elements (confirm) */
590
+ int64_t *elementCount, /* non-const pointer - presumably receives an element count (confirm exact meaning) */
591
+ void *arrayOfElements); /* writable, untyped buffer - presumably receives the attribute values (confirm) */
592
+
593
+ cudnnStatus_t CUDNNWINAPI /* takes a cuDNN handle plus two backend descriptors named executionPlan and variantPack; returns a cudnnStatus_t */
594
+ cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack); /* both descriptors share the same C type, so the compiler cannot catch swapped arguments */
595
+
596
+ #if defined(__cplusplus)
597
+ }
598
+ #endif
599
+
600
+ #endif /* _CUDNN_BACKEND_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h ADDED
@@ -0,0 +1,1183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops_infer : cuDNN's basic definitions and inference operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_INFER_H_)
55
+ #define CUDNN_OPS_INFER_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_OPS_INFER_MAJOR 8
64
+ #define CUDNN_OPS_INFER_MINOR 7
65
+ #define CUDNN_OPS_INFER_PATCH 0
66
+
67
+ #if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \
68
+ (CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL)
69
+ #error Version mismatch in cuDNN OPS INFER!!!
70
+ #endif
71
+
72
+ #ifndef CUDNNWINAPI
73
+ #ifdef _WIN32
74
+ #define CUDNNWINAPI __stdcall
75
+ #else
76
+ #define CUDNNWINAPI
77
+ #endif
78
+ #endif
79
+
80
+ /* Warnings for deprecated APIs are enabled using the CUDNN_WARN_DEPRECATED macro */
81
+ #if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
82
+ /* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
83
+ #define CUDNN_DEPRECATED __attribute__((deprecated))
84
+ #elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
85
+ /* Microsoft Visual C++ */
86
+ #define CUDNN_DEPRECATED __declspec(deprecated)
87
+ #elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
88
+ /* C++14 compilers */
89
+ #define CUDNN_DEPRECATED [[deprecated]]
90
+ #else
91
+ /* No support for the deprecated attribute */
92
+ #define CUDNN_DEPRECATED
93
+ #endif
94
+
95
+ #if defined(__cplusplus)
96
+ extern "C" {
97
+ #endif
98
+
99
+ struct cudnnContext; /* forward declaration only: the struct is not defined in this header, so the type is opaque to callers */
100
+ typedef struct cudnnContext *cudnnHandle_t; /* handle type = pointer to the opaque context */
101
+
102
+ size_t CUDNNWINAPI
103
+ cudnnGetVersion(void); /* takes no arguments; returns a size_t (encoding of the version number is not shown here) */
104
+
105
+ size_t CUDNNWINAPI
106
+ cudnnGetMaxDeviceVersion(void); /* takes no arguments; returns a size_t */
107
+
108
+ /* Returns CUDA Runtime version statically linked against cudnn */
109
+ size_t CUDNNWINAPI
110
+ cudnnGetCudartVersion(void);
111
+
112
+ /*
113
+ * CUDNN return codes
114
+ */
115
+ typedef enum { /* status code returned by most functions declared in this header (e.g. cudnnCreate); every enumerator has an explicit value, 0 through 14 */
116
+ CUDNN_STATUS_SUCCESS = 0, /* the only zero value, so "status != 0" identifies any non-success code */
117
+ CUDNN_STATUS_NOT_INITIALIZED = 1,
118
+ CUDNN_STATUS_ALLOC_FAILED = 2,
119
+ CUDNN_STATUS_BAD_PARAM = 3,
120
+ CUDNN_STATUS_INTERNAL_ERROR = 4,
121
+ CUDNN_STATUS_INVALID_VALUE = 5,
122
+ CUDNN_STATUS_ARCH_MISMATCH = 6,
123
+ CUDNN_STATUS_MAPPING_ERROR = 7,
124
+ CUDNN_STATUS_EXECUTION_FAILED = 8,
125
+ CUDNN_STATUS_NOT_SUPPORTED = 9,
126
+ CUDNN_STATUS_LICENSE_ERROR = 10,
127
+ CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11,
128
+ CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12,
129
+ CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13,
130
+ CUDNN_STATUS_VERSION_MISMATCH = 14,
131
+ } cudnnStatus_t;
132
+
133
+ /* human-readable error messages */
134
+ const char *CUDNNWINAPI /* returns a pointer to const char: callers must not modify the string; ownership/lifetime is not stated here (presumably static storage - confirm) */
135
+ cudnnGetErrorString(cudnnStatus_t status);
136
+
137
+ /* Forward definition in this version only */
138
+ typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t; /* incomplete struct type: only pointers to it can be used (see the tag parameter of cudnnQueryRuntimeError) */
139
+
140
+ typedef enum { /* query mode passed as the mode parameter of cudnnQueryRuntimeError() below */
141
+ CUDNN_ERRQUERY_RAWCODE = 0,
142
+ CUDNN_ERRQUERY_NONBLOCKING = 1,
143
+ CUDNN_ERRQUERY_BLOCKING = 2,
144
+ } cudnnErrQueryMode_t;
145
+
146
+ cudnnStatus_t CUDNNWINAPI /* two status values are involved: the function's own return value and the one written through rstatus */
147
+ cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag); /* rstatus is a non-const pointer, presumably an output; tag points to an incomplete type - whether NULL is accepted is not stated here (confirm) */
148
+
149
+ #ifndef __LIBRARY_TYPES_H__ /* define the enum below only if this macro is not already defined - presumably the include guard of CUDA's library_types.h, which would otherwise cause a redefinition (confirm) */
150
+
151
+ typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType; /* implicit values 0, 1, 2; used as the type parameter of cudnnGetProperty() */
152
+
153
+ #endif
154
+
155
+ cudnnStatus_t CUDNNWINAPI /* selects a property with a libraryPropertyType (MAJOR_VERSION / MINOR_VERSION / PATCH_LEVEL) */
156
+ cudnnGetProperty(libraryPropertyType type, int *value); /* value is a non-const int pointer, presumably the output */
157
+
158
+ cudnnStatus_t CUDNNWINAPI
159
+ cudnnCreate(cudnnHandle_t *handle); /* handle is passed by pointer - presumably receives the new handle (confirm) */
160
+ cudnnStatus_t CUDNNWINAPI
161
+ cudnnDestroy(cudnnHandle_t handle); /* handle is passed by value */
162
+ cudnnStatus_t CUDNNWINAPI
163
+ cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); /* cudaStream_t is not declared in this header; it presumably comes from the cuda_runtime.h include at the top (confirm) */
164
+ cudnnStatus_t CUDNNWINAPI
165
+ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); /* streamId is a non-const pointer, presumably the output */
166
+
167
+ /* Data structures to represent Image/Filter and the Neural Network Layer */
168
+ typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t; /* each typedef in this group names a pointer to a struct tag that is only mentioned, not defined, in the visible part of this header, so the descriptors are opaque here */
169
+ typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t;
170
+ typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t;
171
+ typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
172
+ typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t;
173
+ typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
174
+ typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t;
175
+ typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t;
176
+ typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
177
+ typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t; /* because these are pointer typedefs, "const cudnnXxxDescriptor_t p" makes the pointer const, not the object it points to */
178
+ /*
179
+ * CUDNN data type
180
+ */
181
+ typedef enum { /* element data types; every enumerator has an explicit value, 0 through 14, with no count sentinel */
182
+ CUDNN_DATA_FLOAT = 0,
183
+ CUDNN_DATA_DOUBLE = 1,
184
+ CUDNN_DATA_HALF = 2,
185
+ CUDNN_DATA_INT8 = 3,
186
+ CUDNN_DATA_INT32 = 4,
187
+ CUDNN_DATA_INT8x4 = 5,
188
+ CUDNN_DATA_UINT8 = 6,
189
+ CUDNN_DATA_UINT8x4 = 7,
190
+ CUDNN_DATA_INT8x32 = 8,
191
+ CUDNN_DATA_BFLOAT16 = 9,
192
+ CUDNN_DATA_INT64 = 10,
193
+ CUDNN_DATA_BOOLEAN = 11,
194
+ CUDNN_DATA_FP8_E4M3 = 12,
195
+ CUDNN_DATA_FP8_E5M2 = 13,
196
+ CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
197
+ } cudnnDataType_t;
198
+
199
+ /*
200
+ * CUDNN math type
201
+ */
202
+ typedef enum { /* math-type selector; explicit values 0 through 3, no count sentinel */
203
+ CUDNN_DEFAULT_MATH = 0,
204
+ CUDNN_TENSOR_OP_MATH = 1,
205
+ CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
206
+ CUDNN_FMA_MATH = 3,
207
+ } cudnnMathType_t;
208
+
209
+ /*
210
+ * CUDNN NaN propagation
211
+ */
212
+ typedef enum { /* two-valued option; used e.g. as the opTensorNanOpt parameter of cudnnSetOpTensorDescriptor() */
213
+ CUDNN_NOT_PROPAGATE_NAN = 0,
214
+ CUDNN_PROPAGATE_NAN = 1,
215
+ } cudnnNanPropagation_t;
216
+
217
+ /*
218
+ * CUDNN Determinism
219
+ */
220
+ typedef enum { /* two-valued option; note that 0 is the NON-deterministic value, so a zero-initialized variable of this type reads as CUDNN_NON_DETERMINISTIC */
221
+ CUDNN_NON_DETERMINISTIC = 0,
222
+ CUDNN_DETERMINISTIC = 1,
223
+ } cudnnDeterminism_t;
224
+
225
+ /* Maximum supported number of tensor dimensions */
226
+ #define CUDNN_DIM_MAX 8
227
+
228
+ /* Create an instance of a generic Tensor descriptor */
229
+ cudnnStatus_t CUDNNWINAPI /* returns a cudnnStatus_t */
230
+ cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); /* tensorDesc is passed by pointer - presumably receives the new descriptor; cudnnDestroyTensorDescriptor() is declared further down */
231
+
232
+ typedef enum { /* tensor memory layouts; used as the format parameter of cudnnSetTensor4dDescriptor() and cudnnSetTensorNdDescriptorEx() */
233
+ CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
234
+ CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved (cStride = 1) */
235
+ CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is a vector of elements along C; the vector length is given by the data type */
236
+ } cudnnTensorFormat_t;
237
+
238
+ cudnnStatus_t CUDNNWINAPI /* 4d setter taking a layout enum (format) instead of explicit strides */
239
+ cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
240
+ cudnnTensorFormat_t format,
241
+ cudnnDataType_t dataType, /* image data type */
242
+ int n, /* number of inputs (batch size) */
243
+ int c, /* number of input feature maps */
244
+ int h, /* height of input section */
245
+ int w); /* width of input section */
246
+
247
+ cudnnStatus_t CUDNNWINAPI /* "Ex" variant: no format parameter; the caller passes the four strides explicitly */
248
+ cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
249
+ cudnnDataType_t dataType, /* image data type */
250
+ int n, /* number of inputs (batch size) */
251
+ int c, /* number of input feature maps */
252
+ int h, /* height of input section */
253
+ int w, /* width of input section */
254
+ int nStride, /* strides are plain ints; presumably counted in elements, not bytes (confirm) */
255
+ int cStride,
256
+ int hStride,
257
+ int wStride);
258
+
259
+ cudnnStatus_t CUDNNWINAPI /* getter mirroring the Ex setter: same fields, each through a non-const pointer */
260
+ cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, /* const applies to the handle (a pointer typedef), not to the descriptor object it points to */
261
+ cudnnDataType_t *dataType, /* image data type */
262
+ int *n, /* number of inputs (batch size) */
263
+ int *c, /* number of input feature maps */
264
+ int *h, /* height of input section */
265
+ int *w, /* width of input section */
266
+ int *nStride,
267
+ int *cStride,
268
+ int *hStride,
269
+ int *wStride);
270
+
271
+ cudnnStatus_t CUDNNWINAPI /* N-dimensional setter with explicit strides */
272
+ cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
273
+ cudnnDataType_t dataType,
274
+ int nbDims, /* presumably the number of entries read from dimA and strideA; CUDNN_DIM_MAX (8) is defined above as the maximum supported number of tensor dimensions (confirm limits) */
275
+ const int dimA[], /* array parameter: decays to const int *, so no length is carried by the type */
276
+ const int strideA[]);
277
+
278
+ cudnnStatus_t CUDNNWINAPI /* "Ex" variant: takes a layout enum (format) and no stride array */
279
+ cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
280
+ cudnnTensorFormat_t format,
281
+ cudnnDataType_t dataType,
282
+ int nbDims,
283
+ const int dimA[]);
284
+
285
+ cudnnStatus_t CUDNNWINAPI /* N-dimensional getter */
286
+ cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, /* const applies to the handle (a pointer typedef), not to the descriptor object */
287
+ int nbDimsRequested, /* presumably the capacity of dimA/strideA in entries (confirm) */
288
+ cudnnDataType_t *dataType,
289
+ int *nbDims, /* non-const pointer, presumably receives the descriptor's dimension count */
290
+ int dimA[], /* writable arrays (decay to int *) */
291
+ int strideA[]);
292
+
293
+ cudnnStatus_t CUDNNWINAPI
294
+ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size); /* size is a non-const size_t pointer, presumably the output; the unit is bytes per the function name */
295
+
296
+ /* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
297
+
298
+ 1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
299
+ input_stride : c x h x h_stride
300
+ feature_stride : h x h_stride
301
+ h_stride : >= w ( h_stride = w if no padding)
302
+ w_stride : 1
303
+
304
+
305
+ 2)Example of all images in row major with features maps interleaved
306
+ input_stride : c x h x h_stride
307
+ feature_stride : 1
308
+ h_stride : w x c
309
+ w_stride : c
310
+
311
+ 3)Example of all images in column major order one batch of features after the other (with optional padding on column)
312
+ input_stride : c x w x w_stride
313
+ feature_stride : w x w_stride
314
+ h_stride : 1
315
+ w_stride : >= h
316
+
317
+ */
318
+
319
+ /* Destroy an instance of Tensor4d descriptor */
320
+ cudnnStatus_t CUDNNWINAPI /* the parameter is the generic cudnnTensorDescriptor_t, the same type used by both the 4d and Nd setters above */
321
+ cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc); /* handle passed by value */
322
+
323
+ /* Fold/unfold transforms */
324
+ typedef enum { /* direction argument of cudnnSetTensorTransformDescriptor() / cudnnGetTensorTransformDescriptor() */
325
+ CUDNN_TRANSFORM_FOLD = 0U, /* values carry a U (unsigned) suffix, unlike the other enums in this header */
326
+ CUDNN_TRANSFORM_UNFOLD = 1U,
327
+ } cudnnFoldingDirection_t;
328
+
329
+ /** Create a destination descriptor for cudnnTransformTensor */
330
+ cudnnStatus_t CUDNNWINAPI
331
+ cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, /* const applies to the handle (a pointer typedef), not the pointed-to object */
332
+ const cudnnTensorDescriptor_t srcDesc,
333
+ cudnnTensorDescriptor_t destDesc, /* handle passed by value - presumably an already-created descriptor that gets filled in (confirm) */
334
+ size_t *destSizeInBytes); /* non-const size_t pointer, presumably the output */
335
+
336
+ /** Create an empty tensor transform descriptor */
337
+ cudnnStatus_t CUDNNWINAPI
338
+ cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
339
+
340
+ /** Initialize a previously created tensor transform descriptor. */
341
+ cudnnStatus_t CUDNNWINAPI
342
+ cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
343
+ const uint32_t nbDims, /* top-level const on by-value parameters does not affect callers */
344
+ const cudnnTensorFormat_t destFormat,
345
+ const int32_t padBeforeA[], /* pad arrays are signed (int32_t) ... */
346
+ const int32_t padAfterA[],
347
+ const uint32_t foldA[], /* ... while the fold array is unsigned (uint32_t); array lengths are presumably nbDims-related (confirm) */
348
+ const cudnnFoldingDirection_t direction);
349
+
350
+ /**
351
+ * Retrieves the values stored in a previously initialized tensor transform
352
+ * descriptor.
353
+ */
354
+ cudnnStatus_t CUDNNWINAPI
355
+ cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
356
+ uint32_t nbDimsRequested, /* presumably the capacity of the three output arrays (confirm) */
357
+ cudnnTensorFormat_t *destFormat,
358
+ int32_t padBeforeA[],
359
+ int32_t padAfterA[],
360
+ uint32_t foldA[],
361
+ cudnnFoldingDirection_t *direction);
362
+
363
+ /**
364
+ * Destroys a previously created tensor transform descriptor.
365
+ */
366
+ cudnnStatus_t CUDNNWINAPI
367
+ cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
368
+
369
+ /* Tensor layout conversion helper (y = alpha * x + beta * y) */
370
+ cudnnStatus_t CUDNNWINAPI
371
+ cudnnTransformTensor(cudnnHandle_t handle,
372
+ const void *alpha,
373
+ const cudnnTensorDescriptor_t xDesc,
374
+ const void *x,
375
+ const void *beta,
376
+ const cudnnTensorDescriptor_t yDesc,
377
+ void *y);
378
+
379
+ cudnnStatus_t CUDNNWINAPI
380
+ cudnnTransformTensorEx(cudnnHandle_t handle,
381
+ const cudnnTensorTransformDescriptor_t transDesc,
382
+ const void *alpha,
383
+ const cudnnTensorDescriptor_t srcDesc,
384
+ const void *srcData,
385
+ const void *beta,
386
+ const cudnnTensorDescriptor_t destDesc,
387
+ void *destData);
388
+
389
+ /* Tensor Bias addition : C = alpha * A + beta * C */
390
+ cudnnStatus_t CUDNNWINAPI
391
+ cudnnAddTensor(cudnnHandle_t handle,
392
+ const void *alpha,
393
+ const cudnnTensorDescriptor_t aDesc,
394
+ const void *A,
395
+ const void *beta,
396
+ const cudnnTensorDescriptor_t cDesc,
397
+ void *C);
398
+
399
+ /*
400
+ * CUDNN OpTensor op type
401
+ */
402
+ typedef enum {
403
+ CUDNN_OP_TENSOR_ADD = 0,
404
+ CUDNN_OP_TENSOR_MUL = 1,
405
+ CUDNN_OP_TENSOR_MIN = 2,
406
+ CUDNN_OP_TENSOR_MAX = 3,
407
+ CUDNN_OP_TENSOR_SQRT = 4,
408
+ CUDNN_OP_TENSOR_NOT = 5,
409
+ } cudnnOpTensorOp_t;
410
+
411
+ cudnnStatus_t CUDNNWINAPI
412
+ cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
413
+
414
+ cudnnStatus_t CUDNNWINAPI
415
+ cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
416
+ cudnnOpTensorOp_t opTensorOp,
417
+ cudnnDataType_t opTensorCompType,
418
+ cudnnNanPropagation_t opTensorNanOpt);
419
+
420
+ cudnnStatus_t CUDNNWINAPI
421
+ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
422
+ cudnnOpTensorOp_t *opTensorOp,
423
+ cudnnDataType_t *opTensorCompType,
424
+ cudnnNanPropagation_t *opTensorNanOpt);
425
+
426
+ cudnnStatus_t CUDNNWINAPI
427
+ cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
428
+
429
+ /* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
430
+ /* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
431
+ cudnnStatus_t CUDNNWINAPI
432
+ cudnnOpTensor(cudnnHandle_t handle,
433
+ const cudnnOpTensorDescriptor_t opTensorDesc,
434
+ const void *alpha1,
435
+ const cudnnTensorDescriptor_t aDesc,
436
+ const void *A,
437
+ const void *alpha2,
438
+ const cudnnTensorDescriptor_t bDesc,
439
+ const void *B,
440
+ const void *beta,
441
+ const cudnnTensorDescriptor_t cDesc,
442
+ void *C);
443
+
444
+ /*
445
+ * CUDNN ReduceTensor op type
446
+ */
447
+ typedef enum {
448
+ CUDNN_REDUCE_TENSOR_ADD = 0,
449
+ CUDNN_REDUCE_TENSOR_MUL = 1,
450
+ CUDNN_REDUCE_TENSOR_MIN = 2,
451
+ CUDNN_REDUCE_TENSOR_MAX = 3,
452
+ CUDNN_REDUCE_TENSOR_AMAX = 4,
453
+ CUDNN_REDUCE_TENSOR_AVG = 5,
454
+ CUDNN_REDUCE_TENSOR_NORM1 = 6,
455
+ CUDNN_REDUCE_TENSOR_NORM2 = 7,
456
+ CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
457
+ } cudnnReduceTensorOp_t;
458
+
459
+ /*
460
+ * CUDNN ReduceTensor indices type
461
+ */
462
+ typedef enum {
463
+ CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
464
+ CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
465
+ } cudnnReduceTensorIndices_t;
466
+
467
+ /*
468
+ * CUDNN tensor indices type size (all unsigned)
469
+ * Currently not supported, default is 32 bit unsigned.
470
+ */
471
+ typedef enum {
472
+ CUDNN_32BIT_INDICES = 0,
473
+ CUDNN_64BIT_INDICES = 1,
474
+ CUDNN_16BIT_INDICES = 2,
475
+ CUDNN_8BIT_INDICES = 3,
476
+ } cudnnIndicesType_t;
477
+
478
+ cudnnStatus_t CUDNNWINAPI
479
+ cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
480
+
481
+ cudnnStatus_t CUDNNWINAPI
482
+ cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
483
+ cudnnReduceTensorOp_t reduceTensorOp,
484
+ cudnnDataType_t reduceTensorCompType,
485
+ cudnnNanPropagation_t reduceTensorNanOpt,
486
+ cudnnReduceTensorIndices_t reduceTensorIndices,
487
+ cudnnIndicesType_t reduceTensorIndicesType);
488
+
489
+ cudnnStatus_t CUDNNWINAPI
490
+ cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
491
+ cudnnReduceTensorOp_t *reduceTensorOp,
492
+ cudnnDataType_t *reduceTensorCompType,
493
+ cudnnNanPropagation_t *reduceTensorNanOpt,
494
+ cudnnReduceTensorIndices_t *reduceTensorIndices,
495
+ cudnnIndicesType_t *reduceTensorIndicesType);
496
+
497
+ cudnnStatus_t CUDNNWINAPI
498
+ cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
499
+
500
+ /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
501
+ * output tensors */
502
+ cudnnStatus_t CUDNNWINAPI
503
+ cudnnGetReductionIndicesSize(cudnnHandle_t handle,
504
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
505
+ const cudnnTensorDescriptor_t aDesc,
506
+ const cudnnTensorDescriptor_t cDesc,
507
+ size_t *sizeInBytes);
508
+
509
+ /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
510
+ * tensors */
511
+ cudnnStatus_t CUDNNWINAPI
512
+ cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
513
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
514
+ const cudnnTensorDescriptor_t aDesc,
515
+ const cudnnTensorDescriptor_t cDesc,
516
+ size_t *sizeInBytes);
517
+
518
+ /* Tensor operation : C = reduce op( alpha * A ) + beta * C */
519
+ /* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
520
+ /* The indices space is ignored for reduce ops other than min or max. */
521
+ cudnnStatus_t CUDNNWINAPI
522
+ cudnnReduceTensor(cudnnHandle_t handle,
523
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
524
+ void *indices,
525
+ size_t indicesSizeInBytes,
526
+ void *workspace,
527
+ size_t workspaceSizeInBytes,
528
+ const void *alpha,
529
+ const cudnnTensorDescriptor_t aDesc,
530
+ const void *A,
531
+ const void *beta,
532
+ const cudnnTensorDescriptor_t cDesc,
533
+ void *C);
534
+
535
+ /* Set all values of a tensor to a given value : y[i] = value[0] */
536
+ cudnnStatus_t CUDNNWINAPI
537
+ cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
538
+
539
+ /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
540
+ cudnnStatus_t CUDNNWINAPI
541
+ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
542
+
543
+ /* Create an instance of FilterStruct */
544
+ cudnnStatus_t CUDNNWINAPI
545
+ cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
546
+
547
+ cudnnStatus_t CUDNNWINAPI
548
+ cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
549
+ cudnnDataType_t dataType, /* image data type */
550
+ cudnnTensorFormat_t format,
551
+ int k, /* number of output feature maps */
552
+ int c, /* number of input feature maps */
553
+ int h, /* height of each input filter */
554
+ int w); /* width of each input filter */
555
+
556
+ cudnnStatus_t CUDNNWINAPI
557
+ cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
558
+ cudnnDataType_t *dataType, /* image data type */
559
+ cudnnTensorFormat_t *format,
560
+ int *k, /* number of output feature maps */
561
+ int *c, /* number of input feature maps */
562
+ int *h, /* height of each input filter */
563
+ int *w); /* width of each input filter */
564
+
565
+ cudnnStatus_t CUDNNWINAPI
566
+ cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
567
+ cudnnDataType_t dataType, /* image data type */
568
+ cudnnTensorFormat_t format,
569
+ int nbDims,
570
+ const int filterDimA[]);
571
+
572
+ cudnnStatus_t CUDNNWINAPI
573
+ cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
574
+ int nbDimsRequested,
575
+ cudnnDataType_t *dataType, /* image data type */
576
+ cudnnTensorFormat_t *format,
577
+ int *nbDims,
578
+ int filterDimA[]);
579
+ cudnnStatus_t CUDNNWINAPI
580
+ cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
581
+
582
+ cudnnStatus_t CUDNNWINAPI
583
+ cudnnTransformFilter(cudnnHandle_t handle,
584
+ const cudnnTensorTransformDescriptor_t transDesc,
585
+ const void *alpha,
586
+ const cudnnFilterDescriptor_t srcDesc,
587
+ const void *srcData,
588
+ const void *beta,
589
+ const cudnnFilterDescriptor_t destDesc,
590
+ void *destData);
591
+
592
+ cudnnStatus_t CUDNNWINAPI
593
+ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
594
+
595
+ /*
596
+ * softmax algorithm
597
+ */
598
+ typedef enum {
599
+ CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
600
+ CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
601
+ CUDNN_SOFTMAX_LOG = 2
602
+ } cudnnSoftmaxAlgorithm_t;
603
+
604
+ typedef enum {
605
+ CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
606
+ CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
607
+ } cudnnSoftmaxMode_t;
608
+
609
+ /* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
610
+
611
+ /* Function to perform forward softmax */
612
+ cudnnStatus_t CUDNNWINAPI
613
+ cudnnSoftmaxForward(cudnnHandle_t handle,
614
+ cudnnSoftmaxAlgorithm_t algo,
615
+ cudnnSoftmaxMode_t mode,
616
+ const void *alpha,
617
+ const cudnnTensorDescriptor_t xDesc,
618
+ const void *x,
619
+ const void *beta,
620
+ const cudnnTensorDescriptor_t yDesc,
621
+ void *y);
622
+
623
+ /*
624
+ * pooling mode
625
+ */
626
+ typedef enum {
627
+ CUDNN_POOLING_MAX = 0,
628
+ CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
629
+ CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
630
+ CUDNN_POOLING_MAX_DETERMINISTIC = 3
631
+ } cudnnPoolingMode_t;
632
+
633
+ /* Create an instance of pooling descriptor */
634
+ cudnnStatus_t CUDNNWINAPI
635
+ cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
636
+
637
+ cudnnStatus_t CUDNNWINAPI
638
+ cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
639
+ cudnnPoolingMode_t mode,
640
+ cudnnNanPropagation_t maxpoolingNanOpt,
641
+ int windowHeight,
642
+ int windowWidth,
643
+ int verticalPadding,
644
+ int horizontalPadding,
645
+ int verticalStride,
646
+ int horizontalStride);
647
+
648
+ cudnnStatus_t CUDNNWINAPI
649
+ cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
650
+ cudnnPoolingMode_t *mode,
651
+ cudnnNanPropagation_t *maxpoolingNanOpt,
652
+ int *windowHeight,
653
+ int *windowWidth,
654
+ int *verticalPadding,
655
+ int *horizontalPadding,
656
+ int *verticalStride,
657
+ int *horizontalStride);
658
+
659
+ cudnnStatus_t CUDNNWINAPI
660
+ cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
661
+ const cudnnPoolingMode_t mode,
662
+ const cudnnNanPropagation_t maxpoolingNanOpt,
663
+ int nbDims,
664
+ const int windowDimA[],
665
+ const int paddingA[],
666
+ const int strideA[]);
667
+
668
+ cudnnStatus_t CUDNNWINAPI
669
+ cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
670
+ int nbDimsRequested,
671
+ cudnnPoolingMode_t *mode,
672
+ cudnnNanPropagation_t *maxpoolingNanOpt,
673
+ int *nbDims,
674
+ int windowDimA[],
675
+ int paddingA[],
676
+ int strideA[]);
677
+
678
+ cudnnStatus_t CUDNNWINAPI
679
+ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
680
+ const cudnnTensorDescriptor_t inputTensorDesc,
681
+ int nbDims,
682
+ int outputTensorDimA[]);
683
+
684
+ cudnnStatus_t CUDNNWINAPI
685
+ cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
686
+ const cudnnTensorDescriptor_t inputTensorDesc,
687
+ int *n,
688
+ int *c,
689
+ int *h,
690
+ int *w);
691
+
692
+ /* Destroy an instance of pooling descriptor */
693
+ cudnnStatus_t CUDNNWINAPI
694
+ cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
695
+
696
+ /* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
697
+
698
+ /* Function to perform forward pooling */
699
+ cudnnStatus_t CUDNNWINAPI
700
+ cudnnPoolingForward(cudnnHandle_t handle,
701
+ const cudnnPoolingDescriptor_t poolingDesc,
702
+ const void *alpha,
703
+ const cudnnTensorDescriptor_t xDesc,
704
+ const void *x,
705
+ const void *beta,
706
+ const cudnnTensorDescriptor_t yDesc,
707
+ void *y);
708
+
709
+ /*
710
+ * activation mode
711
+ */
712
+ typedef enum {
713
+ CUDNN_ACTIVATION_SIGMOID = 0,
714
+ CUDNN_ACTIVATION_RELU = 1,
715
+ CUDNN_ACTIVATION_TANH = 2,
716
+ CUDNN_ACTIVATION_CLIPPED_RELU = 3,
717
+ CUDNN_ACTIVATION_ELU = 4,
718
+ CUDNN_ACTIVATION_IDENTITY = 5,
719
+ CUDNN_ACTIVATION_SWISH = 6
720
+ } cudnnActivationMode_t;
721
+
722
+ /* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
723
+ cudnnStatus_t CUDNNWINAPI
724
+ cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
725
+
726
+ cudnnStatus_t CUDNNWINAPI
727
+ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
728
+ cudnnActivationMode_t mode,
729
+ cudnnNanPropagation_t reluNanOpt,
730
+ double coef); /* ceiling for clipped RELU, alpha for ELU */
731
+
732
+ cudnnStatus_t CUDNNWINAPI
733
+ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
734
+ cudnnActivationMode_t *mode,
735
+ cudnnNanPropagation_t *reluNanOpt,
736
+ double *coef); /* ceiling for clipped RELU, alpha for ELU */
737
+
738
+ cudnnStatus_t CUDNNWINAPI
739
+ cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
740
+
741
+ cudnnStatus_t CUDNNWINAPI
742
+ cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
743
+
744
+ cudnnStatus_t CUDNNWINAPI
745
+ cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
746
+
747
+ /* Function to perform forward activation */
748
+ cudnnStatus_t CUDNNWINAPI
749
+ cudnnActivationForward(cudnnHandle_t handle,
750
+ cudnnActivationDescriptor_t activationDesc,
751
+ const void *alpha,
752
+ const cudnnTensorDescriptor_t xDesc,
753
+ const void *x,
754
+ const void *beta,
755
+ const cudnnTensorDescriptor_t yDesc,
756
+ void *y);
757
+
758
+ /*
759
+ * Create an instance of LRN (Local Response Normalization) descriptor
760
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
761
+ */
762
+ cudnnStatus_t CUDNNWINAPI
763
+ cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
764
+
765
+ #define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
766
+ #define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
767
+ #define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
768
+ #define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
769
+
770
+ /* LRN layer mode */
771
+ typedef enum {
772
+ CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
773
+ } cudnnLRNMode_t;
774
+
775
+ /*
776
+ * Uses a window [center-lookBehind, center+lookAhead], where
777
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
778
+ * Values of double parameters cast to tensor data type.
779
+ */
780
+ cudnnStatus_t CUDNNWINAPI
781
+ cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
782
+ /*
783
+ * Retrieve the settings currently stored in an LRN layer descriptor
784
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
785
+ */
786
+ cudnnStatus_t CUDNNWINAPI
787
+ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
788
+
789
+ /* Destroy an instance of LRN descriptor */
790
+ cudnnStatus_t CUDNNWINAPI
791
+ cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
792
+
793
+ /* LRN functions: output = alpha * normalize(x) + beta * old_y */
794
+
795
+ /* LRN cross-channel forward computation. Double parameters cast to tensor data type */
796
+ cudnnStatus_t CUDNNWINAPI
797
+ cudnnLRNCrossChannelForward(cudnnHandle_t handle,
798
+ cudnnLRNDescriptor_t normDesc,
799
+ cudnnLRNMode_t lrnMode,
800
+ const void *alpha,
801
+ const cudnnTensorDescriptor_t xDesc,
802
+ const void *x,
803
+ const void *beta,
804
+ const cudnnTensorDescriptor_t yDesc,
805
+ void *y);
806
+
807
+ typedef enum {
808
+ CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
809
+ } cudnnDivNormMode_t;
810
+
811
+ /* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
812
+ cudnnStatus_t CUDNNWINAPI
813
+ cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
814
+ cudnnLRNDescriptor_t normDesc,
815
+ cudnnDivNormMode_t mode,
816
+ const void *alpha,
817
+ const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
818
+ const void *x,
819
+ const void *means, /* if NULL, means are assumed to be zero */
820
+ void *temp,
821
+ void *temp2,
822
+ const void *beta,
823
+ const cudnnTensorDescriptor_t yDesc,
824
+ void *y);
825
+
826
+ typedef enum {
827
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
828
+ CUDNN_BATCHNORM_PER_ACTIVATION = 0,
829
+
830
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
831
+ CUDNN_BATCHNORM_SPATIAL = 1,
832
+
833
+ /*
834
+ * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
835
+ * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
836
+ */
837
+ CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
838
+ } cudnnBatchNormMode_t;
839
+
840
+ #define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
841
+
842
+ /*
843
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
844
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
845
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
846
+ */
847
+ cudnnStatus_t CUDNNWINAPI
848
+ cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
849
+ const cudnnTensorDescriptor_t xDesc,
850
+ cudnnBatchNormMode_t mode);
851
+
852
+ typedef enum {
853
+ CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
854
+ CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
855
+ CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
856
+ } cudnnBatchNormOps_t;
857
+
858
+ /*
859
+ * Performs Batch Normalization during Inference:
860
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
861
+ * with bnScale, bnBias, estimatedMean, estimatedVariance tensors indexed
862
+ * according to spatial or per-activation mode. Refer to the documentation of
863
+ * cudnnBatchNormalizationForwardTraining for notes on function arguments.
864
+ */
865
+ cudnnStatus_t CUDNNWINAPI
866
+ cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
867
+ cudnnBatchNormMode_t mode,
868
+ const void *alpha, /* alpha[0] = result blend factor */
869
+ const void *beta, /* beta[0] = dest layer blend factor */
870
+ const cudnnTensorDescriptor_t xDesc,
871
+ const void *x, /* NxCxHxW */
872
+ const cudnnTensorDescriptor_t yDesc,
873
+ void *y, /* NxCxHxW */
874
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
875
+ const void *bnScale,
876
+ const void *bnBias,
877
+ const void *estimatedMean,
878
+ const void *estimatedVariance,
879
+ double epsilon);
880
+
881
+ typedef enum {
882
+ /* normScale, normBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
883
+ CUDNN_NORM_PER_ACTIVATION = 0,
884
+
885
+ /* normScale, normBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
886
+ CUDNN_NORM_PER_CHANNEL = 1,
887
+ } cudnnNormMode_t;
888
+
889
+ typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t;
890
+
891
+ /*
892
+ * Derives a tensor descriptor from layer data descriptor for Normalization
893
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
894
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
895
+ */
896
+ cudnnStatus_t CUDNNWINAPI
897
+ cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
898
+ cudnnTensorDescriptor_t derivedNormMeanVarDesc,
899
+ const cudnnTensorDescriptor_t xDesc,
900
+ cudnnNormMode_t mode,
901
+ int groupCnt); /* Placeholder for future work; should be set to 1 for now */
902
+
903
+ typedef enum {
904
+ CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
905
+ CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
906
+ CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
907
+ } cudnnNormOps_t;
908
+
909
+ /*
910
+ * Performs Normalization during Inference:
911
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
912
+ * with normScale, normBias, estimatedMean, estimatedVariance tensors indexed
913
+ * according to per-channel or per-activation mode. Refer to the documentation of
914
+ * cudnnNormalizationForwardTraining for notes on function arguments.
915
+ */
916
+ cudnnStatus_t CUDNNWINAPI
917
+ cudnnNormalizationForwardInference(cudnnHandle_t handle,
918
+ cudnnNormMode_t mode,
919
+ cudnnNormOps_t normOps,
920
+ cudnnNormAlgo_t algo,
921
+ const void *alpha, /* alpha[0] = result blend factor */
922
+ const void *beta, /* beta[0] = dest layer blend factor */
923
+ const cudnnTensorDescriptor_t xDesc,
924
+ const void *x, /* NxCxHxW */
925
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
926
+ const void *normScale,
927
+ const void *normBias,
928
+ const cudnnTensorDescriptor_t normMeanVarDesc,
929
+ const void *estimatedMean,
930
+ const void *estimatedVariance,
931
+ const cudnnTensorDescriptor_t zDesc,
932
+ const void *z,
933
+ cudnnActivationDescriptor_t activationDesc,
934
+ const cudnnTensorDescriptor_t yDesc,
935
+ void *y, /* NxCxHxW */
936
+ double epsilon,
937
+ int groupCnt); /* Placeholder for future work */
938
+
939
+ /* APIs for spatial transformer network*/
940
+ typedef enum {
941
+ CUDNN_SAMPLER_BILINEAR = 0,
942
+ } cudnnSamplerType_t;
943
+
944
+ cudnnStatus_t CUDNNWINAPI
945
+ cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
946
+
947
+ cudnnStatus_t CUDNNWINAPI
948
+ cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
949
+ cudnnSamplerType_t samplerType,
950
+ cudnnDataType_t dataType,
951
+ const int nbDims,
952
+ const int dimA[]);
953
+
954
+ cudnnStatus_t CUDNNWINAPI
955
+ cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
956
+
957
+ cudnnStatus_t CUDNNWINAPI
958
+ cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
959
+ const cudnnSpatialTransformerDescriptor_t stDesc,
960
+ const void *theta,
961
+ void *grid);
962
+
963
+ cudnnStatus_t CUDNNWINAPI
964
+ cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
965
+ cudnnSpatialTransformerDescriptor_t stDesc,
966
+ const void *alpha,
967
+ const cudnnTensorDescriptor_t xDesc,
968
+ const void *x,
969
+ const void *grid,
970
+ const void *beta,
971
+ cudnnTensorDescriptor_t yDesc,
972
+ void *y);
973
+
974
+ typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
975
+
976
+ cudnnStatus_t CUDNNWINAPI
977
+ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
978
+
979
+ cudnnStatus_t CUDNNWINAPI
980
+ cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
981
+
982
+ /*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
983
+ cudnnStatus_t CUDNNWINAPI
984
+ cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
985
+
986
+ /*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
987
+ cudnnStatus_t CUDNNWINAPI
988
+ cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
989
+
990
+ cudnnStatus_t CUDNNWINAPI
991
+ cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
992
+ cudnnHandle_t handle,
993
+ float dropout,
994
+ void *states,
995
+ size_t stateSizeInBytes,
996
+ unsigned long long seed);
997
+
998
+ /* Restores the dropout descriptor to a previously saved-off state */
999
+ cudnnStatus_t CUDNNWINAPI
1000
+ cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
1001
+ cudnnHandle_t handle,
1002
+ float dropout,
1003
+ void *states,
1004
+ size_t stateSizeInBytes,
1005
+ unsigned long long seed);
1006
+
1007
+ cudnnStatus_t CUDNNWINAPI
1008
+ cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
1009
+ cudnnHandle_t handle,
1010
+ float *dropout,
1011
+ void **states,
1012
+ unsigned long long *seed);
1013
+
1014
+ cudnnStatus_t CUDNNWINAPI
1015
+ cudnnDropoutForward(cudnnHandle_t handle,
1016
+ const cudnnDropoutDescriptor_t dropoutDesc,
1017
+ const cudnnTensorDescriptor_t xdesc,
1018
+ const void *x,
1019
+ const cudnnTensorDescriptor_t ydesc,
1020
+ void *y,
1021
+ void *reserveSpace,
1022
+ size_t reserveSpaceSizeInBytes);
1023
+
1024
+ /* TODO: remove */
1025
+
1026
+ typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t;
1027
+ typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t;
1028
+
1029
+ /* TODO: move these enums out to the appropriate submodule */
1030
+ typedef enum {
1031
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
1032
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
1033
+ CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
1034
+ CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
1035
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
1036
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
1037
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
1038
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
1039
+ CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
1040
+ } cudnnConvolutionFwdAlgo_t;
1041
+
1042
+ typedef enum {
1043
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
1044
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
1045
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
1046
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
1047
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
1048
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
1049
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
1050
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
1051
+ } cudnnConvolutionBwdFilterAlgo_t;
1052
+
1053
+ typedef enum {
1054
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
1055
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
1056
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
1057
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
1058
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
1059
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
1060
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
1061
+ } cudnnConvolutionBwdDataAlgo_t;
1062
+
1063
+ typedef enum {
1064
+ CUDNN_RNN_ALGO_STANDARD = 0,
1065
+ CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
1066
+ CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
1067
+ CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
1068
+ CUDNN_RNN_ALGO_COUNT = 4,
1069
+ } cudnnRNNAlgo_t;
1070
+
1071
+ typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
1072
+
1073
+ /* TODO: remove */
1074
+ typedef struct cudnnAlgorithmUnionStruct {
1075
+ union Algorithm {
1076
+ cudnnConvolutionFwdAlgo_t convFwdAlgo;
1077
+ cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo;
1078
+ cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo;
1079
+ cudnnRNNAlgo_t RNNAlgo;
1080
+ cudnnCTCLossAlgo_t CTCLossAlgo;
1081
+ } algo;
1082
+ } cudnnAlgorithm_t;
1083
+
1084
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1085
+ cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc);
1086
+
1087
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1088
+ cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm);
1089
+
1090
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1091
+ cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm);
1092
+
1093
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1094
+ cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest);
1095
+
1096
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1097
+ cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc);
1098
+
1099
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1100
+ cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate);
1101
+
1102
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1103
+ cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
1104
+ cudnnAlgorithmDescriptor_t algoDesc,
1105
+ cudnnStatus_t status,
1106
+ float time,
1107
+ size_t memory);
1108
+
1109
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1110
+ cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
1111
+ cudnnAlgorithmDescriptor_t *algoDesc,
1112
+ cudnnStatus_t *status,
1113
+ float *time,
1114
+ size_t *memory);
1115
+
1116
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1117
+ cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy);
1118
+
1119
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1120
+ cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes);
1121
+
1122
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1123
+ cudnnSaveAlgorithm(cudnnHandle_t handle,
1124
+ cudnnAlgorithmDescriptor_t algoDesc,
1125
+ void *algoSpace,
1126
+ size_t algoSpaceSizeInBytes);
1127
+
1128
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1129
+ cudnnRestoreAlgorithm(cudnnHandle_t handle,
1130
+ void *algoSpace,
1131
+ size_t algoSpaceSizeInBytes,
1132
+ cudnnAlgorithmDescriptor_t algoDesc);
1133
+
1134
+ typedef enum {
1135
+ CUDNN_SEV_FATAL = 0,
1136
+ CUDNN_SEV_ERROR = 1,
1137
+ CUDNN_SEV_WARNING = 2,
1138
+ CUDNN_SEV_INFO = 3,
1139
+ } cudnnSeverity_t;
1140
+
1141
+ /* Message masks to be used with cudnnSetCallback() */
1142
+ #define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
1143
+ #define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
1144
+ #define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
1145
+
1146
+ /* struct containing useful information for each API call */
1147
+ typedef struct cudnnDebugStruct {
1148
+ unsigned cudnn_version;
1149
+ cudnnStatus_t cudnnStatus;
1150
+ unsigned time_sec; /* epoch time in seconds */
1151
+ unsigned time_usec; /* microseconds part of epoch time */
1152
+ unsigned time_delta; /* time since start in seconds */
1153
+ cudnnHandle_t handle; /* cudnn handle */
1154
+ cudaStream_t stream; /* cuda stream ID */
1155
+ unsigned long long pid; /* process ID */
1156
+ unsigned long long tid; /* thread ID */
1157
+ int cudaDeviceId; /* CUDA device ID */
1158
+ int reserved[15]; /* reserved for future use */
1159
+ } cudnnDebug_t;
1160
+
1161
+ typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
1162
+
1163
+ cudnnStatus_t CUDNNWINAPI
1164
+ cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
1165
+
1166
+ cudnnStatus_t CUDNNWINAPI
1167
+ cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
1168
+
1169
+ /*
1170
+ * \brief Cross-library version checker.
1171
+ * This function is implemented differently in each sub-library. Each sublib
1172
+ * checks whether its own version matches that of its dependencies.
1173
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
1174
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
1175
+ */
1176
+ cudnnStatus_t CUDNNWINAPI
1177
+ cudnnOpsInferVersionCheck(void);
1178
+
1179
+ #if defined(__cplusplus)
1180
+ }
1181
+ #endif
1182
+
1183
+ #endif /* CUDNN_OPS_INFER_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h ADDED
@@ -0,0 +1,1183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops_infer : cuDNN's basic definitions and inference operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_INFER_H_)
55
+ #define CUDNN_OPS_INFER_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_OPS_INFER_MAJOR 8
64
+ #define CUDNN_OPS_INFER_MINOR 7
65
+ #define CUDNN_OPS_INFER_PATCH 0
66
+
67
+ #if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \
68
+ (CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL)
69
+ #error Version mismatch in cuDNN OPS INFER!!!
70
+ #endif
71
+
72
+ #ifndef CUDNNWINAPI
73
+ #ifdef _WIN32
74
+ #define CUDNNWINAPI __stdcall
75
+ #else
76
+ #define CUDNNWINAPI
77
+ #endif
78
+ #endif
79
+
80
+ /* Warnings for deprecated APIs are enabled using the CUDNN_WARN_DEPRECATED macro */
81
+ #if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
82
+ /* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
83
+ #define CUDNN_DEPRECATED __attribute__((deprecated))
84
+ #elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
85
+ /* Microsoft Visual C++ */
86
+ #define CUDNN_DEPRECATED __declspec(deprecated)
87
+ #elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
88
+ /* C++14 compilers */
89
+ #define CUDNN_DEPRECATED [[deprecated]]
90
+ #else
91
+ /* No support for the deprecated attribute */
92
+ #define CUDNN_DEPRECATED
93
+ #endif
94
+
95
+ #if defined(__cplusplus)
96
+ extern "C" {
97
+ #endif
98
+
99
+ struct cudnnContext;
100
+ typedef struct cudnnContext *cudnnHandle_t;
101
+
102
+ size_t CUDNNWINAPI
103
+ cudnnGetVersion(void);
104
+
105
+ size_t CUDNNWINAPI
106
+ cudnnGetMaxDeviceVersion(void);
107
+
108
+ /* Returns CUDA Runtime version statically linked against cudnn */
109
+ size_t CUDNNWINAPI
110
+ cudnnGetCudartVersion(void);
111
+
112
+ /*
113
+ * CUDNN return codes
114
+ */
115
+ typedef enum {
116
+ CUDNN_STATUS_SUCCESS = 0,
117
+ CUDNN_STATUS_NOT_INITIALIZED = 1,
118
+ CUDNN_STATUS_ALLOC_FAILED = 2,
119
+ CUDNN_STATUS_BAD_PARAM = 3,
120
+ CUDNN_STATUS_INTERNAL_ERROR = 4,
121
+ CUDNN_STATUS_INVALID_VALUE = 5,
122
+ CUDNN_STATUS_ARCH_MISMATCH = 6,
123
+ CUDNN_STATUS_MAPPING_ERROR = 7,
124
+ CUDNN_STATUS_EXECUTION_FAILED = 8,
125
+ CUDNN_STATUS_NOT_SUPPORTED = 9,
126
+ CUDNN_STATUS_LICENSE_ERROR = 10,
127
+ CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11,
128
+ CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12,
129
+ CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13,
130
+ CUDNN_STATUS_VERSION_MISMATCH = 14,
131
+ } cudnnStatus_t;
132
+
133
+ /* human-readable error messages */
134
+ const char *CUDNNWINAPI
135
+ cudnnGetErrorString(cudnnStatus_t status);
136
+
137
+ /* Forward declaration in this version only */
138
+ typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t;
139
+
140
+ typedef enum {
141
+ CUDNN_ERRQUERY_RAWCODE = 0,
142
+ CUDNN_ERRQUERY_NONBLOCKING = 1,
143
+ CUDNN_ERRQUERY_BLOCKING = 2,
144
+ } cudnnErrQueryMode_t;
145
+
146
+ cudnnStatus_t CUDNNWINAPI
147
+ cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
148
+
149
+ #ifndef __LIBRARY_TYPES_H__
150
+
151
+ typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType;
152
+
153
+ #endif
154
+
155
+ cudnnStatus_t CUDNNWINAPI
156
+ cudnnGetProperty(libraryPropertyType type, int *value);
157
+
158
+ cudnnStatus_t CUDNNWINAPI
159
+ cudnnCreate(cudnnHandle_t *handle);
160
+ cudnnStatus_t CUDNNWINAPI
161
+ cudnnDestroy(cudnnHandle_t handle);
162
+ cudnnStatus_t CUDNNWINAPI
163
+ cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
164
+ cudnnStatus_t CUDNNWINAPI
165
+ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
166
+
167
+ /* Data structures to represent Image/Filter and the Neural Network Layer */
168
+ typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
169
+ typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t;
170
+ typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t;
171
+ typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
172
+ typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t;
173
+ typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
174
+ typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t;
175
+ typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t;
176
+ typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
177
+ typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t;
178
+ /*
179
+ * CUDNN data type
180
+ */
181
+ typedef enum {
182
+ CUDNN_DATA_FLOAT = 0,
183
+ CUDNN_DATA_DOUBLE = 1,
184
+ CUDNN_DATA_HALF = 2,
185
+ CUDNN_DATA_INT8 = 3,
186
+ CUDNN_DATA_INT32 = 4,
187
+ CUDNN_DATA_INT8x4 = 5,
188
+ CUDNN_DATA_UINT8 = 6,
189
+ CUDNN_DATA_UINT8x4 = 7,
190
+ CUDNN_DATA_INT8x32 = 8,
191
+ CUDNN_DATA_BFLOAT16 = 9,
192
+ CUDNN_DATA_INT64 = 10,
193
+ CUDNN_DATA_BOOLEAN = 11,
194
+ CUDNN_DATA_FP8_E4M3 = 12,
195
+ CUDNN_DATA_FP8_E5M2 = 13,
196
+ CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
197
+ } cudnnDataType_t;
198
+
199
+ /*
200
+ * CUDNN math type
201
+ */
202
+ typedef enum {
203
+ CUDNN_DEFAULT_MATH = 0,
204
+ CUDNN_TENSOR_OP_MATH = 1,
205
+ CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
206
+ CUDNN_FMA_MATH = 3,
207
+ } cudnnMathType_t;
208
+
209
+ /*
210
+ * CUDNN NaN propagation
211
+ */
212
+ typedef enum {
213
+ CUDNN_NOT_PROPAGATE_NAN = 0,
214
+ CUDNN_PROPAGATE_NAN = 1,
215
+ } cudnnNanPropagation_t;
216
+
217
+ /*
218
+ * CUDNN Determinism
219
+ */
220
+ typedef enum {
221
+ CUDNN_NON_DETERMINISTIC = 0,
222
+ CUDNN_DETERMINISTIC = 1,
223
+ } cudnnDeterminism_t;
224
+
225
+ /* Maximum supported number of tensor dimensions */
226
+ #define CUDNN_DIM_MAX 8
227
+
228
+ /* Create an instance of a generic Tensor descriptor */
229
+ cudnnStatus_t CUDNNWINAPI
230
+ cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
231
+
232
+ typedef enum {
233
+ CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
234
+ CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
235
+ CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
236
+ } cudnnTensorFormat_t;
237
+
238
+ cudnnStatus_t CUDNNWINAPI
239
+ cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
240
+ cudnnTensorFormat_t format,
241
+ cudnnDataType_t dataType, /* image data type */
242
+ int n, /* number of inputs (batch size) */
243
+ int c, /* number of input feature maps */
244
+ int h, /* height of input section */
245
+ int w); /* width of input section */
246
+
247
+ cudnnStatus_t CUDNNWINAPI
248
+ cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
249
+ cudnnDataType_t dataType, /* image data type */
250
+ int n, /* number of inputs (batch size) */
251
+ int c, /* number of input feature maps */
252
+ int h, /* height of input section */
253
+ int w, /* width of input section */
254
+ int nStride,
255
+ int cStride,
256
+ int hStride,
257
+ int wStride);
258
+
259
+ cudnnStatus_t CUDNNWINAPI
260
+ cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
261
+ cudnnDataType_t *dataType, /* image data type */
262
+ int *n, /* number of inputs (batch size) */
263
+ int *c, /* number of input feature maps */
264
+ int *h, /* height of input section */
265
+ int *w, /* width of input section */
266
+ int *nStride,
267
+ int *cStride,
268
+ int *hStride,
269
+ int *wStride);
270
+
271
+ cudnnStatus_t CUDNNWINAPI
272
+ cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
273
+ cudnnDataType_t dataType,
274
+ int nbDims,
275
+ const int dimA[],
276
+ const int strideA[]);
277
+
278
+ cudnnStatus_t CUDNNWINAPI
279
+ cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
280
+ cudnnTensorFormat_t format,
281
+ cudnnDataType_t dataType,
282
+ int nbDims,
283
+ const int dimA[]);
284
+
285
+ cudnnStatus_t CUDNNWINAPI
286
+ cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
287
+ int nbDimsRequested,
288
+ cudnnDataType_t *dataType,
289
+ int *nbDims,
290
+ int dimA[],
291
+ int strideA[]);
292
+
293
+ cudnnStatus_t CUDNNWINAPI
294
+ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
295
+
296
+ /* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
297
+
298
+ 1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
299
+ input_stride : c x h x h_stride
300
+ feature_stride : h x h_stride
301
+ h_stride : >= w ( h_stride = w if no padding)
302
+ w_stride : 1
303
+
304
+
305
+ 2)Example of all images in row major with feature maps interleaved
306
+ input_stride : c x h x h_stride
307
+ feature_stride : 1
308
+ h_stride : w x c
309
+ w_stride : c
310
+
311
+ 3)Example of all images in column major order one batch of features after the other (with optional padding on column)
312
+ input_stride : c x w x w_stride
313
+ feature_stride : w x w_stride
314
+ h_stride : 1
315
+ w_stride : >= h
316
+
317
+ */
318
+
319
+ /* Destroy an instance of Tensor4d descriptor */
320
+ cudnnStatus_t CUDNNWINAPI
321
+ cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
322
+
323
+ /* Fold/unfold transforms */
324
+ typedef enum {
325
+ CUDNN_TRANSFORM_FOLD = 0U,
326
+ CUDNN_TRANSFORM_UNFOLD = 1U,
327
+ } cudnnFoldingDirection_t;
328
+
329
+ /** Create a destination descriptor for cudnnTransformTensor */
330
+ cudnnStatus_t CUDNNWINAPI
331
+ cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
332
+ const cudnnTensorDescriptor_t srcDesc,
333
+ cudnnTensorDescriptor_t destDesc,
334
+ size_t *destSizeInBytes);
335
+
336
+ /** Create an empty tensor transform descriptor */
337
+ cudnnStatus_t CUDNNWINAPI
338
+ cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
339
+
340
+ /** Initialize a previously created tensor transform descriptor. */
341
+ cudnnStatus_t CUDNNWINAPI
342
+ cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
343
+ const uint32_t nbDims,
344
+ const cudnnTensorFormat_t destFormat,
345
+ const int32_t padBeforeA[],
346
+ const int32_t padAfterA[],
347
+ const uint32_t foldA[],
348
+ const cudnnFoldingDirection_t direction);
349
+
350
+ /**
351
+ * Retrieves the values stored in a previously initialized tensor transform
352
+ * descriptor.
353
+ */
354
+ cudnnStatus_t CUDNNWINAPI
355
+ cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
356
+ uint32_t nbDimsRequested,
357
+ cudnnTensorFormat_t *destFormat,
358
+ int32_t padBeforeA[],
359
+ int32_t padAfterA[],
360
+ uint32_t foldA[],
361
+ cudnnFoldingDirection_t *direction);
362
+
363
+ /**
364
+ * Destroys a previously created tensor transform descriptor.
365
+ */
366
+ cudnnStatus_t CUDNNWINAPI
367
+ cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
368
+
369
+ /* Tensor layout conversion helper (y = alpha * x + beta * y) */
370
+ cudnnStatus_t CUDNNWINAPI
371
+ cudnnTransformTensor(cudnnHandle_t handle,
372
+ const void *alpha,
373
+ const cudnnTensorDescriptor_t xDesc,
374
+ const void *x,
375
+ const void *beta,
376
+ const cudnnTensorDescriptor_t yDesc,
377
+ void *y);
378
+
379
+ cudnnStatus_t CUDNNWINAPI
380
+ cudnnTransformTensorEx(cudnnHandle_t handle,
381
+ const cudnnTensorTransformDescriptor_t transDesc,
382
+ const void *alpha,
383
+ const cudnnTensorDescriptor_t srcDesc,
384
+ const void *srcData,
385
+ const void *beta,
386
+ const cudnnTensorDescriptor_t destDesc,
387
+ void *destData);
388
+
389
+ /* Tensor Bias addition : C = alpha * A + beta * C */
390
+ cudnnStatus_t CUDNNWINAPI
391
+ cudnnAddTensor(cudnnHandle_t handle,
392
+ const void *alpha,
393
+ const cudnnTensorDescriptor_t aDesc,
394
+ const void *A,
395
+ const void *beta,
396
+ const cudnnTensorDescriptor_t cDesc,
397
+ void *C);
398
+
399
+ /*
400
+ * CUDNN OpTensor op type
401
+ */
402
+ typedef enum {
403
+ CUDNN_OP_TENSOR_ADD = 0,
404
+ CUDNN_OP_TENSOR_MUL = 1,
405
+ CUDNN_OP_TENSOR_MIN = 2,
406
+ CUDNN_OP_TENSOR_MAX = 3,
407
+ CUDNN_OP_TENSOR_SQRT = 4,
408
+ CUDNN_OP_TENSOR_NOT = 5,
409
+ } cudnnOpTensorOp_t;
410
+
411
+ cudnnStatus_t CUDNNWINAPI
412
+ cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
413
+
414
+ cudnnStatus_t CUDNNWINAPI
415
+ cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
416
+ cudnnOpTensorOp_t opTensorOp,
417
+ cudnnDataType_t opTensorCompType,
418
+ cudnnNanPropagation_t opTensorNanOpt);
419
+
420
+ cudnnStatus_t CUDNNWINAPI
421
+ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
422
+ cudnnOpTensorOp_t *opTensorOp,
423
+ cudnnDataType_t *opTensorCompType,
424
+ cudnnNanPropagation_t *opTensorNanOpt);
425
+
426
+ cudnnStatus_t CUDNNWINAPI
427
+ cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
428
+
429
+ /* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
430
+ /* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
431
+ cudnnStatus_t CUDNNWINAPI
432
+ cudnnOpTensor(cudnnHandle_t handle,
433
+ const cudnnOpTensorDescriptor_t opTensorDesc,
434
+ const void *alpha1,
435
+ const cudnnTensorDescriptor_t aDesc,
436
+ const void *A,
437
+ const void *alpha2,
438
+ const cudnnTensorDescriptor_t bDesc,
439
+ const void *B,
440
+ const void *beta,
441
+ const cudnnTensorDescriptor_t cDesc,
442
+ void *C);
443
+
444
+ /*
445
+ * CUDNN ReduceTensor op type
446
+ */
447
+ typedef enum {
448
+ CUDNN_REDUCE_TENSOR_ADD = 0,
449
+ CUDNN_REDUCE_TENSOR_MUL = 1,
450
+ CUDNN_REDUCE_TENSOR_MIN = 2,
451
+ CUDNN_REDUCE_TENSOR_MAX = 3,
452
+ CUDNN_REDUCE_TENSOR_AMAX = 4,
453
+ CUDNN_REDUCE_TENSOR_AVG = 5,
454
+ CUDNN_REDUCE_TENSOR_NORM1 = 6,
455
+ CUDNN_REDUCE_TENSOR_NORM2 = 7,
456
+ CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
457
+ } cudnnReduceTensorOp_t;
458
+
459
+ /*
460
+ * CUDNN ReduceTensor indices type
461
+ */
462
+ typedef enum {
463
+ CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
464
+ CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
465
+ } cudnnReduceTensorIndices_t;
466
+
467
+ /*
468
+ * CUDNN tensor indices type size (all unsigned)
469
+ * Currently not supported, default is 32 bit unsigned.
470
+ */
471
+ typedef enum {
472
+ CUDNN_32BIT_INDICES = 0,
473
+ CUDNN_64BIT_INDICES = 1,
474
+ CUDNN_16BIT_INDICES = 2,
475
+ CUDNN_8BIT_INDICES = 3,
476
+ } cudnnIndicesType_t;
477
+
478
+ cudnnStatus_t CUDNNWINAPI
479
+ cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
480
+
481
+ cudnnStatus_t CUDNNWINAPI
482
+ cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
483
+ cudnnReduceTensorOp_t reduceTensorOp,
484
+ cudnnDataType_t reduceTensorCompType,
485
+ cudnnNanPropagation_t reduceTensorNanOpt,
486
+ cudnnReduceTensorIndices_t reduceTensorIndices,
487
+ cudnnIndicesType_t reduceTensorIndicesType);
488
+
489
+ cudnnStatus_t CUDNNWINAPI
490
+ cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
491
+ cudnnReduceTensorOp_t *reduceTensorOp,
492
+ cudnnDataType_t *reduceTensorCompType,
493
+ cudnnNanPropagation_t *reduceTensorNanOpt,
494
+ cudnnReduceTensorIndices_t *reduceTensorIndices,
495
+ cudnnIndicesType_t *reduceTensorIndicesType);
496
+
497
+ cudnnStatus_t CUDNNWINAPI
498
+ cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
499
+
500
+ /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
501
+ * output tensors */
502
+ cudnnStatus_t CUDNNWINAPI
503
+ cudnnGetReductionIndicesSize(cudnnHandle_t handle,
504
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
505
+ const cudnnTensorDescriptor_t aDesc,
506
+ const cudnnTensorDescriptor_t cDesc,
507
+ size_t *sizeInBytes);
508
+
509
+ /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
510
+ * tensors */
511
+ cudnnStatus_t CUDNNWINAPI
512
+ cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
513
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
514
+ const cudnnTensorDescriptor_t aDesc,
515
+ const cudnnTensorDescriptor_t cDesc,
516
+ size_t *sizeInBytes);
517
+
518
+ /* Tensor operation : C = reduce op( alpha * A ) + beta * C */
519
+ /* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
520
+ /* The indices space is ignored for reduce ops other than min or max. */
521
+ cudnnStatus_t CUDNNWINAPI
522
+ cudnnReduceTensor(cudnnHandle_t handle,
523
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
524
+ void *indices,
525
+ size_t indicesSizeInBytes,
526
+ void *workspace,
527
+ size_t workspaceSizeInBytes,
528
+ const void *alpha,
529
+ const cudnnTensorDescriptor_t aDesc,
530
+ const void *A,
531
+ const void *beta,
532
+ const cudnnTensorDescriptor_t cDesc,
533
+ void *C);
534
+
535
+ /* Set all values of a tensor to a given value : y[i] = value[0] */
536
+ cudnnStatus_t CUDNNWINAPI
537
+ cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
538
+
539
+ /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
540
+ cudnnStatus_t CUDNNWINAPI
541
+ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
542
+
543
+ /* Create an instance of FilterStruct */
544
+ cudnnStatus_t CUDNNWINAPI
545
+ cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
546
+
547
+ cudnnStatus_t CUDNNWINAPI
548
+ cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
549
+ cudnnDataType_t dataType, /* image data type */
550
+ cudnnTensorFormat_t format,
551
+ int k, /* number of output feature maps */
552
+ int c, /* number of input feature maps */
553
+ int h, /* height of each input filter */
554
+ int w); /* width of each input filter */
555
+
556
+ cudnnStatus_t CUDNNWINAPI
557
+ cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
558
+ cudnnDataType_t *dataType, /* image data type */
559
+ cudnnTensorFormat_t *format,
560
+ int *k, /* number of output feature maps */
561
+ int *c, /* number of input feature maps */
562
+ int *h, /* height of each input filter */
563
+ int *w); /* width of each input filter */
564
+
565
+ cudnnStatus_t CUDNNWINAPI
566
+ cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
567
+ cudnnDataType_t dataType, /* image data type */
568
+ cudnnTensorFormat_t format,
569
+ int nbDims,
570
+ const int filterDimA[]);
571
+
572
+ cudnnStatus_t CUDNNWINAPI
573
+ cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
574
+ int nbDimsRequested,
575
+ cudnnDataType_t *dataType, /* image data type */
576
+ cudnnTensorFormat_t *format,
577
+ int *nbDims,
578
+ int filterDimA[]);
579
+ cudnnStatus_t CUDNNWINAPI
580
+ cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
581
+
582
+ cudnnStatus_t CUDNNWINAPI
583
+ cudnnTransformFilter(cudnnHandle_t handle,
584
+ const cudnnTensorTransformDescriptor_t transDesc,
585
+ const void *alpha,
586
+ const cudnnFilterDescriptor_t srcDesc,
587
+ const void *srcData,
588
+ const void *beta,
589
+ const cudnnFilterDescriptor_t destDesc,
590
+ void *destData);
591
+
592
+ cudnnStatus_t CUDNNWINAPI
593
+ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
594
+
595
+ /*
596
+ * softmax algorithm
597
+ */
598
+ typedef enum {
599
+ CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
600
+ CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
601
+ CUDNN_SOFTMAX_LOG = 2
602
+ } cudnnSoftmaxAlgorithm_t;
603
+
604
+ typedef enum {
605
+ CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
606
+ CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
607
+ } cudnnSoftmaxMode_t;
608
+
609
+ /* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
610
+
611
+ /* Function to perform forward softmax */
612
+ cudnnStatus_t CUDNNWINAPI
613
+ cudnnSoftmaxForward(cudnnHandle_t handle,
614
+ cudnnSoftmaxAlgorithm_t algo,
615
+ cudnnSoftmaxMode_t mode,
616
+ const void *alpha,
617
+ const cudnnTensorDescriptor_t xDesc,
618
+ const void *x,
619
+ const void *beta,
620
+ const cudnnTensorDescriptor_t yDesc,
621
+ void *y);
622
+
623
+ /*
624
+ * pooling mode
625
+ */
626
+ typedef enum {
627
+ CUDNN_POOLING_MAX = 0,
628
+ CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
629
+ CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
630
+ CUDNN_POOLING_MAX_DETERMINISTIC = 3
631
+ } cudnnPoolingMode_t;
632
+
633
+ /* Create an instance of pooling descriptor */
634
+ cudnnStatus_t CUDNNWINAPI
635
+ cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
636
+
637
+ cudnnStatus_t CUDNNWINAPI
638
+ cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
639
+ cudnnPoolingMode_t mode,
640
+ cudnnNanPropagation_t maxpoolingNanOpt,
641
+ int windowHeight,
642
+ int windowWidth,
643
+ int verticalPadding,
644
+ int horizontalPadding,
645
+ int verticalStride,
646
+ int horizontalStride);
647
+
648
+ cudnnStatus_t CUDNNWINAPI
649
+ cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
650
+ cudnnPoolingMode_t *mode,
651
+ cudnnNanPropagation_t *maxpoolingNanOpt,
652
+ int *windowHeight,
653
+ int *windowWidth,
654
+ int *verticalPadding,
655
+ int *horizontalPadding,
656
+ int *verticalStride,
657
+ int *horizontalStride);
658
+
659
+ cudnnStatus_t CUDNNWINAPI
660
+ cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
661
+ const cudnnPoolingMode_t mode,
662
+ const cudnnNanPropagation_t maxpoolingNanOpt,
663
+ int nbDims,
664
+ const int windowDimA[],
665
+ const int paddingA[],
666
+ const int strideA[]);
667
+
668
+ cudnnStatus_t CUDNNWINAPI
669
+ cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
670
+ int nbDimsRequested,
671
+ cudnnPoolingMode_t *mode,
672
+ cudnnNanPropagation_t *maxpoolingNanOpt,
673
+ int *nbDims,
674
+ int windowDimA[],
675
+ int paddingA[],
676
+ int strideA[]);
677
+
678
+ cudnnStatus_t CUDNNWINAPI
679
+ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
680
+ const cudnnTensorDescriptor_t inputTensorDesc,
681
+ int nbDims,
682
+ int outputTensorDimA[]);
683
+
684
+ cudnnStatus_t CUDNNWINAPI
685
+ cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
686
+ const cudnnTensorDescriptor_t inputTensorDesc,
687
+ int *n,
688
+ int *c,
689
+ int *h,
690
+ int *w);
691
+
692
+ /* Destroy an instance of pooling descriptor */
693
+ cudnnStatus_t CUDNNWINAPI
694
+ cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
695
+
696
+ /* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
697
+
698
+ /* Function to perform forward pooling */
699
+ cudnnStatus_t CUDNNWINAPI
700
+ cudnnPoolingForward(cudnnHandle_t handle,
701
+ const cudnnPoolingDescriptor_t poolingDesc,
702
+ const void *alpha,
703
+ const cudnnTensorDescriptor_t xDesc,
704
+ const void *x,
705
+ const void *beta,
706
+ const cudnnTensorDescriptor_t yDesc,
707
+ void *y);
708
+
709
+ /*
710
+ * activation mode
711
+ */
712
+ typedef enum {
713
+ CUDNN_ACTIVATION_SIGMOID = 0,
714
+ CUDNN_ACTIVATION_RELU = 1,
715
+ CUDNN_ACTIVATION_TANH = 2,
716
+ CUDNN_ACTIVATION_CLIPPED_RELU = 3,
717
+ CUDNN_ACTIVATION_ELU = 4,
718
+ CUDNN_ACTIVATION_IDENTITY = 5,
719
+ CUDNN_ACTIVATION_SWISH = 6
720
+ } cudnnActivationMode_t;
721
+
722
+ /* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
723
+ cudnnStatus_t CUDNNWINAPI
724
+ cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
725
+
726
+ cudnnStatus_t CUDNNWINAPI
727
+ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
728
+ cudnnActivationMode_t mode,
729
+ cudnnNanPropagation_t reluNanOpt,
730
+ double coef); /* ceiling for clipped RELU, alpha for ELU */
731
+
732
+ cudnnStatus_t CUDNNWINAPI
733
+ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
734
+ cudnnActivationMode_t *mode,
735
+ cudnnNanPropagation_t *reluNanOpt,
736
+ double *coef); /* ceiling for clipped RELU, alpha for ELU */
737
+
738
+ cudnnStatus_t CUDNNWINAPI
739
+ cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
740
+
741
+ cudnnStatus_t CUDNNWINAPI
742
+ cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
743
+
744
+ cudnnStatus_t CUDNNWINAPI
745
+ cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
746
+
747
+ /* Function to perform forward activation */
748
+ cudnnStatus_t CUDNNWINAPI
749
+ cudnnActivationForward(cudnnHandle_t handle,
750
+ cudnnActivationDescriptor_t activationDesc,
751
+ const void *alpha,
752
+ const cudnnTensorDescriptor_t xDesc,
753
+ const void *x,
754
+ const void *beta,
755
+ const cudnnTensorDescriptor_t yDesc,
756
+ void *y);
757
+
758
+ /*
759
+ * Create an instance of LRN (Local Response Normalization) descriptor
760
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
761
+ */
762
+ cudnnStatus_t CUDNNWINAPI
763
+ cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
764
+
765
+ #define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
766
+ #define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
767
+ #define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
768
+ #define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
769
+
770
+ /* LRN layer mode */
771
+ typedef enum {
772
+ CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
773
+ } cudnnLRNMode_t;
774
+
775
+ /*
776
+ * Uses a window [center-lookBehind, center+lookAhead], where
777
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
778
+ * Values of double parameters cast to tensor data type.
779
+ */
780
+ cudnnStatus_t CUDNNWINAPI
781
+ cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
782
+ /*
783
+ * Retrieve the settings currently stored in an LRN layer descriptor
784
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
785
+ */
786
+ cudnnStatus_t CUDNNWINAPI
787
+ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
788
+
789
+ /* Destroy an instance of LRN descriptor */
790
+ cudnnStatus_t CUDNNWINAPI
791
+ cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
792
+
793
+ /* LRN functions: output = alpha * normalize(x) + beta * old_y */
794
+
795
+ /* LRN cross-channel forward computation. Double parameters cast to tensor data type */
796
+ cudnnStatus_t CUDNNWINAPI
797
+ cudnnLRNCrossChannelForward(cudnnHandle_t handle,
798
+ cudnnLRNDescriptor_t normDesc,
799
+ cudnnLRNMode_t lrnMode,
800
+ const void *alpha,
801
+ const cudnnTensorDescriptor_t xDesc,
802
+ const void *x,
803
+ const void *beta,
804
+ const cudnnTensorDescriptor_t yDesc,
805
+ void *y);
806
+
807
+ typedef enum {
808
+ CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
809
+ } cudnnDivNormMode_t;
810
+
811
+ /* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
812
+ cudnnStatus_t CUDNNWINAPI
813
+ cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
814
+ cudnnLRNDescriptor_t normDesc,
815
+ cudnnDivNormMode_t mode,
816
+ const void *alpha,
817
+ const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
818
+ const void *x,
819
+ const void *means, /* if NULL, means are assumed to be zero */
820
+ void *temp,
821
+ void *temp2,
822
+ const void *beta,
823
+ const cudnnTensorDescriptor_t yDesc,
824
+ void *y);
825
+
826
+ typedef enum {
827
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
828
+ CUDNN_BATCHNORM_PER_ACTIVATION = 0,
829
+
830
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
831
+ CUDNN_BATCHNORM_SPATIAL = 1,
832
+
833
+ /*
834
+ * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
835
+ * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
836
+ */
837
+ CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
838
+ } cudnnBatchNormMode_t;
839
+
840
+ #define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
841
+
842
+ /*
843
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
844
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
845
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
846
+ */
847
+ cudnnStatus_t CUDNNWINAPI
848
+ cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
849
+ const cudnnTensorDescriptor_t xDesc,
850
+ cudnnBatchNormMode_t mode);
851
+
852
+ typedef enum {
853
+ CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
854
+ CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
855
+ CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
856
+ } cudnnBatchNormOps_t;
857
+
858
+ /*
859
+ * Performs Batch Normalization during Inference:
860
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
861
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
862
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
863
+ * (declared in cudnn_ops_train.h) for notes on function arguments.
864
+ */
865
+ cudnnStatus_t CUDNNWINAPI
866
+ cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
867
+ cudnnBatchNormMode_t mode,
868
+ const void *alpha, /* alpha[0] = result blend factor */
869
+ const void *beta, /* beta[0] = dest layer blend factor */
870
+ const cudnnTensorDescriptor_t xDesc,
871
+ const void *x, /* NxCxHxW */
872
+ const cudnnTensorDescriptor_t yDesc,
873
+ void *y, /* NxCxHxW */
874
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
875
+ const void *bnScale,
876
+ const void *bnBias,
877
+ const void *estimatedMean,
878
+ const void *estimatedVariance,
879
+ double epsilon);
880
+
881
+ typedef enum {
882
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
883
+ CUDNN_NORM_PER_ACTIVATION = 0,
884
+
885
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
886
+ CUDNN_NORM_PER_CHANNEL = 1,
887
+ } cudnnNormMode_t;
888
+
889
+ typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t;
890
+
891
+ /*
892
+ * Derives a tensor descriptor from layer data descriptor for Normalization
893
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
894
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
895
+ */
896
+ cudnnStatus_t CUDNNWINAPI
897
+ cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
898
+ cudnnTensorDescriptor_t derivedNormMeanVarDesc,
899
+ const cudnnTensorDescriptor_t xDesc,
900
+ cudnnNormMode_t mode,
901
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
902
+
903
+ typedef enum {
904
+ CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
905
+ CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
906
+ CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
907
+ } cudnnNormOps_t;
908
+
909
+ /*
910
+ * Performs Normalization during Inference:
911
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
912
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
913
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
914
+ * (declared in cudnn_ops_train.h) for notes on function arguments.
915
+ */
916
+ cudnnStatus_t CUDNNWINAPI
917
+ cudnnNormalizationForwardInference(cudnnHandle_t handle,
918
+ cudnnNormMode_t mode,
919
+ cudnnNormOps_t normOps,
920
+ cudnnNormAlgo_t algo,
921
+ const void *alpha, /* alpha[0] = result blend factor */
922
+ const void *beta, /* beta[0] = dest layer blend factor */
923
+ const cudnnTensorDescriptor_t xDesc,
924
+ const void *x, /* NxCxHxW */
925
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
926
+ const void *normScale,
927
+ const void *normBias,
928
+ const cudnnTensorDescriptor_t normMeanVarDesc,
929
+ const void *estimatedMean,
930
+ const void *estimatedVariance,
931
+ const cudnnTensorDescriptor_t zDesc,
932
+ const void *z,
933
+ cudnnActivationDescriptor_t activationDesc,
934
+ const cudnnTensorDescriptor_t yDesc,
935
+ void *y, /* NxCxHxW */
936
+ double epsilon,
937
+ int groupCnt); /* Place hold for future work*/
938
+
939
+ /* APIs for spatial transformer network*/
940
+ typedef enum {
941
+ CUDNN_SAMPLER_BILINEAR = 0,
942
+ } cudnnSamplerType_t;
943
+
944
+ cudnnStatus_t CUDNNWINAPI
945
+ cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
946
+
947
+ cudnnStatus_t CUDNNWINAPI
948
+ cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
949
+ cudnnSamplerType_t samplerType,
950
+ cudnnDataType_t dataType,
951
+ const int nbDims,
952
+ const int dimA[]);
953
+
954
+ cudnnStatus_t CUDNNWINAPI
955
+ cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
956
+
957
+ cudnnStatus_t CUDNNWINAPI
958
+ cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
959
+ const cudnnSpatialTransformerDescriptor_t stDesc,
960
+ const void *theta,
961
+ void *grid);
962
+
963
+ cudnnStatus_t CUDNNWINAPI
964
+ cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
965
+ cudnnSpatialTransformerDescriptor_t stDesc,
966
+ const void *alpha,
967
+ const cudnnTensorDescriptor_t xDesc,
968
+ const void *x,
969
+ const void *grid,
970
+ const void *beta,
971
+ cudnnTensorDescriptor_t yDesc,
972
+ void *y);
973
+
974
+ typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
975
+
976
+ cudnnStatus_t CUDNNWINAPI
977
+ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
978
+
979
+ cudnnStatus_t CUDNNWINAPI
980
+ cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
981
+
982
+ /*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
983
+ cudnnStatus_t CUDNNWINAPI
984
+ cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
985
+
986
+ /*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
987
+ cudnnStatus_t CUDNNWINAPI
988
+ cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
989
+
990
+ cudnnStatus_t CUDNNWINAPI
991
+ cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
992
+ cudnnHandle_t handle,
993
+ float dropout,
994
+ void *states,
995
+ size_t stateSizeInBytes,
996
+ unsigned long long seed);
997
+
998
+ /* Restores the dropout descriptor to a previously saved-off state */
999
+ cudnnStatus_t CUDNNWINAPI
1000
+ cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
1001
+ cudnnHandle_t handle,
1002
+ float dropout,
1003
+ void *states,
1004
+ size_t stateSizeInBytes,
1005
+ unsigned long long seed);
1006
+
1007
+ cudnnStatus_t CUDNNWINAPI
1008
+ cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
1009
+ cudnnHandle_t handle,
1010
+ float *dropout,
1011
+ void **states,
1012
+ unsigned long long *seed);
1013
+
1014
+ cudnnStatus_t CUDNNWINAPI
1015
+ cudnnDropoutForward(cudnnHandle_t handle,
1016
+ const cudnnDropoutDescriptor_t dropoutDesc,
1017
+ const cudnnTensorDescriptor_t xdesc,
1018
+ const void *x,
1019
+ const cudnnTensorDescriptor_t ydesc,
1020
+ void *y,
1021
+ void *reserveSpace,
1022
+ size_t reserveSpaceSizeInBytes);
1023
+
1024
+ /* TODO: remove */
1025
+
1026
+ typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t;
1027
+ typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t;
1028
+
1029
+ /* TODO: move these enums out to the appropriate submodule */
1030
+ typedef enum {
1031
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
1032
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
1033
+ CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
1034
+ CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
1035
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
1036
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
1037
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
1038
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
1039
+ CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
1040
+ } cudnnConvolutionFwdAlgo_t;
1041
+
1042
+ typedef enum {
1043
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
1044
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
1045
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
1046
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
1047
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
1048
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
1049
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
1050
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
1051
+ } cudnnConvolutionBwdFilterAlgo_t;
1052
+
1053
+ typedef enum {
1054
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
1055
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
1056
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
1057
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
1058
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
1059
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
1060
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
1061
+ } cudnnConvolutionBwdDataAlgo_t;
1062
+
1063
+ typedef enum {
1064
+ CUDNN_RNN_ALGO_STANDARD = 0,
1065
+ CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
1066
+ CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
1067
+ CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
1068
+ CUDNN_RNN_ALGO_COUNT = 4,
1069
+ } cudnnRNNAlgo_t;
1070
+
1071
+ typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
1072
+
1073
+ /* TODO: remove */
1074
+ typedef struct cudnnAlgorithmUnionStruct {
1075
+ union Algorithm {
1076
+ cudnnConvolutionFwdAlgo_t convFwdAlgo;
1077
+ cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo;
1078
+ cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo;
1079
+ cudnnRNNAlgo_t RNNAlgo;
1080
+ cudnnCTCLossAlgo_t CTCLossAlgo;
1081
+ } algo;
1082
+ } cudnnAlgorithm_t;
1083
+
1084
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1085
+ cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc);
1086
+
1087
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1088
+ cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm);
1089
+
1090
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1091
+ cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm);
1092
+
1093
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1094
+ cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest);
1095
+
1096
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1097
+ cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc);
1098
+
1099
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1100
+ cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate);
1101
+
1102
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1103
+ cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
1104
+ cudnnAlgorithmDescriptor_t algoDesc,
1105
+ cudnnStatus_t status,
1106
+ float time,
1107
+ size_t memory);
1108
+
1109
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1110
+ cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
1111
+ cudnnAlgorithmDescriptor_t *algoDesc,
1112
+ cudnnStatus_t *status,
1113
+ float *time,
1114
+ size_t *memory);
1115
+
1116
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1117
+ cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy);
1118
+
1119
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1120
+ cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes);
1121
+
1122
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1123
+ cudnnSaveAlgorithm(cudnnHandle_t handle,
1124
+ cudnnAlgorithmDescriptor_t algoDesc,
1125
+ void *algoSpace,
1126
+ size_t algoSpaceSizeInBytes);
1127
+
1128
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1129
+ cudnnRestoreAlgorithm(cudnnHandle_t handle,
1130
+ void *algoSpace,
1131
+ size_t algoSpaceSizeInBytes,
1132
+ cudnnAlgorithmDescriptor_t algoDesc);
1133
+
1134
+ typedef enum {
1135
+ CUDNN_SEV_FATAL = 0,
1136
+ CUDNN_SEV_ERROR = 1,
1137
+ CUDNN_SEV_WARNING = 2,
1138
+ CUDNN_SEV_INFO = 3,
1139
+ } cudnnSeverity_t;
1140
+
1141
+ /* Message masks to be used with cudnnSetCallback() */
1142
+ #define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
1143
+ #define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
1144
+ #define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
1145
+
1146
+ /* struct containing useful information for each API call */
1147
+ typedef struct cudnnDebugStruct {
1148
+ unsigned cudnn_version;
1149
+ cudnnStatus_t cudnnStatus;
1150
+ unsigned time_sec; /* epoch time in seconds */
1151
+ unsigned time_usec; /* microseconds part of epoch time */
1152
+ unsigned time_delta; /* time since start in seconds */
1153
+ cudnnHandle_t handle; /* cudnn handle */
1154
+ cudaStream_t stream; /* cuda stream ID */
1155
+ unsigned long long pid; /* process ID */
1156
+ unsigned long long tid; /* thread ID */
1157
+ int cudaDeviceId; /* CUDA device ID */
1158
+ int reserved[15]; /* reserved for future use */
1159
+ } cudnnDebug_t;
1160
+
1161
+ typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
1162
+
1163
+ cudnnStatus_t CUDNNWINAPI
1164
+ cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
1165
+
1166
+ cudnnStatus_t CUDNNWINAPI
1167
+ cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
1168
+
1169
+ /*
1170
+ * \brief Cross-library version checker.
1171
+ * This function is implemented differently in each sub-library. Each sublib
1172
+ * checks whether its own version matches that of its dependencies.
1173
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
1174
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
1175
+ */
1176
+ cudnnStatus_t CUDNNWINAPI
1177
+ cudnnOpsInferVersionCheck(void);
1178
+
1179
+ #if defined(__cplusplus)
1180
+ }
1181
+ #endif
1182
+
1183
+ #endif /* CUDNN_OPS_INFER_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops_train : cuDNN's basic training operations and algorithms.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_TRAIN_H_)
55
+ #define CUDNN_OPS_TRAIN_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_OPS_TRAIN_MAJOR 8
65
+ #define CUDNN_OPS_TRAIN_MINOR 7
66
+ #define CUDNN_OPS_TRAIN_PATCH 0
67
+
68
+ #if (CUDNN_OPS_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_TRAIN_MINOR != CUDNN_MINOR) || \
69
+ (CUDNN_OPS_TRAIN_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN OPS TRAIN!!!
71
+ #endif
72
+
73
+ #if defined(__cplusplus)
74
+ extern "C" {
75
+ #endif
76
+
77
+ /* Function to perform backward softmax */
78
+ cudnnStatus_t CUDNNWINAPI
79
+ cudnnSoftmaxBackward(cudnnHandle_t handle,
80
+ cudnnSoftmaxAlgorithm_t algo,
81
+ cudnnSoftmaxMode_t mode,
82
+ const void *alpha,
83
+ const cudnnTensorDescriptor_t yDesc,
84
+ const void *y,
85
+ const cudnnTensorDescriptor_t dyDesc,
86
+ const void *dy,
87
+ const void *beta,
88
+ const cudnnTensorDescriptor_t dxDesc,
89
+ void *dx);
90
+
91
+ /* Function to perform backward pooling */
92
+ cudnnStatus_t CUDNNWINAPI
93
+ cudnnPoolingBackward(cudnnHandle_t handle,
94
+ const cudnnPoolingDescriptor_t poolingDesc,
95
+ const void *alpha,
96
+ const cudnnTensorDescriptor_t yDesc,
97
+ const void *y,
98
+ const cudnnTensorDescriptor_t dyDesc,
99
+ const void *dy,
100
+ const cudnnTensorDescriptor_t xDesc,
101
+ const void *x,
102
+ const void *beta,
103
+ const cudnnTensorDescriptor_t dxDesc,
104
+ void *dx);
105
+
106
+ /* Function to perform backward activation */
107
+ cudnnStatus_t CUDNNWINAPI
108
+ cudnnActivationBackward(cudnnHandle_t handle,
109
+ cudnnActivationDescriptor_t activationDesc,
110
+ const void *alpha,
111
+ const cudnnTensorDescriptor_t yDesc,
112
+ const void *y,
113
+ const cudnnTensorDescriptor_t dyDesc,
114
+ const void *dy,
115
+ const cudnnTensorDescriptor_t xDesc,
116
+ const void *x,
117
+ const void *beta,
118
+ const cudnnTensorDescriptor_t dxDesc,
119
+ void *dx);
120
+
121
+ /* LRN cross-channel backward computation. Double parameters cast to tensor data type */
122
+ cudnnStatus_t CUDNNWINAPI
123
+ cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
124
+ cudnnLRNDescriptor_t normDesc,
125
+ cudnnLRNMode_t lrnMode,
126
+ const void *alpha,
127
+ const cudnnTensorDescriptor_t yDesc,
128
+ const void *y,
129
+ const cudnnTensorDescriptor_t dyDesc,
130
+ const void *dy,
131
+ const cudnnTensorDescriptor_t xDesc,
132
+ const void *x,
133
+ const void *beta,
134
+ const cudnnTensorDescriptor_t dxDesc,
135
+ void *dx);
136
+
137
+ cudnnStatus_t CUDNNWINAPI
138
+ cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
139
+ cudnnLRNDescriptor_t normDesc,
140
+ cudnnDivNormMode_t mode,
141
+ const void *alpha,
142
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
143
+ const void *x,
144
+ const void *means, /* if NULL, means are assumed to be zero */
145
+ const void *dy,
146
+ void *temp,
147
+ void *temp2,
148
+ const void *beta,
149
+ const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
150
+ void *dx, /* output x differential */
151
+ void *dMeans); /* output means differential, can be NULL */
152
+
153
+ cudnnStatus_t CUDNNWINAPI
154
+ cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
155
+ cudnnBatchNormMode_t mode,
156
+ cudnnBatchNormOps_t bnOps,
157
+ const cudnnTensorDescriptor_t xDesc,
158
+ const cudnnTensorDescriptor_t zDesc,
159
+ const cudnnTensorDescriptor_t yDesc,
160
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
161
+ const cudnnActivationDescriptor_t activationDesc,
162
+ size_t *sizeInBytes);
163
+
164
+ cudnnStatus_t CUDNNWINAPI
165
+ cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
166
+ cudnnBatchNormMode_t mode,
167
+ cudnnBatchNormOps_t bnOps,
168
+ const cudnnTensorDescriptor_t xDesc,
169
+ const cudnnTensorDescriptor_t yDesc,
170
+ const cudnnTensorDescriptor_t dyDesc,
171
+ const cudnnTensorDescriptor_t dzDesc,
172
+ const cudnnTensorDescriptor_t dxDesc,
173
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
174
+ const cudnnActivationDescriptor_t activationDesc,
175
+ size_t *sizeInBytes);
176
+
177
+ cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
179
+ cudnnBatchNormMode_t mode,
180
+ cudnnBatchNormOps_t bnOps,
181
+ const cudnnActivationDescriptor_t activationDesc,
182
+ const cudnnTensorDescriptor_t xDesc,
183
+ size_t *sizeInBytes);
184
+
185
+ /* Computes y = BN(x). Also accumulates moving averages of mean and variance */
186
+ cudnnStatus_t CUDNNWINAPI
187
+ cudnnBatchNormalizationForwardTraining(
188
+ cudnnHandle_t handle,
189
+ cudnnBatchNormMode_t mode,
190
+
191
+ const void *alpha, /* alpha[0] = result blend factor */
192
+ const void *beta, /* beta[0] = dest layer blend factor */
193
+
194
+ const cudnnTensorDescriptor_t xDesc,
195
+ const void *x, /* NxCxHxW */
196
+ const cudnnTensorDescriptor_t yDesc,
197
+ void *y, /* NxCxHxW */
198
+
199
+ /* Shared desc for the next 6 tensors in the argument list.
200
+ Data type to be set as follows:
201
+ type = (typeOf(x) == double) ? double : float
202
+ Dimensions for this descriptor depend on normalization mode
203
+ - Spatial Normalization : tensors are expected to have dims 1xCx1x1
204
+ (normalization is performed across NxHxW)
205
+ - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
206
+ (normalization is performed across N) */
207
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
208
+
209
+ /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
210
+ const void *bnScale,
211
+ const void *bnBias,
212
+
213
+ /* MUST use factor=1 in the very first call of a complete training cycle.
214
+ Use a factor=1/(1+n) at the (n+1)-th call to the function to get
215
+ Cumulative Moving Average (CMA) behavior
216
+ CMA[n] = (x[1]+...+x[n])/n
217
+ Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
218
+ ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
219
+ CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
220
+ double exponentialAverageFactor,
221
+
222
+ /* Used in Training phase only.
223
+ runningMean = newMean*factor + runningMean*(1-factor) */
224
+ void *resultRunningMean,
225
+ /* Output in training mode, input in inference. Is the moving average
226
+ of variance[x] (factor is applied in the same way as for runningMean) */
227
+ void *resultRunningVariance,
228
+
229
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
230
+ double epsilon,
231
+
232
+ /* Optionally save intermediate results from the forward pass here
233
+ - can be reused to speed up backward pass. NULL if unused */
234
+ void *resultSaveMean,
235
+ void *resultSaveInvVariance);
236
+
237
+ /* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and variance */
238
+ cudnnStatus_t CUDNNWINAPI
239
+ cudnnBatchNormalizationForwardTrainingEx(
240
+ cudnnHandle_t handle,
241
+ cudnnBatchNormMode_t mode,
242
+ cudnnBatchNormOps_t bnOps,
243
+
244
+ const void *alpha, /* alpha[0] = result blend factor */
245
+ const void *beta, /* beta[0] = dest layer blend factor */
246
+
247
+ const cudnnTensorDescriptor_t xDesc,
248
+ const void *xData,
249
+ const cudnnTensorDescriptor_t zDesc,
250
+ const void *zData,
251
+ const cudnnTensorDescriptor_t yDesc,
252
+ void *yData,
253
+
254
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
255
+ const void *bnScale,
256
+ const void *bnBias,
257
+
258
+ double exponentialAverageFactor,
259
+ void *resultRunningMean,
260
+ void *resultRunningVariance,
261
+
262
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
263
+ double epsilon,
264
+
265
+ /* Optionally save intermediate results from the forward pass here
266
+ - can be reused to speed up backward pass. NULL if unused */
267
+ void *resultSaveMean,
268
+ void *resultSaveInvVariance,
269
+
270
+ cudnnActivationDescriptor_t activationDesc,
271
+ void *workspace,
272
+ size_t workSpaceSizeInBytes,
273
+ void *reserveSpace,
274
+ size_t reserveSpaceSizeInBytes);
275
+
276
+ /* Performs backward pass of Batch Normalization layer. Returns x gradient,
277
+ * bnScale gradient and bnBias gradient */
278
+ cudnnStatus_t CUDNNWINAPI
279
+ cudnnBatchNormalizationBackward(cudnnHandle_t handle,
280
+ cudnnBatchNormMode_t mode,
281
+ const void *alphaDataDiff,
282
+ const void *betaDataDiff,
283
+ const void *alphaParamDiff,
284
+ const void *betaParamDiff,
285
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
286
+ const void *x,
287
+ const cudnnTensorDescriptor_t dyDesc,
288
+ const void *dy,
289
+ const cudnnTensorDescriptor_t dxDesc,
290
+ void *dx,
291
+ /* Shared tensor desc for the 4 tensors below */
292
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
293
+ const void *bnScale, /* bnBias doesn't affect backpropagation */
294
+ /* scale and bias diff are not backpropagated below this layer */
295
+ void *dBnScaleResult,
296
+ void *dBnBiasResult,
297
+ /* Same epsilon as forward pass */
298
+ double epsilon,
299
+
300
+ /* Optionally cached intermediate results from
301
+ forward pass */
302
+ const void *savedMean,
303
+ const void *savedInvVariance);
304
+
305
+ cudnnStatus_t CUDNNWINAPI
306
+ cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
307
+ cudnnBatchNormMode_t mode,
308
+ cudnnBatchNormOps_t bnOps,
309
+
310
+ const void *alphaDataDiff,
311
+ const void *betaDataDiff,
312
+ const void *alphaParamDiff,
313
+ const void *betaParamDiff,
314
+ const cudnnTensorDescriptor_t xDesc,
315
+ const void *xData,
316
+ const cudnnTensorDescriptor_t yDesc,
317
+ const void *yData,
318
+ const cudnnTensorDescriptor_t dyDesc,
319
+ const void *dyData,
320
+ const cudnnTensorDescriptor_t dzDesc,
321
+ void *dzData,
322
+ const cudnnTensorDescriptor_t dxDesc,
323
+ void *dxData,
324
+
325
+ /* Shared tensor desc for the 4 tensors below */
326
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
327
+ const void *bnScaleData,
328
+ const void *bnBiasData, /* needed if there is activation */
329
+ void *dBnScaleData,
330
+ void *dBnBiasData,
331
+ double epsilon, /* Same epsilon as forward pass */
332
+
333
+ /* Optionally cached intermediate results from
334
+ forward pass */
335
+ const void *savedMean,
336
+ const void *savedInvVariance,
337
+ cudnnActivationDescriptor_t activationDesc,
338
+ void *workSpace,
339
+ size_t workSpaceSizeInBytes,
340
+ void *reserveSpace,
341
+ size_t reserveSpaceSizeInBytes);
342
+
343
+ cudnnStatus_t CUDNNWINAPI
344
+ cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
345
+ cudnnNormMode_t mode,
346
+ cudnnNormOps_t normOps,
347
+ cudnnNormAlgo_t algo,
348
+ const cudnnTensorDescriptor_t xDesc,
349
+ const cudnnTensorDescriptor_t zDesc,
350
+ const cudnnTensorDescriptor_t yDesc,
351
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
352
+ const cudnnActivationDescriptor_t activationDesc,
353
+ const cudnnTensorDescriptor_t normMeanVarDesc,
354
+ size_t *sizeInBytes,
355
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
356
+
357
+ cudnnStatus_t CUDNNWINAPI
358
+ cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
359
+ cudnnNormMode_t mode,
360
+ cudnnNormOps_t normOps,
361
+ cudnnNormAlgo_t algo,
362
+ const cudnnTensorDescriptor_t xDesc,
363
+ const cudnnTensorDescriptor_t yDesc,
364
+ const cudnnTensorDescriptor_t dyDesc,
365
+ const cudnnTensorDescriptor_t dzDesc,
366
+ const cudnnTensorDescriptor_t dxDesc,
367
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
368
+ const cudnnActivationDescriptor_t activationDesc,
369
+ const cudnnTensorDescriptor_t normMeanVarDesc,
370
+ size_t *sizeInBytes,
371
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
372
+
373
+ cudnnStatus_t CUDNNWINAPI
374
+ cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
375
+ cudnnNormMode_t mode,
376
+ cudnnNormOps_t normOps,
377
+ cudnnNormAlgo_t algo,
378
+ const cudnnActivationDescriptor_t activationDesc,
379
+ const cudnnTensorDescriptor_t xDesc,
380
+ size_t *sizeInBytes,
381
+ int groupCnt); /* Placeholder for future work; should be set to 1 for now */
382
+
383
+ /* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
384
+ cudnnStatus_t CUDNNWINAPI
385
+ cudnnNormalizationForwardTraining(cudnnHandle_t handle,
386
+ cudnnNormMode_t mode,
387
+ cudnnNormOps_t normOps,
388
+ cudnnNormAlgo_t algo,
389
+ const void *alpha, /* alpha[0] = result blend factor */
390
+ const void *beta, /* beta[0] = dest layer blend factor */
391
+ const cudnnTensorDescriptor_t xDesc,
392
+ const void *xData,
393
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
394
+ const void *normScale,
395
+ const void *normBias,
396
+ double exponentialAverageFactor,
397
+ const cudnnTensorDescriptor_t normMeanVarDesc,
398
+ void *resultRunningMean,
399
+ void *resultRunningVariance,
400
+ /* Has to be >= 0. Should be the same in forward and backward functions. */
401
+ double epsilon,
402
+ /* Optionally save intermediate results from the forward pass here
403
+ - can be reused to speed up backward pass. NULL if unused */
404
+ void *resultSaveMean,
405
+ void *resultSaveInvVariance,
406
+ cudnnActivationDescriptor_t activationDesc,
407
+ const cudnnTensorDescriptor_t zDesc,
408
+ const void *zData,
409
+ const cudnnTensorDescriptor_t yDesc,
410
+ void *yData,
411
+ void *workspace,
412
+ size_t workSpaceSizeInBytes,
413
+ void *reserveSpace,
414
+ size_t reserveSpaceSizeInBytes,
415
+ int groupCnt); /* Placeholder for future work; should be set to 1 for now */
416
+
417
+ cudnnStatus_t CUDNNWINAPI
418
+ cudnnNormalizationBackward(cudnnHandle_t handle,
419
+ cudnnNormMode_t mode,
420
+ cudnnNormOps_t normOps,
421
+ cudnnNormAlgo_t algo,
422
+ const void *alphaDataDiff,
423
+ const void *betaDataDiff,
424
+ const void *alphaParamDiff,
425
+ const void *betaParamDiff,
426
+ const cudnnTensorDescriptor_t xDesc,
427
+ const void *xData,
428
+ const cudnnTensorDescriptor_t yDesc,
429
+ const void *yData,
430
+ const cudnnTensorDescriptor_t dyDesc,
431
+ const void *dyData,
432
+ const cudnnTensorDescriptor_t dzDesc,
433
+ void *dzData,
434
+ const cudnnTensorDescriptor_t dxDesc,
435
+ void *dxData,
436
+ /* Shared tensor desc for the 4 tensors below */
437
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
438
+ const void *normScaleData,
439
+ const void *normBiasData, /* needed if there is activation */
440
+ void *dNormScaleData,
441
+ void *dNormBiasData,
442
+ double epsilon, /* Same epsilon as forward pass */
443
+ const cudnnTensorDescriptor_t normMeanVarDesc,
444
+ /* Optionally cached intermediate results from
445
+ forward pass */
446
+ const void *savedMean,
447
+ const void *savedInvVariance,
448
+ cudnnActivationDescriptor_t activationDesc,
449
+ void *workSpace,
450
+ size_t workSpaceSizeInBytes,
451
+ void *reserveSpace,
452
+ size_t reserveSpaceSizeInBytes,
453
+ int groupCnt); /* Placeholder for future work; should be set to 1 for now */
454
+
455
+ cudnnStatus_t CUDNNWINAPI
456
+ cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
457
+ const cudnnSpatialTransformerDescriptor_t stDesc,
458
+ const void *dgrid,
459
+ void *dtheta);
460
+
461
+ cudnnStatus_t CUDNNWINAPI
462
+ cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
463
+ cudnnSpatialTransformerDescriptor_t stDesc,
464
+ const void *alpha,
465
+ const cudnnTensorDescriptor_t xDesc,
466
+ const void *x,
467
+ const void *beta,
468
+ const cudnnTensorDescriptor_t dxDesc,
469
+ void *dx,
470
+ const void *alphaDgrid,
471
+ const cudnnTensorDescriptor_t dyDesc,
472
+ const void *dy,
473
+ const void *grid,
474
+ const void *betaDgrid,
475
+ void *dgrid);
476
+
477
+ cudnnStatus_t CUDNNWINAPI
478
+ cudnnDropoutBackward(cudnnHandle_t handle,
479
+ const cudnnDropoutDescriptor_t dropoutDesc,
480
+ const cudnnTensorDescriptor_t dydesc,
481
+ const void *dy,
482
+ const cudnnTensorDescriptor_t dxdesc,
483
+ void *dx,
484
+ void *reserveSpace,
485
+ size_t reserveSpaceSizeInBytes);
486
+
487
+ /*
488
+ * \brief Cross-library version checker.
489
+ * This function is implemented differently in each sub-library. Each sublib
490
+ * checks whether its own version matches that of its dependencies.
491
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
492
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
493
+ */
494
+ cudnnStatus_t CUDNNWINAPI
495
+ cudnnOpsTrainVersionCheck(void);
496
+
497
+ #if defined(__cplusplus)
498
+ }
499
+ #endif
500
+
501
+ #endif /* CUDNN_OPS_TRAIN_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (217 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2005-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ /*!
50
+ * \file cufft.h
51
+ * \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
52
+ */
53
+
54
+ #ifndef _CUFFT_H_
55
+ #define _CUFFT_H_
56
+
57
+
58
+ #include "cuComplex.h"
59
+ #include "driver_types.h"
60
+ #include "library_types.h"
61
+
62
+ #ifndef CUFFTAPI
63
+ #ifdef _WIN32
64
+ #define CUFFTAPI __stdcall
65
+ #elif __GNUC__ >= 4
66
+ #define CUFFTAPI __attribute__ ((visibility ("default")))
67
+ #else
68
+ #define CUFFTAPI
69
+ #endif
70
+ #endif
71
+
72
+ #ifdef __cplusplus
73
+ extern "C" {
74
+ #endif
75
+
76
+ #define CUFFT_VER_MAJOR 10
77
+ #define CUFFT_VER_MINOR 9
78
+ #define CUFFT_VER_PATCH 0
79
+ #define CUFFT_VER_BUILD 58
80
+
81
+ // cuFFT library version
82
+ //
83
+ // CUFFT_VERSION / 1000 - major version
84
+ // CUFFT_VERSION / 100 % 100 - minor version
85
+ // CUFFT_VERSION % 100 - patch level
86
+ #define CUFFT_VERSION 10900
87
+
88
+ // CUFFT API function return values
89
+ typedef enum cufftResult_t {
90
+ CUFFT_SUCCESS = 0x0,
91
+ CUFFT_INVALID_PLAN = 0x1,
92
+ CUFFT_ALLOC_FAILED = 0x2,
93
+ CUFFT_INVALID_TYPE = 0x3,
94
+ CUFFT_INVALID_VALUE = 0x4,
95
+ CUFFT_INTERNAL_ERROR = 0x5,
96
+ CUFFT_EXEC_FAILED = 0x6,
97
+ CUFFT_SETUP_FAILED = 0x7,
98
+ CUFFT_INVALID_SIZE = 0x8,
99
+ CUFFT_UNALIGNED_DATA = 0x9,
100
+ CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
101
+ CUFFT_INVALID_DEVICE = 0xB,
102
+ CUFFT_PARSE_ERROR = 0xC,
103
+ CUFFT_NO_WORKSPACE = 0xD,
104
+ CUFFT_NOT_IMPLEMENTED = 0xE,
105
+ CUFFT_LICENSE_ERROR = 0x0F,
106
+ CUFFT_NOT_SUPPORTED = 0x10
107
+
108
+ } cufftResult;
109
+
110
+ #define MAX_CUFFT_ERROR 0x11
111
+
112
+
113
+ // CUFFT defines and supports the following data types
114
+
115
+
116
+ // cufftReal is a single-precision, floating-point real data type.
117
+ // cufftDoubleReal is a double-precision, real data type.
118
+ typedef float cufftReal;
119
+ typedef double cufftDoubleReal;
120
+
121
+ // cufftComplex is a single-precision, floating-point complex data type that
122
+ // consists of interleaved real and imaginary components.
123
+ // cufftDoubleComplex is the double-precision equivalent.
124
+ typedef cuComplex cufftComplex;
125
+ typedef cuDoubleComplex cufftDoubleComplex;
126
+
127
+ // CUFFT transform directions
128
+ #define CUFFT_FORWARD -1 // Forward FFT
129
+ #define CUFFT_INVERSE 1 // Inverse FFT
130
+
131
+ // CUFFT supports the following transform types
132
+ typedef enum cufftType_t {
133
+ CUFFT_R2C = 0x2a, // Real to Complex (interleaved)
134
+ CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
135
+ CUFFT_C2C = 0x29, // Complex to Complex, interleaved
136
+ CUFFT_D2Z = 0x6a, // Double to Double-Complex
137
+ CUFFT_Z2D = 0x6c, // Double-Complex to Double
138
+ CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
139
+ } cufftType;
140
+
141
+ // CUFFT supports the following data layouts
142
+ typedef enum cufftCompatibility_t {
143
+ CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 // The default value
144
+ } cufftCompatibility;
145
+
146
+ #define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING
147
+
148
+ //
149
+ // structure definition used by the shim between old and new APIs
150
+ //
151
+ #define MAX_SHIM_RANK 3
152
+
153
+ // cufftHandle is a handle type used to store and access CUFFT plans.
154
+ typedef int cufftHandle;
155
+
156
+
157
+ cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
158
+ int nx,
159
+ cufftType type,
160
+ int batch);
161
+
162
+ cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
163
+ int nx, int ny,
164
+ cufftType type);
165
+
166
+ cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
167
+ int nx, int ny, int nz,
168
+ cufftType type);
169
+
170
+ cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
171
+ int rank,
172
+ int *n,
173
+ int *inembed, int istride, int idist,
174
+ int *onembed, int ostride, int odist,
175
+ cufftType type,
176
+ int batch);
177
+
178
+ cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan,
179
+ int nx,
180
+ cufftType type,
181
+ int batch,
182
+ size_t *workSize);
183
+
184
+ cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan,
185
+ int nx, int ny,
186
+ cufftType type,
187
+ size_t *workSize);
188
+
189
+ cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan,
190
+ int nx, int ny, int nz,
191
+ cufftType type,
192
+ size_t *workSize);
193
+
194
+ cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
195
+ int rank,
196
+ int *n,
197
+ int *inembed, int istride, int idist,
198
+ int *onembed, int ostride, int odist,
199
+ cufftType type,
200
+ int batch,
201
+ size_t *workSize);
202
+
203
+ cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan,
204
+ int rank,
205
+ long long int *n,
206
+ long long int *inembed,
207
+ long long int istride,
208
+ long long int idist,
209
+ long long int *onembed,
210
+ long long int ostride, long long int odist,
211
+ cufftType type,
212
+ long long int batch,
213
+ size_t * workSize);
214
+
215
+ cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
216
+ int rank,
217
+ long long int *n,
218
+ long long int *inembed,
219
+ long long int istride, long long int idist,
220
+ long long int *onembed,
221
+ long long int ostride, long long int odist,
222
+ cufftType type,
223
+ long long int batch,
224
+ size_t *workSize);
225
+
226
+
227
+
228
+
229
+ cufftResult CUFFTAPI cufftEstimate1d(int nx,
230
+ cufftType type,
231
+ int batch,
232
+ size_t *workSize);
233
+
234
+ cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
235
+ cufftType type,
236
+ size_t *workSize);
237
+
238
+ cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz,
239
+ cufftType type,
240
+ size_t *workSize);
241
+
242
+ cufftResult CUFFTAPI cufftEstimateMany(int rank,
243
+ int *n,
244
+ int *inembed, int istride, int idist,
245
+ int *onembed, int ostride, int odist,
246
+ cufftType type,
247
+ int batch,
248
+ size_t *workSize);
249
+
250
+ cufftResult CUFFTAPI cufftCreate(cufftHandle * handle);
251
+
252
+ cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle,
253
+ int nx,
254
+ cufftType type,
255
+ int batch,
256
+ size_t *workSize );
257
+
258
+ cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle,
259
+ int nx, int ny,
260
+ cufftType type,
261
+ size_t *workSize);
262
+
263
+ cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
264
+ int nx, int ny, int nz,
265
+ cufftType type,
266
+ size_t *workSize);
267
+
268
+ cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle,
269
+ int rank, int *n,
270
+ int *inembed, int istride, int idist,
271
+ int *onembed, int ostride, int odist,
272
+ cufftType type, int batch, size_t *workArea);
273
+
274
+ cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize);
275
+
276
+ cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea);
277
+
278
+ cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate);
279
+
280
+ cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
281
+ cufftComplex *idata,
282
+ cufftComplex *odata,
283
+ int direction);
284
+
285
+ cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
286
+ cufftReal *idata,
287
+ cufftComplex *odata);
288
+
289
+ cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
290
+ cufftComplex *idata,
291
+ cufftReal *odata);
292
+
293
+ cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
294
+ cufftDoubleComplex *idata,
295
+ cufftDoubleComplex *odata,
296
+ int direction);
297
+
298
+ cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
299
+ cufftDoubleReal *idata,
300
+ cufftDoubleComplex *odata);
301
+
302
+ cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
303
+ cufftDoubleComplex *idata,
304
+ cufftDoubleReal *odata);
305
+
306
+
307
+ // utility functions
308
+ cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
309
+ cudaStream_t stream);
310
+
311
+ cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
312
+
313
+ cufftResult CUFFTAPI cufftGetVersion(int *version);
314
+
315
+ cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
316
+ int *value);
317
+
318
+ #ifdef __cplusplus
319
+ }
320
+ #endif
321
+
322
+ #endif /* _CUFFT_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (214 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (222 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #if !defined(CURANDDISCRETE_H_)
50
+ #define CURANDDISCRETE_H_
51
+
52
+ struct curandDistributionShift_st {
53
+ curandDistribution_t probability;
54
+ curandDistribution_t host_probability;
55
+ unsigned int shift;
56
+ unsigned int length;
57
+ unsigned int host_gen;
58
+ };
59
+
60
+ struct curandHistogramM2_st {
61
+ curandHistogramM2V_t V;
62
+ curandHistogramM2V_t host_V;
63
+ curandHistogramM2K_t K;
64
+ curandHistogramM2K_t host_K;
65
+ unsigned int host_gen;
66
+ };
67
+
68
+
69
+ struct curandDistributionM2Shift_st {
70
+ curandHistogramM2_t histogram;
71
+ curandHistogramM2_t host_histogram;
72
+ unsigned int shift;
73
+ unsigned int length;
74
+ unsigned int host_gen;
75
+ };
76
+
77
+ struct curandDiscreteDistribution_st {
78
+ curandDiscreteDistribution_t self_host_ptr;
79
+ curandDistributionM2Shift_t M2;
80
+ curandDistributionM2Shift_t host_M2;
81
+ double stddev;
82
+ double mean;
83
+ curandMethod_t method;
84
+ unsigned int host_gen;
85
+ };
86
+
87
+ #endif // !defined(CURANDDISCRETE_H_)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_DISCRETE_H_)
52
+ #define CURAND_DISCRETE_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+
69
+ template <typename T>
70
+ QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){
71
+ if (discrete_distribution->method == CURAND_M2){
72
+ return _curand_M2_double(x, discrete_distribution->M2);
73
+ }
74
+ return (unsigned int)((discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5);
75
+ }
76
+
77
+
78
+ template <typename STATE>
79
+ QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){
80
+ if (discrete_distribution->method == CURAND_M2){
81
+ return curand_M2_double(state, discrete_distribution->M2);
82
+ }
83
+ return (unsigned int)((discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5); //Round to nearest
84
+ }
85
+
86
+ template <typename STATE>
87
+ QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){
88
+ if (discrete_distribution->method == CURAND_M2){
89
+ return curand_M2_double4(state, discrete_distribution->M2);
90
+ }
91
+ double4 _res;
92
+ uint4 result;
93
+ _res = curand_normal4_double(state);
94
+ result.x = (unsigned int)((discrete_distribution->stddev * _res.x) + discrete_distribution->mean + 0.5); //Round to nearest
95
+ result.y = (unsigned int)((discrete_distribution->stddev * _res.y) + discrete_distribution->mean + 0.5); //Round to nearest
96
+ result.z = (unsigned int)((discrete_distribution->stddev * _res.z) + discrete_distribution->mean + 0.5); //Round to nearest
97
+ result.w = (unsigned int)((discrete_distribution->stddev * _res.w) + discrete_distribution->mean + 0.5); //Round to nearest
98
+ return result;
99
+ }
100
+
101
+ /*
102
+ * \brief Return a discrete distributed unsigned int from a XORWOW generator.
103
+ *
104
+ * Return a single discrete distributed unsigned int derived from a
105
+ * distribution defined by \p discrete_distribution from the XORWOW generator in \p state,
106
+ * increment position of generator by one.
107
+ *
108
+ * \param state - Pointer to state to update
109
+ * \param discrete_distribution - ancillary structure for discrete distribution
110
+ *
111
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
112
+ */
113
+ QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution)
114
+ {
115
+ return curand__discrete(state, discrete_distribution);
116
+ }
117
+
118
+ /*
119
+ * \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator.
120
+ *
121
+ * Return a single discrete distributed unsigned int derived from a
122
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
123
+ * increment position of generator by one.
124
+ *
125
+ * \param state - Pointer to state to update
126
+ * \param discrete_distribution - ancillary structure for discrete distribution
127
+ *
128
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
129
+ */
130
+ QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
131
+ {
132
+ return curand__discrete(state, discrete_distribution);
133
+ }
134
+
135
+ /*
136
+ * \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator.
137
+ *
138
+ * Return four discrete distributed unsigned ints derived from a
139
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
140
+ * increment position of generator by one.
141
+ *
142
+ * \param state - Pointer to state to update
143
+ * \param discrete_distribution - ancillary structure for discrete distribution
144
+ *
145
+ * \return uint4 of unsigned ints distributed by distribution defined by \p discrete_distribution.
146
+ */
147
+ QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
148
+ {
149
+ return curand__discrete4(state, discrete_distribution);
150
+ }
151
+ /*
152
+ * \brief Return a discrete distributed unsigned int from a MRG32k3a generator.
153
+ *
154
+ * Return a single discrete distributed unsigned int derived from a
155
+ * distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state,
156
+ * increment position of generator by one.
157
+ *
158
+ * \param state - Pointer to state to update
159
+ * \param discrete_distribution - ancillary structure for discrete distribution
160
+ *
161
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
162
+ */
163
+ QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution)
164
+ {
165
+ return curand__discrete(state, discrete_distribution);
166
+ }
167
+
168
+ /*
169
+ * \brief Return a discrete distributed unsigned int from a MTGP32 generator.
170
+ *
171
+ * Return a single discrete distributed unsigned int derived from a
172
+ * distribution defined by \p discrete_distribution from the MTGP32 generator in \p state,
173
+ * increment position of generator by one.
174
+ *
175
+ * \param state - Pointer to state to update
176
+ * \param discrete_distribution - ancillary structure for discrete distribution
177
+ *
178
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
179
+ */
180
+ QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution)
181
+ {
182
+ return curand__discrete(state, discrete_distribution);
183
+ }
184
+
185
+ /*
186
+ * \brief Return a discrete distributed unsigned int from a Sobol32 generator.
187
+ *
188
+ * Return a single discrete distributed unsigned int derived from a
189
+ * distribution defined by \p discrete_distribution from the Sobol32 generator in \p state,
190
+ * increment position of generator by one.
191
+ *
192
+ * \param state - Pointer to state to update
193
+ * \param discrete_distribution - ancillary structure for discrete distribution
194
+ *
195
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
196
+ */
197
+ QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
198
+ {
199
+ return curand__discrete(state, discrete_distribution);
200
+ }
201
+
202
+ /*
203
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator.
204
+ *
205
+ * Return a single discrete distributed unsigned int derived from a
206
+ * distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state,
207
+ * increment position of generator by one.
208
+ *
209
+ * \param state - Pointer to state to update
210
+ * \param discrete_distribution - ancillary structure for discrete distribution
211
+ *
212
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
213
+ */
214
+ QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
215
+ {
216
+ return curand__discrete(state, discrete_distribution);
217
+ }
218
+
219
+ /*
220
+ * \brief Return a discrete distributed unsigned int from a Sobol64 generator.
221
+ *
222
+ * Return a single discrete distributed unsigned int derived from a
223
+ * distribution defined by \p discrete_distribution from the Sobol64 generator in \p state,
224
+ * increment position of generator by one.
225
+ *
226
+ * \param state - Pointer to state to update
227
+ * \param discrete_distribution - ancillary structure for discrete distribution
228
+ *
229
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
230
+ */
231
+ QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
232
+ {
233
+ return curand__discrete(state, discrete_distribution);
234
+ }
235
+
236
+ /*
237
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator.
238
+ *
239
+ * Return a single discrete distributed unsigned int derived from a
240
+ * distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state,
241
+ * increment position of generator by one.
242
+ *
243
+ * \param state - Pointer to state to update
244
+ * \param discrete_distribution - ancillary structure for discrete distribution
245
+ *
246
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
247
+ */
248
+ QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
249
+ {
250
+ return curand__discrete(state, discrete_distribution);
251
+ }
252
+
253
+ #endif // !defined(CURAND_DISCRETE_H_)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CURAND_MTGP32_H
51
+ #define CURAND_MTGP32_H
52
+ /*
53
+ * @file curand_mtgp32.h
54
+ *
55
+ * @brief Mersenne Twister for Graphic Processors (mtgp32), which
56
+ * generates 32-bit unsigned integers and single precision floating
57
+ * point numbers based on IEEE 754 format.
58
+ *
59
+ * @author Mutsuo Saito (Hiroshima University)
60
+ * @author Makoto Matsumoto (Hiroshima University)
61
+ *
62
+ */
63
+ /*
64
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
65
+ * University. All rights reserved.
66
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
67
+ * University and University of Tokyo. All rights reserved.
68
+ *
69
+ * Redistribution and use in source and binary forms, with or without
70
+ * modification, are permitted provided that the following conditions are
71
+ * met:
72
+ *
73
+ * * Redistributions of source code must retain the above copyright
74
+ * notice, this list of conditions and the following disclaimer.
75
+ * * Redistributions in binary form must reproduce the above
76
+ * copyright notice, this list of conditions and the following
77
+ * disclaimer in the documentation and/or other materials provided
78
+ * with the distribution.
79
+ * * Neither the name of the Hiroshima University nor the names of
80
+ * its contributors may be used to endorse or promote products
81
+ * derived from this software without specific prior written
82
+ * permission.
83
+ *
84
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
85
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
86
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
87
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
88
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
89
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
90
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
91
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
92
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
93
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
94
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
95
+ */
96
+
97
+
98
+ #define MTGPDC_MEXP 11213 /* Mersenne exponent; this header documents the period as 2^11213 */
99
+ #define MTGPDC_N 351 /* NOTE(review): presumably the number of 32-bit words in the recursion state - confirm against the MTGP reference */
100
+ #define MTGPDC_FLOOR_2P 256
101
+ #define MTGPDC_CEIL_2P 512
102
+ #define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213
103
+ #define MTGP32_STATE_SIZE 1024 /* number of words in curandStateMtgp32::s */
104
+ #define MTGP32_STATE_MASK 1023 /* MTGP32_STATE_SIZE - 1 */
105
+ #define CURAND_NUM_MTGP32_PARAMS 200 /* first dimension of every table in mtgp32_kernel_params */
106
+ #define MEXP 11213 /* same value as MTGPDC_MEXP */
107
+ #define THREAD_NUM MTGPDC_FLOOR_2P
108
+ #define LARGE_SIZE (THREAD_NUM * 3)
109
+ #define TBL_SIZE 16 /* second dimension of the 2-D tables in mtgp32_kernel_params */
110
+
111
+ /**
112
+ * \addtogroup DEVICE Device API
113
+ *
114
+ * @{
115
+ */
116
+
117
+ /*
118
+ * \struct MTGP32_PARAMS_FAST_T
119
+ * MTGP32 parameters.
120
+ * Some elements are redundant to keep the structure simple.
121
+ *
122
+ * \b pos is a pick up position which is selected to have good
123
+ * performance on graphic processors. 3 < \b pos < Q, where Q is a
124
+ * maximum number such that the size of status array - Q is a power of
125
+ * 2. For example, when \b mexp is 44497, size of 32-bit status array
126
+ * is 696, and Q is 184, then \b pos is between 4 and 183. This means
127
+ * 512 parallel calculations are allowed when \b mexp is 44497.
128
+ *
129
+ * \b poly_sha1 is SHA1 digest of the characteristic polynomial of
130
+ * state transition function. SHA1 is calculated based on printing
131
+ * form of the polynomial. This is important when we use parameters
132
+ * generated by the dynamic creator.
133
+ *
134
+ * \b mask This is a mask to make the dimension of state space have
135
+ * just Mersenne Prime. This is redundant.
136
+ */
137
+
138
+ struct mtgp32_params_fast;
139
+
140
+ struct mtgp32_params_fast {
141
+ int mexp; /*< Mersenne exponent. Redundant; kept to keep the structure simple. */
142
+ int pos; /*< pick-up position; see the constraint 3 < pos < Q described above. */
143
+ int sh1; /*< shift value 1. 0 < sh1 < 32. */
144
+ int sh2; /*< shift value 2. 0 < sh2 < 32. */
145
+ unsigned int tbl[16]; /*< a small matrix (16 entries). */
146
+ unsigned int tmp_tbl[16]; /*< a small matrix for tempering (16 entries). */
147
+ unsigned int flt_tmp_tbl[16]; /*< a small matrix for tempering and
148
+ converting to float (16 entries). */
149
+ unsigned int mask; /*< mask for the state space (see \b mask above); redundant. */
150
+ unsigned char poly_sha1[21]; /*< SHA1 digest of the characteristic polynomial (see \b poly_sha1 above). */
151
+ };
152
+
153
+ /** \cond UNHIDE_TYPEDEFS */
154
+ typedef struct mtgp32_params_fast mtgp32_params_fast_t;
155
+ /** \endcond */
156
+
157
+ /*
158
+ * Generator Parameters.
159
+ */
160
+ struct mtgp32_kernel_params;
161
+ struct mtgp32_kernel_params {
162
+ unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS]; /* pick-up position per parameter set; the device functions read pos_tbl[pIdx] */
163
+ unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE]; /* recursion table; para_rec() selects param_tbl[bid][Y & 0x0f] */
164
+ unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE]; /* tempering table read by temper() */
165
+ unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE]; /* tempering table read by temper_single() */
166
+ unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS]; /* left-shift amount used by para_rec() */
167
+ unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS]; /* right-shift amount used by para_rec() */
168
+ unsigned int mask[1]; /* para_rec() ANDs its X1 argument with mask[0] */
169
+ };
170
+
171
+ /** \cond UNHIDE_TYPEDEFS */
172
+ typedef struct mtgp32_kernel_params mtgp32_kernel_params_t;
173
+ /** \endcond */
174
+
175
+
176
+
177
+ /*
178
+ * kernel I/O
179
+ * This structure must be initialized before first use.
180
+ */
181
+
182
+ /* MTGP (Mersenne Twister) RNG */
183
+ /* This generator uses the Mersenne Twister algorithm of
184
+ * http://arxiv.org/abs/1005.4973v2
185
+ * Has period 2^11213.
186
+ */
187
+
188
+ /**
189
+ * CURAND MTGP32 state
190
+ */
191
+ struct curandStateMtgp32;
192
+
193
+ struct curandStateMtgp32 {
194
+ unsigned int s[MTGP32_STATE_SIZE]; /* state words; the device functions index it modulo the size via MTGP32_STATE_MASK */
195
+ int offset; /* current base index into s; advanced by the device functions after each draw */
196
+ int pIdx; /* index of the parameter set used inside the tables of *k */
197
+ mtgp32_kernel_params_t * k; /* parameter tables passed to para_rec(), temper() and temper_single() */
198
+ };
199
+
200
+ /*
201
+ * CURAND MTGP32 state
202
+ */
203
+ /** \cond UNHIDE_TYPEDEFS */
204
+ typedef struct curandStateMtgp32 curandStateMtgp32_t;
205
+ /** \endcond */
206
+
207
+ /** @} */
208
+
209
+ #endif
210
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * curand_mtgp32_kernel.h
52
+ *
53
+ *
54
+ * MTGP32-11213
55
+ *
56
+ * Mersenne Twister RNG for the GPU
57
+ *
58
+ * The period of generated integers is 2<sup>11213</sup>-1.
59
+ *
60
+ * This code generates 32-bit unsigned integers, and
61
+ * single precision floating point numbers uniformly distributed
62
+ * in the range [1, 2). (float r; 1.0 <= r < 2.0)
63
+ */
64
+
65
+ /*
66
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
67
+ * University. All rights reserved.
68
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
69
+ * University and University of Tokyo. All rights reserved.
70
+ *
71
+ * Redistribution and use in source and binary forms, with or without
72
+ * modification, are permitted provided that the following conditions are
73
+ * met:
74
+ *
75
+ * * Redistributions of source code must retain the above copyright
76
+ * notice, this list of conditions and the following disclaimer.
77
+ * * Redistributions in binary form must reproduce the above
78
+ * copyright notice, this list of conditions and the following
79
+ * disclaimer in the documentation and/or other materials provided
80
+ * with the distribution.
81
+ * * Neither the name of the Hiroshima University nor the names of
82
+ * its contributors may be used to endorse or promote products
83
+ * derived from this software without specific prior written
84
+ * permission.
85
+ *
86
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+ */
98
+ #if !defined CURAND_MTGP32_KERNEL_H
99
+ #define CURAND_MTGP32_KERNEL_H
100
+
101
+ #if !defined(QUALIFIERS)
102
+ #define QUALIFIERS static __forceinline__ __device__
103
+ #endif
104
+
105
+ #ifndef __CUDACC_RTC__
106
+ #include <cuda.h>
107
+ #include <stdlib.h>
108
+ #include <memory.h>
109
+ #include <string.h>
110
+ #endif // ifndef __CUDACC_RTC__
111
+ #include "curand.h"
112
+ #include "curand_mtgp32.h"
113
+
114
+ /**
115
+ * \addtogroup DEVICE Device API
116
+ *
117
+ * @{
118
+ */
119
+
120
+ #ifndef __CUDA_ARCH__
121
+ // define blockDim and threadIdx for host compatibility call
122
+ extern const dim3 blockDim;
123
+ extern const uint3 threadIdx;
124
+ #endif
125
+
126
+
127
+ /*
128
+ * The function of the recursion formula calculation.
129
+ * @param[in] k parameter tables; mask[0], sh1_tbl, sh2_tbl and param_tbl are read.
130
+ * @param[in] X1 the farthest part of state array.
131
+ * @param[in] X2 the second farthest part of state array.
132
+ * @param[in] Y a part of state array.
133
+ * @param[in] bid block id; used as the row index into the tables of \p k.
134
+ * @return Y after the shift/XOR steps, XORed with the selected param_tbl entry.
135
+ */
136
+ QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
137
+ unsigned int X = (X1 & k->mask[0]) ^ X2; /* mask X1, then XOR with X2 */
138
+ unsigned int MAT;
139
+
140
+ X ^= X << k->sh1_tbl[bid]; /* fold a left-shifted copy of X into itself */
141
+ Y = X ^ (Y >> k->sh2_tbl[bid]); /* combine with the right-shifted Y */
142
+ MAT = k->param_tbl[bid][Y & 0x0f]; /* low 4 bits of Y pick one of the TBL_SIZE entries */
143
+ return Y ^ MAT;
144
+ }
145
+
146
+ /*
147
+ * The tempering function.
148
+ * @param[in] k parameter tables; only temper_tbl is read.
149
+ * @param[in] V the output value should be tempered.
150
+ * @param[in] T the tempering helper value.
151
+ * @param[in] bid block id; used as the row index into temper_tbl.
152
+ * @return the tempered value (V XORed with the selected temper_tbl entry).
153
+ */
154
+ QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
155
+ unsigned int MAT;
156
+
157
+ T ^= T >> 16; /* fold the upper half of T onto the lower half */
158
+ T ^= T >> 8; /* fold again: the low byte now XORs all four bytes of the original T */
159
+ MAT = k->temper_tbl[bid][T & 0x0f]; /* low 4 bits select the table entry */
160
+ return V ^ MAT;
161
+ }
162
+
163
+ /*
164
+ * The tempering and converting function.
165
+ * By using the preset table, converting to IEEE format
166
+ * and tempering are done simultaneously.
167
+ * @param[in] k parameter tables; only single_temper_tbl is read.
168
+ * @param[in] V the output value should be tempered.
169
+ * @param[in] T the tempering helper value.
170
+ * @param[in] bid block id; used as the row index into single_temper_tbl.
171
+ * @return the tempered and converted value, as the raw 32-bit pattern (not yet a float).
172
+ */
173
+ QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
174
+ unsigned int MAT;
175
+ unsigned int r;
176
+
177
+ T ^= T >> 16; /* fold the upper half of T onto the lower half */
178
+ T ^= T >> 8; /* fold again: the low byte now XORs all four bytes of the original T */
179
+ MAT = k->single_temper_tbl[bid][T & 0x0f]; /* low 4 bits select the table entry */
180
+ r = (V >> 9) ^ MAT; /* V >> 9 keeps 23 bits; NOTE(review): presumably the table entry supplies the sign/exponent bits - confirm */
181
+ return r;
182
+ }
183
+
184
+ /**
185
+ * \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
186
+ *
187
+ * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
188
+ * increment position of generator by the number of threads in the block.
189
+ * Note the number of threads in the block can not exceed 256.
190
+ * The device build contains __syncthreads(), so every thread of the block must reach this call.
191
+ * \param state - Pointer to state to update
192
+ *
193
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
194
+ */
195
+ QUALIFIERS unsigned int curand(curandStateMtgp32_t *state)
196
+ {
197
+ unsigned int t; /* linear thread id within the block */
198
+ unsigned int d; /* number of threads in the block */
199
+ int pos = state->k->pos_tbl[state->pIdx];
200
+ unsigned int r; /* newly computed state word */
201
+ unsigned int o; /* tempered output */
202
+
203
+ d = blockDim.z * blockDim.y * blockDim.x;
204
+ //assert( d <= 256 );
205
+ t = (blockDim.x * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x; /* x + y*Dx + z*Dx*Dy: the z stride is Dx*Dy, not Dz*Dy */
206
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
207
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
208
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
209
+ state->pIdx);
210
+
211
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
212
+ o = temper(state->k, r,
213
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
214
+ state->pIdx);
215
+ #if __CUDA_ARCH__ != 0
216
+ __syncthreads(); /* barrier between the state accesses above and the offset update below */
217
+ #endif
218
+ if (t == 0)
219
+ {
220
+ state->offset = (state->offset + d) & MTGP32_STATE_MASK; /* a single thread advances the shared offset by the block size */
221
+ }
222
+ #if __CUDA_ARCH__ != 0
223
+ __syncthreads(); /* barrier so no thread returns before the offset has been updated */
224
+ #endif
225
+ return o;
226
+
227
+ }
228
+ /**
229
+ * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
230
+ *
231
+ * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
232
+ * increment position of generator by \p n positions, which must be the total number of positions
233
+ * updated in the state by the thread block, for this invocation.
234
+ *
235
+ * Note :
236
+ * Thread indices must range from 0...\p n - 1.
237
+ * The number of positions updated may not exceed 256 (\p n is an unsigned char, so at most 255 can be passed).
238
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
239
+ *
240
+ * \param state - Pointer to state to update
241
+ * \param index - Index (0..255) of the position within the state to draw from and update
242
+ * \param n - The total number of positions in this state that are being updated by this invocation
243
+ *
244
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
245
+ */
246
+ QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
247
+ {
248
+ unsigned int t; /* position handled by this call (copy of index) */
249
+ int pos = state->k->pos_tbl[state->pIdx];
250
+ unsigned int r; /* newly computed state word */
251
+ unsigned int o; /* tempered output */
252
+
253
+ t = index;
254
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
255
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
256
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
257
+ state->pIdx);
258
+
259
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
260
+ o = temper(state->k, r,
261
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
262
+ state->pIdx);
263
+ #if __CUDA_ARCH__ != 0
264
+ __syncthreads(); /* barrier between the state accesses above and the offset update below */
265
+ #endif
266
+ if (index == 0)
267
+ {
268
+ state->offset = (state->offset + n) & MTGP32_STATE_MASK; /* only the caller with index 0 advances the offset, by n */
269
+ }
270
+ #if __CUDA_ARCH__ != 0
271
+ __syncthreads(); /* barrier so no thread returns before the offset has been updated */
272
+ #endif
273
+ return o;
274
+ }
275
+ /**
276
+ * \brief Return a uniformly distributed float from a mtgp32 generator.
277
+ *
278
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
279
+ * from the mtgp32 generator in \p state, increment position of generator.
280
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
281
+ * point outputs are never returned.
282
+ * The position advances by the number of threads in the block; the device build calls __syncthreads().
283
+ * Note: This alternate derivation of a uniform float is provided for completeness
284
+ * with the original source
285
+ *
286
+ * \param state - Pointer to state to update
287
+ *
288
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
289
+ */
290
+ QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state)
291
+ {
292
+ unsigned int t; /* linear thread id within the block */
293
+ unsigned int d; /* number of threads in the block */
294
+ int pos = state->k->pos_tbl[state->pIdx];
295
+ unsigned int r; /* newly computed state word */
296
+ unsigned int o_u; /* tempered output as a raw bit pattern */
297
+ float o_f; /* the same bits reinterpreted as a float */
298
+
299
+
300
+ d = blockDim.z * blockDim.y * blockDim.x;
301
+ /* linear thread id x + y*Dx + z*Dx*Dy, the same indexing the integer curand() overload needs */
302
+ //assert( d <= 256 );
303
+ t = (blockDim.x * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
304
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
305
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
306
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
307
+ state->pIdx);
308
+
309
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r; /* store into the ring slot, as curand() does, so later reads see the new word */
310
+ o_u = temper_single(state->k, r,
311
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
312
+ state->pIdx);
313
+ #if __CUDA_ARCH__ != 0
314
+ __syncthreads(); /* barrier between the state accesses above and the offset update below */
315
+ #endif
316
+ if (t == 0) /* exactly one thread of the block, whatever the block shape */
317
+ {
318
+ state->offset = (state->offset + d) & MTGP32_STATE_MASK;
319
+ }
320
+ #if __CUDA_ARCH__ != 0
321
+ __syncthreads(); /* barrier so no thread returns before the offset has been updated */
322
+ #endif
323
+ memcpy(&o_f, &o_u, sizeof(o_u)); /* bit-copy instead of a pointer cast to reinterpret the pattern */
324
+ return o_f;
325
+ }
326
+
327
+ /**
328
+ * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
329
+ *
330
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
331
+ * from position \p index of the mtgp32 generator in \p state, and
332
+ * increment position of generator by \p n positions, which must be the total number of positions
333
+ * updated in the state by the thread block, for this invocation.
334
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
335
+ * point outputs are never returned.
336
+ *
337
+ * Note 1:
338
+ * Thread indices must range from 0...\p n - 1.
339
+ * The number of positions updated may not exceed 256 (\p n is an unsigned char, so at most 255 can be passed).
340
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
341
+ *
342
+ * Note 2: This alternate derivation of a uniform float is provided for completeness
343
+ * with the original source
344
+ *
345
+ * \param state - Pointer to state to update
346
+ * \param index - Index (0..255) of the position within the state to draw from and update
347
+ * \param n - The total number of positions in this state that are being updated by this invocation
348
+ *
349
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
350
+ */
351
+ QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
352
+ {
353
+ unsigned int t; /* position handled by this call (copy of index) */
354
+ int pos = state->k->pos_tbl[state->pIdx];
355
+ unsigned int r; /* newly computed state word */
356
+ unsigned int o_u; /* tempered output as a raw bit pattern */
357
+ float o_f; /* the same bits reinterpreted as a float */
358
+
359
+ t = index;
360
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
361
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
362
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
363
+ state->pIdx);
364
+
365
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r; /* store into the ring slot, as curand_mtgp32_specific() does */
366
+ o_u = temper_single(state->k, r,
367
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
368
+ state->pIdx);
369
+ #if __CUDA_ARCH__ != 0
370
+ __syncthreads(); /* barrier between the state accesses above and the offset update below */
371
+ #endif
372
+ if (index == 0) /* keyed on index, not threadIdx.x, so each state a block updates gets advanced */
373
+ {
374
+ state->offset = (state->offset + n) & MTGP32_STATE_MASK;
375
+ }
376
+ #if __CUDA_ARCH__ != 0
377
+ __syncthreads(); /* barrier so no thread returns before the offset has been updated */
378
+ #endif
379
+ memcpy(&o_f, &o_u, sizeof(o_u)); /* bit-copy instead of a pointer cast to reinterpret the pattern */
380
+ return o_f;
381
+ }
382
+
383
+ /** @} */
384
+
385
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h ADDED
@@ -0,0 +1,837 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_NORMAL_H_)
52
+ #define CURAND_NORMAL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+ #include "curand_normal_static.h"
68
+
69
+ QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y)
70
+ { /* Box-Muller step: turns two 32-bit integers into the pair (s*sin v, s*cos v) with s = sqrt(-2 ln u). */
71
+ float2 result;
72
+ float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2); /* x scaled plus a half-step bias; NOTE(review): presumably keeps u > 0 so logf stays finite - constant is defined elsewhere */
73
+ float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI/2); /* y scaled to an angle, with the same half-step bias */
74
+ #if __CUDA_ARCH__ > 0
75
+ float s = sqrtf(-2.0f * logf(u));
76
+ __sincosf(v, &result.x, &result.y); /* device build: one call yields sine in x and cosine in y */
77
+ #else
78
+ float s = sqrtf(-2.0f * logf(u));
79
+ result.x = sinf(v); /* host build: separate library calls */
80
+ result.y = cosf(v);
81
+ #endif
82
+ result.x *= s;
83
+ result.y *= s;
84
+ return result;
85
+ }
86
+
87
+ QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state)
88
+ { /* Box-Muller step fed by two curand_uniform() draws from an MRG32k3a state; returns (s*sin y, s*cos y). */
89
+ float x, y;
90
+ x = curand_uniform(state); /* first draw: radius input */
91
+ y = curand_uniform(state) * CURAND_2PI; /* second draw scaled by CURAND_2PI: angle input */
92
+ float2 result;
93
+ #if __CUDA_ARCH__ > 0
94
+ float s = sqrtf(-2.0f * logf(x));
95
+ __sincosf(y, &result.x, &result.y); /* device build: one call yields sine in x and cosine in y */
96
+ #else
97
+ float s = sqrtf(-2.0f * logf(x));
98
+ result.x = sinf(y); /* host build: separate library calls */
99
+ result.y = cosf(y);
100
+ #endif
101
+ result.x *= s;
102
+ result.y *= s;
103
+ return result;
104
+ }
105
+
106
+ QUALIFIERS double2
107
+ _curand_box_muller_double(unsigned int x0, unsigned int x1,
108
+ unsigned int y0, unsigned int y1)
109
+ { /* Double-precision Box-Muller step: (x0,x1) form the radius input, (y0,y1) the angle input. */
110
+ double2 result;
111
+ unsigned long long zx = (unsigned long long)x0 ^
112
+ ((unsigned long long)x1 << (53 - 32)); /* x1 shifted up 21 bits and XORed with x0: a 53-bit integer */
113
+ double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0); /* scaled plus a half-step bias */
114
+ unsigned long long zy = (unsigned long long)y0 ^
115
+ ((unsigned long long)y1 << (53 - 32)); /* same 53-bit combination for the angle input */
116
+ double v = zy * (CURAND_2POW53_INV_DOUBLE*2.0) + CURAND_2POW53_INV_DOUBLE; /* scale is doubled because v is later multiplied by pi */
117
+ double s = sqrt(-2.0 * log(u));
118
+
119
+ #if __CUDA_ARCH__ > 0
120
+ sincospi(v, &result.x, &result.y); /* device build: sine and cosine of v*pi in one call */
121
+ #else
122
+ result.x = sin(v*CURAND_PI_DOUBLE); /* host build: multiply by pi explicitly */
123
+ result.y = cos(v*CURAND_PI_DOUBLE);
124
+ #endif
125
+ result.x *= s;
126
+ result.y *= s;
127
+
128
+ return result;
129
+ }
130
+
131
+ QUALIFIERS double2
132
+ curand_box_muller_mrg_double(curandStateMRG32k3a_t * state)
133
+ { /* Double-precision Box-Muller step fed by two curand_uniform_double() draws from an MRG32k3a state. */
134
+ double x, y;
135
+ double2 result;
136
+ x = curand_uniform_double(state); /* first draw: radius input */
137
+ y = curand_uniform_double(state) * 2.0; /* second draw doubled because y is later multiplied by pi */
138
+
139
+ double s = sqrt(-2.0 * log(x));
140
+ #if __CUDA_ARCH__ > 0
141
+ sincospi(y, &result.x, &result.y); /* device build: sine and cosine of y*pi in one call */
142
+ #else
143
+ result.x = sin(y*CURAND_PI_DOUBLE); /* host build: multiply by pi explicitly */
144
+ result.y = cos(y*CURAND_PI_DOUBLE);
145
+ #endif
146
+ result.x *= s;
147
+ result.y *= s;
148
+ return result;
149
+ }
150
+
151
+ template <typename R>
152
+ QUALIFIERS float2 curand_box_muller(R *state)
153
+ {
154
+ float2 result;
155
+ unsigned int x = curand(state);
156
+ unsigned int y = curand(state);
157
+ result = _curand_box_muller(x, y);
158
+ return result;
159
+ }
160
+
161
+ template <typename R>
162
+ QUALIFIERS float4 curand_box_muller4(R *state)
163
+ {
164
+ float4 result;
165
+ float2 _result;
166
+ uint4 x = curand4(state);
167
+ //unsigned int y = curand(state);
168
+ _result = _curand_box_muller(x.x, x.y);
169
+ result.x = _result.x;
170
+ result.y = _result.y;
171
+ _result = _curand_box_muller(x.z, x.w);
172
+ result.z = _result.x;
173
+ result.w = _result.y;
174
+ return result;
175
+ }
176
+
177
+ template <typename R>
178
+ QUALIFIERS double2 curand_box_muller_double(R *state)
179
+ {
180
+ double2 result;
181
+ unsigned int x0 = curand(state);
182
+ unsigned int x1 = curand(state);
183
+ unsigned int y0 = curand(state);
184
+ unsigned int y1 = curand(state);
185
+ result = _curand_box_muller_double(x0, x1, y0, y1);
186
+ return result;
187
+ }
188
+
189
+ template <typename R>
190
+ QUALIFIERS double2 curand_box_muller2_double(R *state)
191
+ {
192
+ double2 result;
193
+ uint4 _x;
194
+ _x = curand4(state);
195
+ result = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
196
+ return result;
197
+ }
198
+
199
+
200
+ template <typename R>
201
+ QUALIFIERS double4 curand_box_muller4_double(R *state)
202
+ {
203
+ double4 result;
204
+ double2 _res1;
205
+ double2 _res2;
206
+ uint4 _x;
207
+ uint4 _y;
208
+ _x = curand4(state);
209
+ _y = curand4(state);
210
+ _res1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
211
+ _res2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
212
+ result.x = _res1.x;
213
+ result.y = _res1.y;
214
+ result.z = _res2.x;
215
+ result.w = _res2.y;
216
+ return result;
217
+ }
218
+
219
+ //QUALIFIERS float _curand_normal_icdf(unsigned int x)
220
+ //{
221
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
222
+ // float s = CURAND_SQRT2;
223
+ // // Mirror to avoid loss of precision
224
+ // if(x > 0x80000000UL) {
225
+ // x = 0xffffffffUL - x;
226
+ // s = -s;
227
+ // }
228
+ // float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
229
+ // // p is in (0, 0.5], 2p is in (0, 1]
230
+ // return s * erfcinvf(2.0f * p);
231
+ //#else
232
+ // x++; //suppress warnings
233
+ // return 0.0f;
234
+ //#endif
235
+ //}
236
+ //
237
+ //QUALIFIERS float _curand_normal_icdf(unsigned long long x)
238
+ //{
239
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
240
+ // unsigned int t = (unsigned int)(x >> 32);
241
+ // float s = CURAND_SQRT2;
242
+ // // Mirror to avoid loss of precision
243
+ // if(t > 0x80000000UL) {
244
+ // t = 0xffffffffUL - t;
245
+ // s = -s;
246
+ // }
247
+ // float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
248
+ // // p is in (0, 0.5], 2p is in (0, 1]
249
+ // return s * erfcinvf(2.0f * p);
250
+ //#else
251
+ // x++;
252
+ // return 0.0f;
253
+ //#endif
254
+ //}
255
+ //
256
+ //QUALIFIERS double _curand_normal_icdf_double(unsigned int x)
257
+ //{
258
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
259
+ // double s = CURAND_SQRT2_DOUBLE;
260
+ // // Mirror to avoid loss of precision
261
+ // if(x > 0x80000000UL) {
262
+ // x = 0xffffffffUL - x;
263
+ // s = -s;
264
+ // }
265
+ // double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
266
+ // // p is in (0, 0.5], 2p is in (0, 1]
267
+ // return s * erfcinv(2.0 * p);
268
+ //#else
269
+ // x++;
270
+ // return 0.0;
271
+ //#endif
272
+ //}
273
+ //
274
+ //QUALIFIERS double _curand_normal_icdf_double(unsigned long long x)
275
+ //{
276
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
277
+ // double s = CURAND_SQRT2_DOUBLE;
278
+ // x >>= 11;
279
+ // // Mirror to avoid loss of precision
280
+ // if(x > 0x10000000000000UL) {
281
+ // x = 0x1fffffffffffffUL - x;
282
+ // s = -s;
283
+ // }
284
+ // double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
285
+ // // p is in (0, 0.5], 2p is in (0, 1]
286
+ // return s * erfcinv(2.0 * p);
287
+ //#else
288
+ // x++;
289
+ // return 0.0;
290
+ //#endif
291
+ //}
292
+ //
293
+
294
+ /**
295
+ * \brief Return a normally distributed float from an XORWOW generator.
296
+ *
297
+ * Return a single normally distributed float with mean \p 0.0f and
298
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
299
+ * increment position of generator by one.
300
+ *
301
+ * The implementation uses a Box-Muller transform to generate two
302
+ * normally distributed results, then returns them one at a time.
303
+ * See ::curand_normal2() for a more efficient version that returns
304
+ * both results at once.
305
+ *
306
+ * \param state - Pointer to state to update
307
+ *
308
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
309
+ */
310
+ QUALIFIERS float curand_normal(curandStateXORWOW_t *state)
311
+ {
312
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
313
+ unsigned int x, y;
314
+ x = curand(state);
315
+ y = curand(state);
316
+ float2 v = _curand_box_muller(x, y);
317
+ state->boxmuller_extra = v.y;
318
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
319
+ return v.x;
320
+ }
321
+ state->boxmuller_flag = 0;
322
+ return state->boxmuller_extra;
323
+ }
324
+
325
+ /**
326
+ * \brief Return a normally distributed float from a Philox4_32_10 generator.
327
+ *
328
+ * Return a single normally distributed float with mean \p 0.0f and
329
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
330
+ * increment position of generator by one.
331
+ *
332
+ * The implementation uses a Box-Muller transform to generate two
333
+ * normally distributed results, then returns them one at a time.
334
+ * See ::curand_normal2() for a more efficient version that returns
335
+ * both results at once.
336
+ *
337
+ * \param state - Pointer to state to update
338
+ *
339
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
340
+ */
341
+
342
+ QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state)
343
+ {
344
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
345
+ unsigned int x, y;
346
+ x = curand(state);
347
+ y = curand(state);
348
+ float2 v = _curand_box_muller(x, y);
349
+ state->boxmuller_extra = v.y;
350
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
351
+ return v.x;
352
+ }
353
+ state->boxmuller_flag = 0;
354
+ return state->boxmuller_extra;
355
+ }
356
+
357
+
358
+
359
+ /**
360
+ * \brief Return a normally distributed float from an MRG32k3a generator.
361
+ *
362
+ * Return a single normally distributed float with mean \p 0.0f and
363
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
364
+ * increment position of generator by one.
365
+ *
366
+ * The implementation uses a Box-Muller transform to generate two
367
+ * normally distributed results, then returns them one at a time.
368
+ * See ::curand_normal2() for a more efficient version that returns
369
+ * both results at once.
370
+ *
371
+ * \param state - Pointer to state to update
372
+ *
373
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
374
+ */
375
+ QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state)
376
+ {
377
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
378
+ float2 v = curand_box_muller_mrg(state);
379
+ state->boxmuller_extra = v.y;
380
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
381
+ return v.x;
382
+ }
383
+ state->boxmuller_flag = 0;
384
+ return state->boxmuller_extra;
385
+ }
386
+
387
+ /**
388
+ * \brief Return two normally distributed floats from an XORWOW generator.
389
+ *
390
+ * Return two normally distributed floats with mean \p 0.0f and
391
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
392
+ * increment position of generator by two.
393
+ *
394
+ * The implementation uses a Box-Muller transform to generate two
395
+ * normally distributed results.
396
+ *
397
+ * \param state - Pointer to state to update
398
+ *
399
+ * \return Normally distributed float2 where each element is from a
400
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
401
+ */
402
+ QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state)
403
+ {
404
+ return curand_box_muller(state);
405
+ }
406
+ /**
407
+ * \brief Return two normally distributed floats from a Philox4_32_10 generator.
408
+ *
409
+ * Return two normally distributed floats with mean \p 0.0f and
410
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
411
+ * increment position of generator by two.
412
+ *
413
+ * The implementation uses a Box-Muller transform to generate two
414
+ * normally distributed results.
415
+ *
416
+ * \param state - Pointer to state to update
417
+ *
418
+ * \return Normally distributed float2 where each element is from a
419
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
420
+ */
421
+ QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state)
422
+ {
423
+ return curand_box_muller(state);
424
+ }
425
+
426
+ /**
427
+ * \brief Return four normally distributed floats from a Philox4_32_10 generator.
428
+ *
429
+ * Return four normally distributed floats with mean \p 0.0f and
430
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
431
+ * increment position of generator by four.
432
+ *
433
+ * The implementation applies the Box-Muller transform twice to generate four
434
+ * normally distributed results.
435
+ *
436
+ * \param state - Pointer to state to update
437
+ *
438
+ * \return Normally distributed float4 where each element is from a
439
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
440
+ */
441
+ QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state)
442
+ {
443
+ return curand_box_muller4(state);
444
+ }
445
+
446
+
447
+
448
+ /**
449
+ * \brief Return two normally distributed floats from an MRG32k3a generator.
450
+ *
451
+ * Return two normally distributed floats with mean \p 0.0f and
452
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
453
+ * increment position of generator by two.
454
+ *
455
+ * The implementation uses a Box-Muller transform to generate two
456
+ * normally distributed results.
457
+ *
458
+ * \param state - Pointer to state to update
459
+ *
460
+ * \return Normally distributed float2 where each element is from a
461
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
462
+ */
463
+ QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state)
464
+ {
465
+ return curand_box_muller_mrg(state);
466
+ }
467
+
468
+ /**
469
+ * \brief Return a normally distributed float from an MTGP32 generator.
470
+ *
471
+ * Return a single normally distributed float with mean \p 0.0f and
472
+ * standard deviation \p 1.0f from the MTGP32 generator in \p state,
473
+ * increment position of generator.
474
+ *
475
+ * The implementation uses the inverse cumulative distribution function
476
+ * to generate normally distributed results.
477
+ *
478
+ * \param state - Pointer to state to update
479
+ *
480
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
481
+ */
482
+ QUALIFIERS float curand_normal(curandStateMtgp32_t *state)
483
+ {
484
+ return _curand_normal_icdf(curand(state));
485
+ }
486
+ /**
487
+ * \brief Return a normally distributed float from a Sobol32 generator.
488
+ *
489
+ * Return a single normally distributed float with mean \p 0.0f and
490
+ * standard deviation \p 1.0f from the Sobol32 generator in \p state,
491
+ * increment position of generator by one.
492
+ *
493
+ * The implementation uses the inverse cumulative distribution function
494
+ * to generate normally distributed results.
495
+ *
496
+ * \param state - Pointer to state to update
497
+ *
498
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
499
+ */
500
+ QUALIFIERS float curand_normal(curandStateSobol32_t *state)
501
+ {
502
+ return _curand_normal_icdf(curand(state));
503
+ }
504
+
505
+ /**
506
+ * \brief Return a normally distributed float from a scrambled Sobol32 generator.
507
+ *
508
+ * Return a single normally distributed float with mean \p 0.0f and
509
+ * standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state,
510
+ * increment position of generator by one.
511
+ *
512
+ * The implementation uses the inverse cumulative distribution function
513
+ * to generate normally distributed results.
514
+ *
515
+ * \param state - Pointer to state to update
516
+ *
517
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
518
+ */
519
+ QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state)
520
+ {
521
+ return _curand_normal_icdf(curand(state));
522
+ }
523
+
524
+ /**
525
+ * \brief Return a normally distributed float from a Sobol64 generator.
526
+ *
527
+ * Return a single normally distributed float with mean \p 0.0f and
528
+ * standard deviation \p 1.0f from the Sobol64 generator in \p state,
529
+ * increment position of generator by one.
530
+ *
531
+ * The implementation uses the inverse cumulative distribution function
532
+ * to generate normally distributed results.
533
+ *
534
+ * \param state - Pointer to state to update
535
+ *
536
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
537
+ */
538
+ QUALIFIERS float curand_normal(curandStateSobol64_t *state)
539
+ {
540
+ return _curand_normal_icdf(curand(state));
541
+ }
542
+
543
+ /**
544
+ * \brief Return a normally distributed float from a scrambled Sobol64 generator.
545
+ *
546
+ * Return a single normally distributed float with mean \p 0.0f and
547
+ * standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state,
548
+ * increment position of generator by one.
549
+ *
550
+ * The implementation uses the inverse cumulative distribution function
551
+ * to generate normally distributed results.
552
+ *
553
+ * \param state - Pointer to state to update
554
+ *
555
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
556
+ */
557
+ QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state)
558
+ {
559
+ return _curand_normal_icdf(curand(state));
560
+ }
561
+
562
+ /**
563
+ * \brief Return a normally distributed double from an XORWOW generator.
564
+ *
565
+ * Return a single normally distributed double with mean \p 0.0 and
566
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
567
+ * increment position of generator.
568
+ *
569
+ * The implementation uses a Box-Muller transform to generate two
570
+ * normally distributed results, then returns them one at a time.
571
+ * See ::curand_normal2_double() for a more efficient version that returns
572
+ * both results at once.
573
+ *
574
+ * \param state - Pointer to state to update
575
+ *
576
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
577
+ */
578
+ QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state)
579
+ {
580
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
581
+ unsigned int x0, x1, y0, y1;
582
+ x0 = curand(state);
583
+ x1 = curand(state);
584
+ y0 = curand(state);
585
+ y1 = curand(state);
586
+ double2 v = _curand_box_muller_double(x0, x1, y0, y1);
587
+ state->boxmuller_extra_double = v.y;
588
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
589
+ return v.x;
590
+ }
591
+ state->boxmuller_flag_double = 0;
592
+ return state->boxmuller_extra_double;
593
+ }
594
+
595
+ /**
596
+ * \brief Return a normally distributed double from a Philox4_32_10 generator.
597
+ *
598
+ * Return a single normally distributed double with mean \p 0.0 and
599
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
600
+ * increment position of generator.
601
+ *
602
+ * The implementation uses a Box-Muller transform to generate two
603
+ * normally distributed results, then returns them one at a time.
604
+ * See ::curand_normal2_double() for a more efficient version that returns
605
+ * both results at once.
606
+ *
607
+ * \param state - Pointer to state to update
608
+ *
609
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
610
+ */
611
+
612
+ QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state)
613
+ {
614
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
615
+ uint4 _x;
616
+ _x = curand4(state);
617
+ double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
618
+ state->boxmuller_extra_double = v.y;
619
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
620
+ return v.x;
621
+ }
622
+ state->boxmuller_flag_double = 0;
623
+ return state->boxmuller_extra_double;
624
+ }
625
+
626
+
627
+ /**
628
+ * \brief Return a normally distributed double from an MRG32k3a generator.
629
+ *
630
+ * Return a single normally distributed double with mean \p 0.0 and
631
+ * standard deviation \p 1.0 from the MRG32k3a generator in \p state,
632
+ * increment position of generator.
633
+ *
634
+ * The implementation uses a Box-Muller transform to generate two
635
+ * normally distributed results, then returns them one at a time.
636
+ * See ::curand_normal2_double() for a more efficient version that returns
637
+ * both results at once.
638
+ *
639
+ * \param state - Pointer to state to update
640
+ *
641
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
642
+ */
643
+ QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state)
644
+ {
645
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
646
+ double2 v = curand_box_muller_mrg_double(state);
647
+ state->boxmuller_extra_double = v.y;
648
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
649
+ return v.x;
650
+ }
651
+ state->boxmuller_flag_double = 0;
652
+ return state->boxmuller_extra_double;
653
+ }
654
+
655
+ /**
656
+ * \brief Return two normally distributed doubles from an XORWOW generator.
657
+ *
658
+ * Return two normally distributed doubles with mean \p 0.0 and
659
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
660
+ * increment position of generator by 2.
661
+ *
662
+ * The implementation uses a Box-Muller transform to generate two
663
+ * normally distributed results.
664
+ *
665
+ * \param state - Pointer to state to update
666
+ *
667
+ * \return Normally distributed double2 where each element is from a
668
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
669
+ */
670
+ QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state)
671
+ {
672
+ return curand_box_muller_double(state);
673
+ }
674
+
675
+ /**
676
+ * \brief Return two normally distributed doubles from a Philox4_32_10 generator.
677
+ *
678
+ * Return two normally distributed doubles with mean \p 0.0 and
679
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
680
+ * increment position of generator by 2.
681
+ *
682
+ * The implementation uses a Box-Muller transform to generate two
683
+ * normally distributed results.
684
+ *
685
+ * \param state - Pointer to state to update
686
+ *
687
+ * \return Normally distributed double2 where each element is from a
688
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
689
+ */
690
+ QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state)
691
+ {
692
+ uint4 _x;
693
+ double2 result;
694
+
695
+ _x = curand4(state);
696
+ double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
697
+ result.x = v1.x;
698
+ result.y = v1.y;
699
+
700
+ return result;
701
+ }
702
+
703
+ // not a part of API
704
+ QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state)
705
+ {
706
+ uint4 _x;
707
+ uint4 _y;
708
+ double4 result;
709
+
710
+ _x = curand4(state);
711
+ _y = curand4(state);
712
+ double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
713
+ double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
714
+ result.x = v1.x;
715
+ result.y = v1.y;
716
+ result.z = v2.x;
717
+ result.w = v2.y;
718
+
719
+ return result;
720
+ }
721
+
722
+
723
+ /**
724
+ * \brief Return two normally distributed doubles from an MRG32k3a generator.
725
+ *
726
+ * Return two normally distributed doubles with mean \p 0.0 and
727
+ * standard deviation \p 1.0 from the MRG32k3a generator in \p state,
728
+ * increment position of generator.
729
+ *
730
+ * The implementation uses a Box-Muller transform to generate two
731
+ * normally distributed results.
732
+ *
733
+ * \param state - Pointer to state to update
734
+ *
735
+ * \return Normally distributed double2 where each element is from a
736
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
737
+ */
738
+ QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state)
739
+ {
740
+ return curand_box_muller_mrg_double(state);
741
+ }
742
+
743
+ /**
744
+ * \brief Return a normally distributed double from an MTGP32 generator.
745
+ *
746
+ * Return a single normally distributed double with mean \p 0.0 and
747
+ * standard deviation \p 1.0 from the MTGP32 generator in \p state,
748
+ * increment position of generator.
749
+ *
750
+ * The implementation uses the inverse cumulative distribution function
751
+ * to generate normally distributed results.
752
+ *
753
+ * \param state - Pointer to state to update
754
+ *
755
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
756
+ */
757
+ QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state)
758
+ {
759
+ return _curand_normal_icdf_double(curand(state));
760
+ }
761
+
762
+ /**
763
+ * \brief Return a normally distributed double from a Sobol32 generator.
764
+ *
765
+ * Return a single normally distributed double with mean \p 0.0 and
766
+ * standard deviation \p 1.0 from the Sobol32 generator in \p state,
767
+ * increment position of generator by one.
768
+ *
769
+ * The implementation uses the inverse cumulative distribution function
770
+ * to generate normally distributed results.
771
+ *
772
+ * \param state - Pointer to state to update
773
+ *
774
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
775
+ */
776
+ QUALIFIERS double curand_normal_double(curandStateSobol32_t *state)
777
+ {
778
+ return _curand_normal_icdf_double(curand(state));
779
+ }
780
+
781
+ /**
782
+ * \brief Return a normally distributed double from a scrambled Sobol32 generator.
783
+ *
784
+ * Return a single normally distributed double with mean \p 0.0 and
785
+ * standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state,
786
+ * increment position of generator by one.
787
+ *
788
+ * The implementation uses the inverse cumulative distribution function
789
+ * to generate normally distributed results.
790
+ *
791
+ * \param state - Pointer to state to update
792
+ *
793
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
794
+ */
795
+ QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state)
796
+ {
797
+ return _curand_normal_icdf_double(curand(state));
798
+ }
799
+
800
+ /**
801
+ * \brief Return a normally distributed double from a Sobol64 generator.
802
+ *
803
+ * Return a single normally distributed double with mean \p 0.0 and
804
+ * standard deviation \p 1.0 from the Sobol64 generator in \p state,
805
+ * increment position of generator by one.
806
+ *
807
+ * The implementation uses the inverse cumulative distribution function
808
+ * to generate normally distributed results.
809
+ *
810
+ * \param state - Pointer to state to update
811
+ *
812
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
813
+ */
814
+ QUALIFIERS double curand_normal_double(curandStateSobol64_t *state)
815
+ {
816
+ return _curand_normal_icdf_double(curand(state));
817
+ }
818
+
819
+ /**
820
+ * \brief Return a normally distributed double from a scrambled Sobol64 generator.
821
+ *
822
+ * Return a single normally distributed double with mean \p 0.0 and
823
+ * standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state,
824
+ * increment position of generator by one.
825
+ *
826
+ * The implementation uses the inverse cumulative distribution function
827
+ * to generate normally distributed results.
828
+ *
829
+ * \param state - Pointer to state to update
830
+ *
831
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
832
+ */
833
+ QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state)
834
+ {
835
+ return _curand_normal_icdf_double(curand(state));
836
+ }
837
+ #endif // !defined(CURAND_NORMAL_H_)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (212 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* This file was procedurally generated! Do not modify this file by hand. */
2
+
3
+ /*
4
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
5
+ *
6
+ * NOTICE TO USER:
7
+ *
8
+ * This source code is subject to NVIDIA ownership rights under U.S. and
9
+ * international Copyright laws.
10
+ *
11
+ * This software and the information contained herein is PROPRIETARY and
12
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
13
+ * of a form of NVIDIA software license agreement.
14
+ *
15
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
16
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
17
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
18
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
19
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
20
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
21
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
22
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
23
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
24
+ * OR PERFORMANCE OF THIS SOURCE CODE.
25
+ *
26
+ * U.S. Government End Users. This source code is a "commercial item" as
27
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
28
+ * "commercial computer software" and "commercial computer software
29
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
30
+ * and is provided to the U.S. Government only as a commercial end item.
31
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
32
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
33
+ * source code with only those rights set forth herein.
34
+ *
35
+ * Any use of this source code in individual and commercial software must
36
+ * include, in the user documentation and internal comments to the code,
37
+ * the above Disclaimer and U.S. Government End Users Notice.
38
+ */
39
+
40
+ #ifndef NVTX_IMPL_GUARD
41
+ #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
42
+ #endif
43
+
44
+ /* ---- Include required platform headers ---- */
45
+
46
+ #if defined(_WIN32)
47
+
48
+ #include <Windows.h>
49
+
50
+ #else
51
+ #include <unistd.h>
52
+
53
+ #if defined(__ANDROID__)
54
+ #include <android/api-level.h>
55
+ #endif
56
+
57
+ #if defined(__linux__) || defined(__CYGWIN__)
58
+ #include <sched.h>
59
+ #endif
60
+
61
+ #include <limits.h>
62
+ #include <dlfcn.h>
63
+ #include <fcntl.h>
64
+ #include <stdlib.h>
65
+ #include <stdio.h>
66
+ #include <sys/types.h>
67
+ #include <unistd.h>
68
+ #include <errno.h>
69
+
70
+ #include <string.h>
71
+ #include <sys/types.h>
72
+ #include <pthread.h>
73
+ #include <stdlib.h>
74
+ #include <wchar.h>
75
+
76
+ #endif
77
+
78
+ /* ---- Define macros used in this file ---- */
79
+
80
+ #define NVTX_INIT_STATE_FRESH 0
81
+ #define NVTX_INIT_STATE_STARTED 1
82
+ #define NVTX_INIT_STATE_COMPLETE 2
83
+
84
+ #ifdef NVTX_DEBUG_PRINT
85
+ #ifdef __ANDROID__
86
+ #include <android/log.h>
87
+ #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
88
+ #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
89
+ #else
90
+ #include <stdio.h>
91
+ #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
92
+ #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
93
+ #endif
94
+ #else /* !defined(NVTX_DEBUG_PRINT) */
95
+ #define NVTX_ERR(...)
96
+ #define NVTX_INFO(...)
97
+ #endif
98
+
99
+ #ifdef __cplusplus
100
+ extern "C" {
101
+ #endif /* __cplusplus */
102
+
103
+ #ifdef __GNUC__
104
+ #pragma GCC visibility push(hidden)
105
+ #endif
106
+
107
+ /* ---- Forward declare all functions referenced in globals ---- */
108
+
109
+ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
110
+ NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
111
+ NvtxCallbackModule module,
112
+ NvtxFunctionTable* out_table,
113
+ unsigned int* out_size);
114
+ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
115
+ uint32_t version);
116
+ NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
117
+ uint32_t exportTableId);
118
+
119
+ #include "nvtxInitDecls.h"
120
+
121
+ /* ---- Define all globals ---- */
122
+
123
/* All NVTX state kept by this header, gathered in one struct: the init-state
 * flag, the two export tables returned by nvtxGetExportTable, one
 * function-pointer slot per API entry point, and per-module tables holding the
 * addresses of those slots.
 * The single instance is initialized positionally, so members must not be
 * reordered, inserted, or removed without updating that initializer. */
+ typedef struct nvtxGlobals_t
124
+ {
125
+ volatile unsigned int initState; /* starts as NVTX_INIT_STATE_FRESH */
126
+ NvtxExportTableCallbacks etblCallbacks; /* returned for NVTX_ETID_CALLBACKS */
127
+ NvtxExportTableVersionInfo etblVersionInfo; /* returned for NVTX_ETID_VERSIONINFO */
128
+
129
+ /* Implementation function pointers */
130
+ nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
131
+ nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
132
+ nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
133
+ nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
134
+ nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
135
+ nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
136
+ nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
137
+ nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
138
+ nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
139
+ nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
140
+ nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
141
+ nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
142
+ nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
143
+ nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
144
+ nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
145
+
/* Slots addressed from functionTable_CUDA.
 * NOTE(review): the *_fakeimpl_fntype types presumably stand in for signatures
 * that would otherwise need the CUDA/OpenCL headers -- confirm in the type
 * declarations. */
146
+ nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
147
+ nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
148
+ nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
149
+ nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
150
+ nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
151
+ nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
152
+ nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
153
+ nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
154
+
/* Slots addressed from functionTable_OPENCL. */
155
+ nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
156
+ nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
157
+ nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
158
+ nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
159
+ nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
160
+ nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
161
+ nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
162
+ nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
163
+ nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
164
+ nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
165
+ nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
166
+ nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
167
+ nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
168
+ nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
169
+
/* Slots addressed from functionTable_CUDART. The two device-naming slots use
 * the regular *_impl_fntype; the stream/event slots use *_fakeimpl_fntype. */
170
+ nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
171
+ nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
172
+ nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
173
+ nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
174
+ nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
175
+ nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
176
+
/* Slots addressed from functionTable_CORE2. */
177
+ nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
178
+ nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
179
+ nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
180
+ nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
181
+ nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
182
+ nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
183
+ nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
184
+ nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
185
+ nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
186
+ nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
187
+ nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
188
+ nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
189
+ nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
190
+ nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
191
+ nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
192
+
/* Slots addressed from functionTable_SYNC. */
193
+ nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
194
+ nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
195
+ nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
196
+ nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
197
+ nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
198
+ nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
199
+
200
+ /* Tables of function pointers -- Extra null added to the end to ensure
201
+ * a crash instead of silent corruption if a tool reads off the end. */
/* Each entry is the address of one of the *_impl_fnptr slots above (or null),
 * not a function address itself. */
202
+ NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
203
+ NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
204
+ NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
205
+ NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
206
+ NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
207
+ NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
208
+ } nvtxGlobals_t;
209
+
210
/* Definition of the one nvtxGlobals_t instance. The initializer is positional:
 * entries must appear in exactly the member order of nvtxGlobals_t.
 * Every *_impl_fnptr slot starts out pointing at the matching *_impl_init
 * function. NOTE(review): those init functions presumably perform lazy
 * initialization on first call -- confirm in nvtxInitDefs.h. */
+ NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
211
+ {
212
+ NVTX_INIT_STATE_FRESH, /* initState */
213
+
214
+ { /* etblCallbacks */
215
+ sizeof(NvtxExportTableCallbacks),
216
+ NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
217
+ },
218
+ { /* etblVersionInfo */
219
+ sizeof(NvtxExportTableVersionInfo),
220
+ NVTX_VERSION,
221
+ 0,
222
+ NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
223
+ },
224
+
225
+ /* Implementation function pointers */
226
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
227
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
228
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
229
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
230
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
231
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
232
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
233
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
234
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
235
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
236
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
237
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
238
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
239
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
240
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
241
+
242
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
243
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
244
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
245
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
246
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
247
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
248
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
249
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
250
+
251
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
252
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
253
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
254
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
255
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
256
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
257
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
258
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
259
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
260
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
261
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
262
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
263
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
264
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
265
+
266
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
267
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
268
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
269
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
270
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
271
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
272
+
273
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
274
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
275
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
276
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
277
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
278
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
279
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
280
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
281
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
282
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
283
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
284
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
285
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
286
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
287
+ NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
288
+
289
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
290
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
291
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
292
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
293
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
294
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
295
+
296
+ /* Tables of function pointers */
/* Each table holds the addresses of the slots in this same object, so a write
 * through a table entry changes the corresponding *_impl_fnptr member.
 * Entry 0 of every table is null. NOTE(review): presumably because callback
 * ID 0 is reserved as invalid -- confirm against the NVTX_CBID_* enums.
 * The final 0 in each table is the extra trailing null described on the struct. */
297
+ { /* functionTable_CORE */
298
+ 0,
299
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
300
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
301
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
302
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
303
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
304
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
305
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
306
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
307
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
308
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
309
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
310
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
311
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
312
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
313
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
314
+ 0
315
+ },
316
+ { /* functionTable_CUDA */
317
+ 0,
318
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
319
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
320
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
321
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
322
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
323
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
324
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
325
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
326
+ 0
327
+ },
328
+ { /* functionTable_OPENCL */
329
+ 0,
330
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
331
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
332
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
333
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
334
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
335
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
336
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
337
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
338
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
339
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
340
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
341
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
342
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
343
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
344
+ 0
345
+ },
346
+ { /* functionTable_CUDART */
347
+ 0,
348
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
349
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
350
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
351
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
352
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
353
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
354
+ 0
355
+ },
356
+ { /* functionTable_CORE2 */
357
+ 0,
358
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
359
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
360
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
361
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
362
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
363
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
364
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
365
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
366
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
367
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
368
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
369
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
370
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
371
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
372
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
373
+ 0
374
+ },
375
+ { /* functionTable_SYNC */
376
+ 0,
377
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
378
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
379
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
380
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
381
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
382
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
383
+ 0
384
+ }
385
+ };
386
+
387
+ /* ---- Define static inline implementations of core API functions ---- */
388
+
389
+ #include "nvtxImplCore.h"
390
+
391
+ /* ---- Define implementations of export table functions ---- */
392
+
393
+ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
394
+ NvtxCallbackModule module,
395
+ NvtxFunctionTable* out_table,
396
+ unsigned int* out_size)
397
+ {
398
+ unsigned int bytes = 0;
399
+ NvtxFunctionTable table = (NvtxFunctionTable)0;
400
+
401
+ switch (module)
402
+ {
403
+ case NVTX_CB_MODULE_CORE:
404
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
405
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
406
+ break;
407
+ case NVTX_CB_MODULE_CUDA:
408
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
409
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
410
+ break;
411
+ case NVTX_CB_MODULE_OPENCL:
412
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
413
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
414
+ break;
415
+ case NVTX_CB_MODULE_CUDART:
416
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
417
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
418
+ break;
419
+ case NVTX_CB_MODULE_CORE2:
420
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
421
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
422
+ break;
423
+ case NVTX_CB_MODULE_SYNC:
424
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
425
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
426
+ break;
427
+ default: return 0;
428
+ }
429
+
430
+ if (out_size)
431
+ *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
432
+
433
+ if (out_table)
434
+ *out_table = table;
435
+
436
+ return 1;
437
+ }
438
+
439
+ NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
440
+ {
441
+ switch (exportTableId)
442
+ {
443
+ case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
444
+ case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
445
+ default: return 0;
446
+ }
447
+ }
448
+
449
+ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
450
+ {
451
+ /* Reserved for custom implementations to resolve problems with tools */
452
+ (void)version;
453
+ }
454
+
455
+ /* ---- Define implementations of init versions of all API functions ---- */
456
+
457
+ #include "nvtxInitDefs.h"
458
+
459
+ /* ---- Define implementations of initialization functions ---- */
460
+
461
+ #include "nvtxInit.h"
462
+
463
+ #ifdef __GNUC__
464
+ #pragma GCC visibility pop
465
+ #endif
466
+
467
+ #ifdef __cplusplus
468
+ } /* extern "C" */
469
+ #endif /* __cplusplus */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (216 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 ADDED
Binary file (40.1 kB). View file