Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py +30 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py +47 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py +16 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py +587 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py +591 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py +38 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py +8 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h +78 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h +282 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h +100 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp +85 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h +116 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp +316 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h +540 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h +600 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h +1183 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h +1183 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h +501 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h +322 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h +87 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h +253 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h +210 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h +385 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h +837 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h +469 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1 +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/cython.cpython-311.pyc
ADDED
|
Binary file (863 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/__pycache__/isympy.cpython-311.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_soft.cpython-311.pyc
ADDED
|
Binary file (2.71 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_error.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Timeout(TimeoutError): # noqa: N818
|
| 7 |
+
"""Raised when the lock could not be acquired in *timeout* seconds."""
|
| 8 |
+
|
| 9 |
+
def __init__(self, lock_file: str) -> None:
|
| 10 |
+
super().__init__()
|
| 11 |
+
self._lock_file = lock_file
|
| 12 |
+
|
| 13 |
+
def __reduce__(self) -> str | tuple[Any, ...]:
|
| 14 |
+
return self.__class__, (self._lock_file,) # Properly pickle the exception
|
| 15 |
+
|
| 16 |
+
def __str__(self) -> str:
|
| 17 |
+
return f"The file lock '{self._lock_file}' could not be acquired."
|
| 18 |
+
|
| 19 |
+
def __repr__(self) -> str:
|
| 20 |
+
return f"{self.__class__.__name__}({self.lock_file!r})"
|
| 21 |
+
|
| 22 |
+
@property
|
| 23 |
+
def lock_file(self) -> str:
|
| 24 |
+
""":return: The path of the file lock."""
|
| 25 |
+
return self._lock_file
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
__all__ = [
|
| 29 |
+
"Timeout",
|
| 30 |
+
]
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_soft.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from contextlib import suppress
|
| 6 |
+
from errno import EACCES, EEXIST
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
from ._api import BaseFileLock
|
| 10 |
+
from ._util import ensure_directory_exists, raise_on_not_writable_file
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SoftFileLock(BaseFileLock):
|
| 14 |
+
"""Simply watches the existence of the lock file."""
|
| 15 |
+
|
| 16 |
+
def _acquire(self) -> None:
|
| 17 |
+
raise_on_not_writable_file(self.lock_file)
|
| 18 |
+
ensure_directory_exists(self.lock_file)
|
| 19 |
+
# first check for exists and read-only mode as the open will mask this case as EEXIST
|
| 20 |
+
flags = (
|
| 21 |
+
os.O_WRONLY # open for writing only
|
| 22 |
+
| os.O_CREAT
|
| 23 |
+
| os.O_EXCL # together with above raise EEXIST if the file specified by filename exists
|
| 24 |
+
| os.O_TRUNC # truncate the file to zero byte
|
| 25 |
+
)
|
| 26 |
+
try:
|
| 27 |
+
file_handler = os.open(self.lock_file, flags, self._context.mode)
|
| 28 |
+
except OSError as exception: # re-raise unless expected exception
|
| 29 |
+
if not (
|
| 30 |
+
exception.errno == EEXIST # lock already exist
|
| 31 |
+
or (exception.errno == EACCES and sys.platform == "win32") # has no access to this lock
|
| 32 |
+
): # pragma: win32 no cover
|
| 33 |
+
raise
|
| 34 |
+
else:
|
| 35 |
+
self._context.lock_file_fd = file_handler
|
| 36 |
+
|
| 37 |
+
def _release(self) -> None:
|
| 38 |
+
assert self._context.lock_file_fd is not None # noqa: S101
|
| 39 |
+
os.close(self._context.lock_file_fd) # the lock file is definitely not None
|
| 40 |
+
self._context.lock_file_fd = None
|
| 41 |
+
with suppress(OSError): # the file is already deleted and that's what we want
|
| 42 |
+
Path(self.lock_file).unlink()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
__all__ = [
|
| 46 |
+
"SoftFileLock",
|
| 47 |
+
]
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/py.typed
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/version.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# file generated by setuptools_scm
|
| 2 |
+
# don't change, don't track in version control
|
| 3 |
+
TYPE_CHECKING = False
|
| 4 |
+
if TYPE_CHECKING:
|
| 5 |
+
from typing import Tuple, Union
|
| 6 |
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
| 7 |
+
else:
|
| 8 |
+
VERSION_TUPLE = object
|
| 9 |
+
|
| 10 |
+
version: str
|
| 11 |
+
__version__: str
|
| 12 |
+
__version_tuple__: VERSION_TUPLE
|
| 13 |
+
version_tuple: VERSION_TUPLE
|
| 14 |
+
|
| 15 |
+
__version__ = version = '3.13.1'
|
| 16 |
+
__version_tuple__ = version_tuple = (3, 13, 1)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (2.25 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (32.1 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (15 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/common.cpython-311.pyc
ADDED
|
Binary file (2.32 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/copy.cpython-311.pyc
ADDED
|
Binary file (26.6 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/__pycache__/put.cpython-311.pyc
ADDED
|
Binary file (27.8 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/get.py
ADDED
|
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from hashlib import md5
|
| 2 |
+
from itertools import product
|
| 3 |
+
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from fsspec.implementations.local import make_path_posix
|
| 7 |
+
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class AbstractGetTests:
|
| 11 |
+
def test_get_file_to_existing_directory(
|
| 12 |
+
self,
|
| 13 |
+
fs,
|
| 14 |
+
fs_join,
|
| 15 |
+
fs_bulk_operations_scenario_0,
|
| 16 |
+
local_fs,
|
| 17 |
+
local_join,
|
| 18 |
+
local_target,
|
| 19 |
+
):
|
| 20 |
+
# Copy scenario 1a
|
| 21 |
+
source = fs_bulk_operations_scenario_0
|
| 22 |
+
|
| 23 |
+
target = local_target
|
| 24 |
+
local_fs.mkdir(target)
|
| 25 |
+
assert local_fs.isdir(target)
|
| 26 |
+
|
| 27 |
+
target_file2 = local_join(target, "file2")
|
| 28 |
+
target_subfile1 = local_join(target, "subfile1")
|
| 29 |
+
|
| 30 |
+
# Copy from source directory
|
| 31 |
+
fs.get(fs_join(source, "file2"), target)
|
| 32 |
+
assert local_fs.isfile(target_file2)
|
| 33 |
+
|
| 34 |
+
# Copy from sub directory
|
| 35 |
+
fs.get(fs_join(source, "subdir", "subfile1"), target)
|
| 36 |
+
assert local_fs.isfile(target_subfile1)
|
| 37 |
+
|
| 38 |
+
# Remove copied files
|
| 39 |
+
local_fs.rm([target_file2, target_subfile1])
|
| 40 |
+
assert not local_fs.exists(target_file2)
|
| 41 |
+
assert not local_fs.exists(target_subfile1)
|
| 42 |
+
|
| 43 |
+
# Repeat with trailing slash on target
|
| 44 |
+
fs.get(fs_join(source, "file2"), target + "/")
|
| 45 |
+
assert local_fs.isdir(target)
|
| 46 |
+
assert local_fs.isfile(target_file2)
|
| 47 |
+
|
| 48 |
+
fs.get(fs_join(source, "subdir", "subfile1"), target + "/")
|
| 49 |
+
assert local_fs.isfile(target_subfile1)
|
| 50 |
+
|
| 51 |
+
def test_get_file_to_new_directory(
|
| 52 |
+
self,
|
| 53 |
+
fs,
|
| 54 |
+
fs_join,
|
| 55 |
+
fs_bulk_operations_scenario_0,
|
| 56 |
+
local_fs,
|
| 57 |
+
local_join,
|
| 58 |
+
local_target,
|
| 59 |
+
):
|
| 60 |
+
# Copy scenario 1b
|
| 61 |
+
source = fs_bulk_operations_scenario_0
|
| 62 |
+
|
| 63 |
+
target = local_target
|
| 64 |
+
local_fs.mkdir(target)
|
| 65 |
+
|
| 66 |
+
fs.get(
|
| 67 |
+
fs_join(source, "subdir", "subfile1"), local_join(target, "newdir/")
|
| 68 |
+
) # Note trailing slash
|
| 69 |
+
|
| 70 |
+
assert local_fs.isdir(target)
|
| 71 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 72 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
| 73 |
+
|
| 74 |
+
def test_get_file_to_file_in_existing_directory(
|
| 75 |
+
self,
|
| 76 |
+
fs,
|
| 77 |
+
fs_join,
|
| 78 |
+
fs_bulk_operations_scenario_0,
|
| 79 |
+
local_fs,
|
| 80 |
+
local_join,
|
| 81 |
+
local_target,
|
| 82 |
+
):
|
| 83 |
+
# Copy scenario 1c
|
| 84 |
+
source = fs_bulk_operations_scenario_0
|
| 85 |
+
|
| 86 |
+
target = local_target
|
| 87 |
+
local_fs.mkdir(target)
|
| 88 |
+
|
| 89 |
+
fs.get(fs_join(source, "subdir", "subfile1"), local_join(target, "newfile"))
|
| 90 |
+
assert local_fs.isfile(local_join(target, "newfile"))
|
| 91 |
+
|
| 92 |
+
def test_get_file_to_file_in_new_directory(
|
| 93 |
+
self,
|
| 94 |
+
fs,
|
| 95 |
+
fs_join,
|
| 96 |
+
fs_bulk_operations_scenario_0,
|
| 97 |
+
local_fs,
|
| 98 |
+
local_join,
|
| 99 |
+
local_target,
|
| 100 |
+
):
|
| 101 |
+
# Copy scenario 1d
|
| 102 |
+
source = fs_bulk_operations_scenario_0
|
| 103 |
+
|
| 104 |
+
target = local_target
|
| 105 |
+
local_fs.mkdir(target)
|
| 106 |
+
|
| 107 |
+
fs.get(
|
| 108 |
+
fs_join(source, "subdir", "subfile1"),
|
| 109 |
+
local_join(target, "newdir", "newfile"),
|
| 110 |
+
)
|
| 111 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 112 |
+
assert local_fs.isfile(local_join(target, "newdir", "newfile"))
|
| 113 |
+
|
| 114 |
+
def test_get_directory_to_existing_directory(
|
| 115 |
+
self,
|
| 116 |
+
fs,
|
| 117 |
+
fs_join,
|
| 118 |
+
fs_bulk_operations_scenario_0,
|
| 119 |
+
local_fs,
|
| 120 |
+
local_join,
|
| 121 |
+
local_target,
|
| 122 |
+
):
|
| 123 |
+
# Copy scenario 1e
|
| 124 |
+
source = fs_bulk_operations_scenario_0
|
| 125 |
+
|
| 126 |
+
target = local_target
|
| 127 |
+
local_fs.mkdir(target)
|
| 128 |
+
assert local_fs.isdir(target)
|
| 129 |
+
|
| 130 |
+
for source_slash, target_slash in zip([False, True], [False, True]):
|
| 131 |
+
s = fs_join(source, "subdir")
|
| 132 |
+
if source_slash:
|
| 133 |
+
s += "/"
|
| 134 |
+
t = target + "/" if target_slash else target
|
| 135 |
+
|
| 136 |
+
# Without recursive does nothing
|
| 137 |
+
fs.get(s, t)
|
| 138 |
+
assert local_fs.ls(target) == []
|
| 139 |
+
|
| 140 |
+
# With recursive
|
| 141 |
+
fs.get(s, t, recursive=True)
|
| 142 |
+
if source_slash:
|
| 143 |
+
assert local_fs.isfile(local_join(target, "subfile1"))
|
| 144 |
+
assert local_fs.isfile(local_join(target, "subfile2"))
|
| 145 |
+
assert local_fs.isdir(local_join(target, "nesteddir"))
|
| 146 |
+
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
|
| 147 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 148 |
+
|
| 149 |
+
local_fs.rm(
|
| 150 |
+
[
|
| 151 |
+
local_join(target, "subfile1"),
|
| 152 |
+
local_join(target, "subfile2"),
|
| 153 |
+
local_join(target, "nesteddir"),
|
| 154 |
+
],
|
| 155 |
+
recursive=True,
|
| 156 |
+
)
|
| 157 |
+
else:
|
| 158 |
+
assert local_fs.isdir(local_join(target, "subdir"))
|
| 159 |
+
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
|
| 160 |
+
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
|
| 161 |
+
assert local_fs.isdir(local_join(target, "subdir", "nesteddir"))
|
| 162 |
+
assert local_fs.isfile(
|
| 163 |
+
local_join(target, "subdir", "nesteddir", "nestedfile")
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
local_fs.rm(local_join(target, "subdir"), recursive=True)
|
| 167 |
+
assert local_fs.ls(target) == []
|
| 168 |
+
|
| 169 |
+
# Limit recursive by maxdepth
|
| 170 |
+
fs.get(s, t, recursive=True, maxdepth=1)
|
| 171 |
+
if source_slash:
|
| 172 |
+
assert local_fs.isfile(local_join(target, "subfile1"))
|
| 173 |
+
assert local_fs.isfile(local_join(target, "subfile2"))
|
| 174 |
+
assert not local_fs.exists(local_join(target, "nesteddir"))
|
| 175 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 176 |
+
|
| 177 |
+
local_fs.rm(
|
| 178 |
+
[
|
| 179 |
+
local_join(target, "subfile1"),
|
| 180 |
+
local_join(target, "subfile2"),
|
| 181 |
+
],
|
| 182 |
+
recursive=True,
|
| 183 |
+
)
|
| 184 |
+
else:
|
| 185 |
+
assert local_fs.isdir(local_join(target, "subdir"))
|
| 186 |
+
assert local_fs.isfile(local_join(target, "subdir", "subfile1"))
|
| 187 |
+
assert local_fs.isfile(local_join(target, "subdir", "subfile2"))
|
| 188 |
+
assert not local_fs.exists(local_join(target, "subdir", "nesteddir"))
|
| 189 |
+
|
| 190 |
+
local_fs.rm(local_join(target, "subdir"), recursive=True)
|
| 191 |
+
assert local_fs.ls(target) == []
|
| 192 |
+
|
| 193 |
+
def test_get_directory_to_new_directory(
|
| 194 |
+
self,
|
| 195 |
+
fs,
|
| 196 |
+
fs_join,
|
| 197 |
+
fs_bulk_operations_scenario_0,
|
| 198 |
+
local_fs,
|
| 199 |
+
local_join,
|
| 200 |
+
local_target,
|
| 201 |
+
):
|
| 202 |
+
# Copy scenario 1f
|
| 203 |
+
source = fs_bulk_operations_scenario_0
|
| 204 |
+
|
| 205 |
+
target = local_target
|
| 206 |
+
local_fs.mkdir(target)
|
| 207 |
+
|
| 208 |
+
for source_slash, target_slash in zip([False, True], [False, True]):
|
| 209 |
+
s = fs_join(source, "subdir")
|
| 210 |
+
if source_slash:
|
| 211 |
+
s += "/"
|
| 212 |
+
t = local_join(target, "newdir")
|
| 213 |
+
if target_slash:
|
| 214 |
+
t += "/"
|
| 215 |
+
|
| 216 |
+
# Without recursive does nothing
|
| 217 |
+
fs.get(s, t)
|
| 218 |
+
assert local_fs.ls(target) == []
|
| 219 |
+
|
| 220 |
+
# With recursive
|
| 221 |
+
fs.get(s, t, recursive=True)
|
| 222 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 223 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
| 224 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
| 225 |
+
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
|
| 226 |
+
assert local_fs.isfile(
|
| 227 |
+
local_join(target, "newdir", "nesteddir", "nestedfile")
|
| 228 |
+
)
|
| 229 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 230 |
+
|
| 231 |
+
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
| 232 |
+
assert local_fs.ls(target) == []
|
| 233 |
+
|
| 234 |
+
# Limit recursive by maxdepth
|
| 235 |
+
fs.get(s, t, recursive=True, maxdepth=1)
|
| 236 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 237 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
| 238 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
| 239 |
+
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
|
| 240 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 241 |
+
|
| 242 |
+
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
| 243 |
+
assert not local_fs.exists(local_join(target, "newdir"))
|
| 244 |
+
|
| 245 |
+
def test_get_glob_to_existing_directory(
|
| 246 |
+
self,
|
| 247 |
+
fs,
|
| 248 |
+
fs_join,
|
| 249 |
+
fs_bulk_operations_scenario_0,
|
| 250 |
+
local_fs,
|
| 251 |
+
local_join,
|
| 252 |
+
local_target,
|
| 253 |
+
):
|
| 254 |
+
# Copy scenario 1g
|
| 255 |
+
source = fs_bulk_operations_scenario_0
|
| 256 |
+
|
| 257 |
+
target = local_target
|
| 258 |
+
local_fs.mkdir(target)
|
| 259 |
+
|
| 260 |
+
for target_slash in [False, True]:
|
| 261 |
+
t = target + "/" if target_slash else target
|
| 262 |
+
|
| 263 |
+
# Without recursive
|
| 264 |
+
fs.get(fs_join(source, "subdir", "*"), t)
|
| 265 |
+
assert local_fs.isfile(local_join(target, "subfile1"))
|
| 266 |
+
assert local_fs.isfile(local_join(target, "subfile2"))
|
| 267 |
+
assert not local_fs.isdir(local_join(target, "nesteddir"))
|
| 268 |
+
assert not local_fs.exists(local_join(target, "nesteddir", "nestedfile"))
|
| 269 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 270 |
+
|
| 271 |
+
local_fs.rm(
|
| 272 |
+
[
|
| 273 |
+
local_join(target, "subfile1"),
|
| 274 |
+
local_join(target, "subfile2"),
|
| 275 |
+
],
|
| 276 |
+
recursive=True,
|
| 277 |
+
)
|
| 278 |
+
assert local_fs.ls(target) == []
|
| 279 |
+
|
| 280 |
+
# With recursive
|
| 281 |
+
for glob, recursive in zip(["*", "**"], [True, False]):
|
| 282 |
+
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
|
| 283 |
+
assert local_fs.isfile(local_join(target, "subfile1"))
|
| 284 |
+
assert local_fs.isfile(local_join(target, "subfile2"))
|
| 285 |
+
assert local_fs.isdir(local_join(target, "nesteddir"))
|
| 286 |
+
assert local_fs.isfile(local_join(target, "nesteddir", "nestedfile"))
|
| 287 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 288 |
+
|
| 289 |
+
local_fs.rm(
|
| 290 |
+
[
|
| 291 |
+
local_join(target, "subfile1"),
|
| 292 |
+
local_join(target, "subfile2"),
|
| 293 |
+
local_join(target, "nesteddir"),
|
| 294 |
+
],
|
| 295 |
+
recursive=True,
|
| 296 |
+
)
|
| 297 |
+
assert local_fs.ls(target) == []
|
| 298 |
+
|
| 299 |
+
# Limit recursive by maxdepth
|
| 300 |
+
fs.get(
|
| 301 |
+
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
|
| 302 |
+
)
|
| 303 |
+
assert local_fs.isfile(local_join(target, "subfile1"))
|
| 304 |
+
assert local_fs.isfile(local_join(target, "subfile2"))
|
| 305 |
+
assert not local_fs.exists(local_join(target, "nesteddir"))
|
| 306 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 307 |
+
|
| 308 |
+
local_fs.rm(
|
| 309 |
+
[
|
| 310 |
+
local_join(target, "subfile1"),
|
| 311 |
+
local_join(target, "subfile2"),
|
| 312 |
+
],
|
| 313 |
+
recursive=True,
|
| 314 |
+
)
|
| 315 |
+
assert local_fs.ls(target) == []
|
| 316 |
+
|
| 317 |
+
def test_get_glob_to_new_directory(
|
| 318 |
+
self,
|
| 319 |
+
fs,
|
| 320 |
+
fs_join,
|
| 321 |
+
fs_bulk_operations_scenario_0,
|
| 322 |
+
local_fs,
|
| 323 |
+
local_join,
|
| 324 |
+
local_target,
|
| 325 |
+
):
|
| 326 |
+
# Copy scenario 1h
|
| 327 |
+
source = fs_bulk_operations_scenario_0
|
| 328 |
+
|
| 329 |
+
target = local_target
|
| 330 |
+
local_fs.mkdir(target)
|
| 331 |
+
|
| 332 |
+
for target_slash in [False, True]:
|
| 333 |
+
t = fs_join(target, "newdir")
|
| 334 |
+
if target_slash:
|
| 335 |
+
t += "/"
|
| 336 |
+
|
| 337 |
+
# Without recursive
|
| 338 |
+
fs.get(fs_join(source, "subdir", "*"), t)
|
| 339 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 340 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
| 341 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
| 342 |
+
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
|
| 343 |
+
assert not local_fs.exists(
|
| 344 |
+
local_join(target, "newdir", "nesteddir", "nestedfile")
|
| 345 |
+
)
|
| 346 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 347 |
+
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
|
| 348 |
+
|
| 349 |
+
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
| 350 |
+
assert local_fs.ls(target) == []
|
| 351 |
+
|
| 352 |
+
# With recursive
|
| 353 |
+
for glob, recursive in zip(["*", "**"], [True, False]):
|
| 354 |
+
fs.get(fs_join(source, "subdir", glob), t, recursive=recursive)
|
| 355 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 356 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
| 357 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
| 358 |
+
assert local_fs.isdir(local_join(target, "newdir", "nesteddir"))
|
| 359 |
+
assert local_fs.isfile(
|
| 360 |
+
local_join(target, "newdir", "nesteddir", "nestedfile")
|
| 361 |
+
)
|
| 362 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 363 |
+
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
|
| 364 |
+
|
| 365 |
+
local_fs.rm(local_join(target, "newdir"), recursive=True)
|
| 366 |
+
assert not local_fs.exists(local_join(target, "newdir"))
|
| 367 |
+
|
| 368 |
+
# Limit recursive by maxdepth
|
| 369 |
+
fs.get(
|
| 370 |
+
fs_join(source, "subdir", glob), t, recursive=recursive, maxdepth=1
|
| 371 |
+
)
|
| 372 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 373 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
| 374 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile2"))
|
| 375 |
+
assert not local_fs.exists(local_join(target, "newdir", "nesteddir"))
|
| 376 |
+
assert not local_fs.exists(local_join(target, "subdir"))
|
| 377 |
+
assert not local_fs.exists(local_join(target, "newdir", "subdir"))
|
| 378 |
+
|
| 379 |
+
local_fs.rm(local_fs.ls(target, detail=False), recursive=True)
|
| 380 |
+
assert not local_fs.exists(local_join(target, "newdir"))
|
| 381 |
+
|
| 382 |
+
@pytest.mark.parametrize(
|
| 383 |
+
GLOB_EDGE_CASES_TESTS["argnames"],
|
| 384 |
+
GLOB_EDGE_CASES_TESTS["argvalues"],
|
| 385 |
+
)
|
| 386 |
+
def test_get_glob_edge_cases(
|
| 387 |
+
self,
|
| 388 |
+
path,
|
| 389 |
+
recursive,
|
| 390 |
+
maxdepth,
|
| 391 |
+
expected,
|
| 392 |
+
fs,
|
| 393 |
+
fs_join,
|
| 394 |
+
fs_glob_edge_cases_files,
|
| 395 |
+
local_fs,
|
| 396 |
+
local_join,
|
| 397 |
+
local_target,
|
| 398 |
+
):
|
| 399 |
+
# Copy scenario 1g
|
| 400 |
+
source = fs_glob_edge_cases_files
|
| 401 |
+
|
| 402 |
+
target = local_target
|
| 403 |
+
|
| 404 |
+
for new_dir, target_slash in product([True, False], [True, False]):
|
| 405 |
+
local_fs.mkdir(target)
|
| 406 |
+
|
| 407 |
+
t = local_join(target, "newdir") if new_dir else target
|
| 408 |
+
t = t + "/" if target_slash else t
|
| 409 |
+
|
| 410 |
+
fs.get(fs_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
|
| 411 |
+
|
| 412 |
+
output = local_fs.find(target)
|
| 413 |
+
if new_dir:
|
| 414 |
+
prefixed_expected = [
|
| 415 |
+
make_path_posix(local_join(target, "newdir", p)) for p in expected
|
| 416 |
+
]
|
| 417 |
+
else:
|
| 418 |
+
prefixed_expected = [
|
| 419 |
+
make_path_posix(local_join(target, p)) for p in expected
|
| 420 |
+
]
|
| 421 |
+
assert sorted(output) == sorted(prefixed_expected)
|
| 422 |
+
|
| 423 |
+
try:
|
| 424 |
+
local_fs.rm(target, recursive=True)
|
| 425 |
+
except FileNotFoundError:
|
| 426 |
+
pass
|
| 427 |
+
|
| 428 |
+
def test_get_list_of_files_to_existing_directory(
|
| 429 |
+
self,
|
| 430 |
+
fs,
|
| 431 |
+
fs_join,
|
| 432 |
+
fs_bulk_operations_scenario_0,
|
| 433 |
+
local_fs,
|
| 434 |
+
local_join,
|
| 435 |
+
local_target,
|
| 436 |
+
):
|
| 437 |
+
# Copy scenario 2a
|
| 438 |
+
source = fs_bulk_operations_scenario_0
|
| 439 |
+
|
| 440 |
+
target = local_target
|
| 441 |
+
local_fs.mkdir(target)
|
| 442 |
+
|
| 443 |
+
source_files = [
|
| 444 |
+
fs_join(source, "file1"),
|
| 445 |
+
fs_join(source, "file2"),
|
| 446 |
+
fs_join(source, "subdir", "subfile1"),
|
| 447 |
+
]
|
| 448 |
+
|
| 449 |
+
for target_slash in [False, True]:
|
| 450 |
+
t = target + "/" if target_slash else target
|
| 451 |
+
|
| 452 |
+
fs.get(source_files, t)
|
| 453 |
+
assert local_fs.isfile(local_join(target, "file1"))
|
| 454 |
+
assert local_fs.isfile(local_join(target, "file2"))
|
| 455 |
+
assert local_fs.isfile(local_join(target, "subfile1"))
|
| 456 |
+
|
| 457 |
+
local_fs.rm(
|
| 458 |
+
[
|
| 459 |
+
local_join(target, "file1"),
|
| 460 |
+
local_join(target, "file2"),
|
| 461 |
+
local_join(target, "subfile1"),
|
| 462 |
+
],
|
| 463 |
+
recursive=True,
|
| 464 |
+
)
|
| 465 |
+
assert local_fs.ls(target) == []
|
| 466 |
+
|
| 467 |
+
def test_get_list_of_files_to_new_directory(
|
| 468 |
+
self,
|
| 469 |
+
fs,
|
| 470 |
+
fs_join,
|
| 471 |
+
fs_bulk_operations_scenario_0,
|
| 472 |
+
local_fs,
|
| 473 |
+
local_join,
|
| 474 |
+
local_target,
|
| 475 |
+
):
|
| 476 |
+
# Copy scenario 2b
|
| 477 |
+
source = fs_bulk_operations_scenario_0
|
| 478 |
+
|
| 479 |
+
target = local_target
|
| 480 |
+
local_fs.mkdir(target)
|
| 481 |
+
|
| 482 |
+
source_files = [
|
| 483 |
+
fs_join(source, "file1"),
|
| 484 |
+
fs_join(source, "file2"),
|
| 485 |
+
fs_join(source, "subdir", "subfile1"),
|
| 486 |
+
]
|
| 487 |
+
|
| 488 |
+
fs.get(source_files, local_join(target, "newdir") + "/") # Note trailing slash
|
| 489 |
+
assert local_fs.isdir(local_join(target, "newdir"))
|
| 490 |
+
assert local_fs.isfile(local_join(target, "newdir", "file1"))
|
| 491 |
+
assert local_fs.isfile(local_join(target, "newdir", "file2"))
|
| 492 |
+
assert local_fs.isfile(local_join(target, "newdir", "subfile1"))
|
| 493 |
+
|
| 494 |
+
def test_get_directory_recursive(
|
| 495 |
+
self, fs, fs_join, fs_path, local_fs, local_join, local_target
|
| 496 |
+
):
|
| 497 |
+
# https://github.com/fsspec/filesystem_spec/issues/1062
|
| 498 |
+
# Recursive cp/get/put of source directory into non-existent target directory.
|
| 499 |
+
src = fs_join(fs_path, "src")
|
| 500 |
+
src_file = fs_join(src, "file")
|
| 501 |
+
fs.mkdir(src)
|
| 502 |
+
fs.touch(src_file)
|
| 503 |
+
|
| 504 |
+
target = local_target
|
| 505 |
+
|
| 506 |
+
# get without slash
|
| 507 |
+
assert not local_fs.exists(target)
|
| 508 |
+
for loop in range(2):
|
| 509 |
+
fs.get(src, target, recursive=True)
|
| 510 |
+
assert local_fs.isdir(target)
|
| 511 |
+
|
| 512 |
+
if loop == 0:
|
| 513 |
+
assert local_fs.isfile(local_join(target, "file"))
|
| 514 |
+
assert not local_fs.exists(local_join(target, "src"))
|
| 515 |
+
else:
|
| 516 |
+
assert local_fs.isfile(local_join(target, "file"))
|
| 517 |
+
assert local_fs.isdir(local_join(target, "src"))
|
| 518 |
+
assert local_fs.isfile(local_join(target, "src", "file"))
|
| 519 |
+
|
| 520 |
+
local_fs.rm(target, recursive=True)
|
| 521 |
+
|
| 522 |
+
# get with slash
|
| 523 |
+
assert not local_fs.exists(target)
|
| 524 |
+
for loop in range(2):
|
| 525 |
+
fs.get(src + "/", target, recursive=True)
|
| 526 |
+
assert local_fs.isdir(target)
|
| 527 |
+
assert local_fs.isfile(local_join(target, "file"))
|
| 528 |
+
assert not local_fs.exists(local_join(target, "src"))
|
| 529 |
+
|
| 530 |
+
def test_get_directory_without_files_with_same_name_prefix(
|
| 531 |
+
self,
|
| 532 |
+
fs,
|
| 533 |
+
fs_join,
|
| 534 |
+
local_fs,
|
| 535 |
+
local_join,
|
| 536 |
+
local_target,
|
| 537 |
+
fs_dir_and_file_with_same_name_prefix,
|
| 538 |
+
):
|
| 539 |
+
# Create the test dirs
|
| 540 |
+
source = fs_dir_and_file_with_same_name_prefix
|
| 541 |
+
target = local_target
|
| 542 |
+
|
| 543 |
+
# Test without glob
|
| 544 |
+
fs.get(fs_join(source, "subdir"), target, recursive=True)
|
| 545 |
+
|
| 546 |
+
assert local_fs.isfile(local_join(target, "subfile.txt"))
|
| 547 |
+
assert not local_fs.isfile(local_join(target, "subdir.txt"))
|
| 548 |
+
|
| 549 |
+
local_fs.rm([local_join(target, "subfile.txt")])
|
| 550 |
+
assert local_fs.ls(target) == []
|
| 551 |
+
|
| 552 |
+
# Test with glob
|
| 553 |
+
fs.get(fs_join(source, "subdir*"), target, recursive=True)
|
| 554 |
+
|
| 555 |
+
assert local_fs.isdir(local_join(target, "subdir"))
|
| 556 |
+
assert local_fs.isfile(local_join(target, "subdir", "subfile.txt"))
|
| 557 |
+
assert local_fs.isfile(local_join(target, "subdir.txt"))
|
| 558 |
+
|
| 559 |
+
def test_get_with_source_and_destination_as_list(
|
| 560 |
+
self,
|
| 561 |
+
fs,
|
| 562 |
+
fs_join,
|
| 563 |
+
local_fs,
|
| 564 |
+
local_join,
|
| 565 |
+
local_target,
|
| 566 |
+
fs_10_files_with_hashed_names,
|
| 567 |
+
):
|
| 568 |
+
# Create the test dir
|
| 569 |
+
source = fs_10_files_with_hashed_names
|
| 570 |
+
target = local_target
|
| 571 |
+
|
| 572 |
+
# Create list of files for source and destination
|
| 573 |
+
source_files = []
|
| 574 |
+
destination_files = []
|
| 575 |
+
for i in range(10):
|
| 576 |
+
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
|
| 577 |
+
source_files.append(fs_join(source, f"{hashed_i}.txt"))
|
| 578 |
+
destination_files.append(
|
| 579 |
+
make_path_posix(local_join(target, f"{hashed_i}.txt"))
|
| 580 |
+
)
|
| 581 |
+
|
| 582 |
+
# Copy and assert order was kept
|
| 583 |
+
fs.get(rpath=source_files, lpath=destination_files)
|
| 584 |
+
|
| 585 |
+
for i in range(10):
|
| 586 |
+
file_content = local_fs.cat(destination_files[i]).decode("utf-8")
|
| 587 |
+
assert file_content == str(i)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/tests/abstract/put.py
ADDED
|
@@ -0,0 +1,591 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from hashlib import md5
|
| 2 |
+
from itertools import product
|
| 3 |
+
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from fsspec.tests.abstract.common import GLOB_EDGE_CASES_TESTS
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AbstractPutTests:
|
| 10 |
+
def test_put_file_to_existing_directory(
|
| 11 |
+
self,
|
| 12 |
+
fs,
|
| 13 |
+
fs_join,
|
| 14 |
+
fs_target,
|
| 15 |
+
local_join,
|
| 16 |
+
local_bulk_operations_scenario_0,
|
| 17 |
+
supports_empty_directories,
|
| 18 |
+
):
|
| 19 |
+
# Copy scenario 1a
|
| 20 |
+
source = local_bulk_operations_scenario_0
|
| 21 |
+
|
| 22 |
+
target = fs_target
|
| 23 |
+
fs.mkdir(target)
|
| 24 |
+
if not supports_empty_directories:
|
| 25 |
+
# Force target directory to exist by adding a dummy file
|
| 26 |
+
fs.touch(fs_join(target, "dummy"))
|
| 27 |
+
assert fs.isdir(target)
|
| 28 |
+
|
| 29 |
+
target_file2 = fs_join(target, "file2")
|
| 30 |
+
target_subfile1 = fs_join(target, "subfile1")
|
| 31 |
+
|
| 32 |
+
# Copy from source directory
|
| 33 |
+
fs.put(local_join(source, "file2"), target)
|
| 34 |
+
assert fs.isfile(target_file2)
|
| 35 |
+
|
| 36 |
+
# Copy from sub directory
|
| 37 |
+
fs.put(local_join(source, "subdir", "subfile1"), target)
|
| 38 |
+
assert fs.isfile(target_subfile1)
|
| 39 |
+
|
| 40 |
+
# Remove copied files
|
| 41 |
+
fs.rm([target_file2, target_subfile1])
|
| 42 |
+
assert not fs.exists(target_file2)
|
| 43 |
+
assert not fs.exists(target_subfile1)
|
| 44 |
+
|
| 45 |
+
# Repeat with trailing slash on target
|
| 46 |
+
fs.put(local_join(source, "file2"), target + "/")
|
| 47 |
+
assert fs.isdir(target)
|
| 48 |
+
assert fs.isfile(target_file2)
|
| 49 |
+
|
| 50 |
+
fs.put(local_join(source, "subdir", "subfile1"), target + "/")
|
| 51 |
+
assert fs.isfile(target_subfile1)
|
| 52 |
+
|
| 53 |
+
def test_put_file_to_new_directory(
|
| 54 |
+
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
| 55 |
+
):
|
| 56 |
+
# Copy scenario 1b
|
| 57 |
+
source = local_bulk_operations_scenario_0
|
| 58 |
+
|
| 59 |
+
target = fs_target
|
| 60 |
+
fs.mkdir(target)
|
| 61 |
+
|
| 62 |
+
fs.put(
|
| 63 |
+
local_join(source, "subdir", "subfile1"), fs_join(target, "newdir/")
|
| 64 |
+
) # Note trailing slash
|
| 65 |
+
assert fs.isdir(target)
|
| 66 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 67 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
| 68 |
+
|
| 69 |
+
def test_put_file_to_file_in_existing_directory(
|
| 70 |
+
self,
|
| 71 |
+
fs,
|
| 72 |
+
fs_join,
|
| 73 |
+
fs_target,
|
| 74 |
+
local_join,
|
| 75 |
+
supports_empty_directories,
|
| 76 |
+
local_bulk_operations_scenario_0,
|
| 77 |
+
):
|
| 78 |
+
# Copy scenario 1c
|
| 79 |
+
source = local_bulk_operations_scenario_0
|
| 80 |
+
|
| 81 |
+
target = fs_target
|
| 82 |
+
fs.mkdir(target)
|
| 83 |
+
if not supports_empty_directories:
|
| 84 |
+
# Force target directory to exist by adding a dummy file
|
| 85 |
+
fs.touch(fs_join(target, "dummy"))
|
| 86 |
+
assert fs.isdir(target)
|
| 87 |
+
|
| 88 |
+
fs.put(local_join(source, "subdir", "subfile1"), fs_join(target, "newfile"))
|
| 89 |
+
assert fs.isfile(fs_join(target, "newfile"))
|
| 90 |
+
|
| 91 |
+
def test_put_file_to_file_in_new_directory(
|
| 92 |
+
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
| 93 |
+
):
|
| 94 |
+
# Copy scenario 1d
|
| 95 |
+
source = local_bulk_operations_scenario_0
|
| 96 |
+
|
| 97 |
+
target = fs_target
|
| 98 |
+
fs.mkdir(target)
|
| 99 |
+
|
| 100 |
+
fs.put(
|
| 101 |
+
local_join(source, "subdir", "subfile1"),
|
| 102 |
+
fs_join(target, "newdir", "newfile"),
|
| 103 |
+
)
|
| 104 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 105 |
+
assert fs.isfile(fs_join(target, "newdir", "newfile"))
|
| 106 |
+
|
| 107 |
+
def test_put_directory_to_existing_directory(
|
| 108 |
+
self,
|
| 109 |
+
fs,
|
| 110 |
+
fs_join,
|
| 111 |
+
fs_target,
|
| 112 |
+
local_bulk_operations_scenario_0,
|
| 113 |
+
supports_empty_directories,
|
| 114 |
+
):
|
| 115 |
+
# Copy scenario 1e
|
| 116 |
+
source = local_bulk_operations_scenario_0
|
| 117 |
+
|
| 118 |
+
target = fs_target
|
| 119 |
+
fs.mkdir(target)
|
| 120 |
+
if not supports_empty_directories:
|
| 121 |
+
# Force target directory to exist by adding a dummy file
|
| 122 |
+
dummy = fs_join(target, "dummy")
|
| 123 |
+
fs.touch(dummy)
|
| 124 |
+
assert fs.isdir(target)
|
| 125 |
+
|
| 126 |
+
for source_slash, target_slash in zip([False, True], [False, True]):
|
| 127 |
+
s = fs_join(source, "subdir")
|
| 128 |
+
if source_slash:
|
| 129 |
+
s += "/"
|
| 130 |
+
t = target + "/" if target_slash else target
|
| 131 |
+
|
| 132 |
+
# Without recursive does nothing
|
| 133 |
+
fs.put(s, t)
|
| 134 |
+
assert fs.ls(target, detail=False) == (
|
| 135 |
+
[] if supports_empty_directories else [dummy]
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
# With recursive
|
| 139 |
+
fs.put(s, t, recursive=True)
|
| 140 |
+
if source_slash:
|
| 141 |
+
assert fs.isfile(fs_join(target, "subfile1"))
|
| 142 |
+
assert fs.isfile(fs_join(target, "subfile2"))
|
| 143 |
+
assert fs.isdir(fs_join(target, "nesteddir"))
|
| 144 |
+
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
|
| 145 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 146 |
+
|
| 147 |
+
fs.rm(
|
| 148 |
+
[
|
| 149 |
+
fs_join(target, "subfile1"),
|
| 150 |
+
fs_join(target, "subfile2"),
|
| 151 |
+
fs_join(target, "nesteddir"),
|
| 152 |
+
],
|
| 153 |
+
recursive=True,
|
| 154 |
+
)
|
| 155 |
+
else:
|
| 156 |
+
assert fs.isdir(fs_join(target, "subdir"))
|
| 157 |
+
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
|
| 158 |
+
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
|
| 159 |
+
assert fs.isdir(fs_join(target, "subdir", "nesteddir"))
|
| 160 |
+
assert fs.isfile(fs_join(target, "subdir", "nesteddir", "nestedfile"))
|
| 161 |
+
|
| 162 |
+
fs.rm(fs_join(target, "subdir"), recursive=True)
|
| 163 |
+
assert fs.ls(target, detail=False) == (
|
| 164 |
+
[] if supports_empty_directories else [dummy]
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
# Limit recursive by maxdepth
|
| 168 |
+
fs.put(s, t, recursive=True, maxdepth=1)
|
| 169 |
+
if source_slash:
|
| 170 |
+
assert fs.isfile(fs_join(target, "subfile1"))
|
| 171 |
+
assert fs.isfile(fs_join(target, "subfile2"))
|
| 172 |
+
assert not fs.exists(fs_join(target, "nesteddir"))
|
| 173 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 174 |
+
|
| 175 |
+
fs.rm(
|
| 176 |
+
[
|
| 177 |
+
fs_join(target, "subfile1"),
|
| 178 |
+
fs_join(target, "subfile2"),
|
| 179 |
+
],
|
| 180 |
+
recursive=True,
|
| 181 |
+
)
|
| 182 |
+
else:
|
| 183 |
+
assert fs.isdir(fs_join(target, "subdir"))
|
| 184 |
+
assert fs.isfile(fs_join(target, "subdir", "subfile1"))
|
| 185 |
+
assert fs.isfile(fs_join(target, "subdir", "subfile2"))
|
| 186 |
+
assert not fs.exists(fs_join(target, "subdir", "nesteddir"))
|
| 187 |
+
|
| 188 |
+
fs.rm(fs_join(target, "subdir"), recursive=True)
|
| 189 |
+
assert fs.ls(target, detail=False) == (
|
| 190 |
+
[] if supports_empty_directories else [dummy]
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
def test_put_directory_to_new_directory(
|
| 194 |
+
self,
|
| 195 |
+
fs,
|
| 196 |
+
fs_join,
|
| 197 |
+
fs_target,
|
| 198 |
+
local_bulk_operations_scenario_0,
|
| 199 |
+
supports_empty_directories,
|
| 200 |
+
):
|
| 201 |
+
# Copy scenario 1f
|
| 202 |
+
source = local_bulk_operations_scenario_0
|
| 203 |
+
|
| 204 |
+
target = fs_target
|
| 205 |
+
fs.mkdir(target)
|
| 206 |
+
|
| 207 |
+
for source_slash, target_slash in zip([False, True], [False, True]):
|
| 208 |
+
s = fs_join(source, "subdir")
|
| 209 |
+
if source_slash:
|
| 210 |
+
s += "/"
|
| 211 |
+
t = fs_join(target, "newdir")
|
| 212 |
+
if target_slash:
|
| 213 |
+
t += "/"
|
| 214 |
+
|
| 215 |
+
# Without recursive does nothing
|
| 216 |
+
fs.put(s, t)
|
| 217 |
+
if supports_empty_directories:
|
| 218 |
+
assert fs.ls(target) == []
|
| 219 |
+
else:
|
| 220 |
+
with pytest.raises(FileNotFoundError):
|
| 221 |
+
fs.ls(target)
|
| 222 |
+
|
| 223 |
+
# With recursive
|
| 224 |
+
fs.put(s, t, recursive=True)
|
| 225 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 226 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
| 227 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
| 228 |
+
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
|
| 229 |
+
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
| 230 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 231 |
+
|
| 232 |
+
fs.rm(fs_join(target, "newdir"), recursive=True)
|
| 233 |
+
assert not fs.exists(fs_join(target, "newdir"))
|
| 234 |
+
|
| 235 |
+
# Limit recursive by maxdepth
|
| 236 |
+
fs.put(s, t, recursive=True, maxdepth=1)
|
| 237 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 238 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
| 239 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
| 240 |
+
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
| 241 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 242 |
+
|
| 243 |
+
fs.rm(fs_join(target, "newdir"), recursive=True)
|
| 244 |
+
assert not fs.exists(fs_join(target, "newdir"))
|
| 245 |
+
|
| 246 |
+
def test_put_glob_to_existing_directory(
|
| 247 |
+
self,
|
| 248 |
+
fs,
|
| 249 |
+
fs_join,
|
| 250 |
+
fs_target,
|
| 251 |
+
local_join,
|
| 252 |
+
supports_empty_directories,
|
| 253 |
+
local_bulk_operations_scenario_0,
|
| 254 |
+
):
|
| 255 |
+
# Copy scenario 1g
|
| 256 |
+
source = local_bulk_operations_scenario_0
|
| 257 |
+
|
| 258 |
+
target = fs_target
|
| 259 |
+
fs.mkdir(target)
|
| 260 |
+
if not supports_empty_directories:
|
| 261 |
+
# Force target directory to exist by adding a dummy file
|
| 262 |
+
dummy = fs_join(target, "dummy")
|
| 263 |
+
fs.touch(dummy)
|
| 264 |
+
assert fs.isdir(target)
|
| 265 |
+
|
| 266 |
+
for target_slash in [False, True]:
|
| 267 |
+
t = target + "/" if target_slash else target
|
| 268 |
+
|
| 269 |
+
# Without recursive
|
| 270 |
+
fs.put(local_join(source, "subdir", "*"), t)
|
| 271 |
+
assert fs.isfile(fs_join(target, "subfile1"))
|
| 272 |
+
assert fs.isfile(fs_join(target, "subfile2"))
|
| 273 |
+
assert not fs.isdir(fs_join(target, "nesteddir"))
|
| 274 |
+
assert not fs.exists(fs_join(target, "nesteddir", "nestedfile"))
|
| 275 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 276 |
+
|
| 277 |
+
fs.rm(
|
| 278 |
+
[
|
| 279 |
+
fs_join(target, "subfile1"),
|
| 280 |
+
fs_join(target, "subfile2"),
|
| 281 |
+
],
|
| 282 |
+
recursive=True,
|
| 283 |
+
)
|
| 284 |
+
assert fs.ls(target, detail=False) == (
|
| 285 |
+
[] if supports_empty_directories else [dummy]
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
# With recursive
|
| 289 |
+
for glob, recursive in zip(["*", "**"], [True, False]):
|
| 290 |
+
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
|
| 291 |
+
assert fs.isfile(fs_join(target, "subfile1"))
|
| 292 |
+
assert fs.isfile(fs_join(target, "subfile2"))
|
| 293 |
+
assert fs.isdir(fs_join(target, "nesteddir"))
|
| 294 |
+
assert fs.isfile(fs_join(target, "nesteddir", "nestedfile"))
|
| 295 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 296 |
+
|
| 297 |
+
fs.rm(
|
| 298 |
+
[
|
| 299 |
+
fs_join(target, "subfile1"),
|
| 300 |
+
fs_join(target, "subfile2"),
|
| 301 |
+
fs_join(target, "nesteddir"),
|
| 302 |
+
],
|
| 303 |
+
recursive=True,
|
| 304 |
+
)
|
| 305 |
+
assert fs.ls(target, detail=False) == (
|
| 306 |
+
[] if supports_empty_directories else [dummy]
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
# Limit recursive by maxdepth
|
| 310 |
+
fs.put(
|
| 311 |
+
local_join(source, "subdir", glob),
|
| 312 |
+
t,
|
| 313 |
+
recursive=recursive,
|
| 314 |
+
maxdepth=1,
|
| 315 |
+
)
|
| 316 |
+
assert fs.isfile(fs_join(target, "subfile1"))
|
| 317 |
+
assert fs.isfile(fs_join(target, "subfile2"))
|
| 318 |
+
assert not fs.exists(fs_join(target, "nesteddir"))
|
| 319 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 320 |
+
|
| 321 |
+
fs.rm(
|
| 322 |
+
[
|
| 323 |
+
fs_join(target, "subfile1"),
|
| 324 |
+
fs_join(target, "subfile2"),
|
| 325 |
+
],
|
| 326 |
+
recursive=True,
|
| 327 |
+
)
|
| 328 |
+
assert fs.ls(target, detail=False) == (
|
| 329 |
+
[] if supports_empty_directories else [dummy]
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
def test_put_glob_to_new_directory(
|
| 333 |
+
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
| 334 |
+
):
|
| 335 |
+
# Copy scenario 1h
|
| 336 |
+
source = local_bulk_operations_scenario_0
|
| 337 |
+
|
| 338 |
+
target = fs_target
|
| 339 |
+
fs.mkdir(target)
|
| 340 |
+
|
| 341 |
+
for target_slash in [False, True]:
|
| 342 |
+
t = fs_join(target, "newdir")
|
| 343 |
+
if target_slash:
|
| 344 |
+
t += "/"
|
| 345 |
+
|
| 346 |
+
# Without recursive
|
| 347 |
+
fs.put(local_join(source, "subdir", "*"), t)
|
| 348 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 349 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
| 350 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
| 351 |
+
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
| 352 |
+
assert not fs.exists(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
| 353 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 354 |
+
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
| 355 |
+
|
| 356 |
+
fs.rm(fs_join(target, "newdir"), recursive=True)
|
| 357 |
+
assert not fs.exists(fs_join(target, "newdir"))
|
| 358 |
+
|
| 359 |
+
# With recursive
|
| 360 |
+
for glob, recursive in zip(["*", "**"], [True, False]):
|
| 361 |
+
fs.put(local_join(source, "subdir", glob), t, recursive=recursive)
|
| 362 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 363 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
| 364 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
| 365 |
+
assert fs.isdir(fs_join(target, "newdir", "nesteddir"))
|
| 366 |
+
assert fs.isfile(fs_join(target, "newdir", "nesteddir", "nestedfile"))
|
| 367 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 368 |
+
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
| 369 |
+
|
| 370 |
+
fs.rm(fs_join(target, "newdir"), recursive=True)
|
| 371 |
+
assert not fs.exists(fs_join(target, "newdir"))
|
| 372 |
+
|
| 373 |
+
# Limit recursive by maxdepth
|
| 374 |
+
fs.put(
|
| 375 |
+
local_join(source, "subdir", glob),
|
| 376 |
+
t,
|
| 377 |
+
recursive=recursive,
|
| 378 |
+
maxdepth=1,
|
| 379 |
+
)
|
| 380 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 381 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
| 382 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile2"))
|
| 383 |
+
assert not fs.exists(fs_join(target, "newdir", "nesteddir"))
|
| 384 |
+
assert not fs.exists(fs_join(target, "subdir"))
|
| 385 |
+
assert not fs.exists(fs_join(target, "newdir", "subdir"))
|
| 386 |
+
|
| 387 |
+
fs.rm(fs_join(target, "newdir"), recursive=True)
|
| 388 |
+
assert not fs.exists(fs_join(target, "newdir"))
|
| 389 |
+
|
| 390 |
+
@pytest.mark.parametrize(
|
| 391 |
+
GLOB_EDGE_CASES_TESTS["argnames"],
|
| 392 |
+
GLOB_EDGE_CASES_TESTS["argvalues"],
|
| 393 |
+
)
|
| 394 |
+
def test_put_glob_edge_cases(
|
| 395 |
+
self,
|
| 396 |
+
path,
|
| 397 |
+
recursive,
|
| 398 |
+
maxdepth,
|
| 399 |
+
expected,
|
| 400 |
+
fs,
|
| 401 |
+
fs_join,
|
| 402 |
+
fs_target,
|
| 403 |
+
local_glob_edge_cases_files,
|
| 404 |
+
local_join,
|
| 405 |
+
fs_sanitize_path,
|
| 406 |
+
):
|
| 407 |
+
# Copy scenario 1g
|
| 408 |
+
source = local_glob_edge_cases_files
|
| 409 |
+
|
| 410 |
+
target = fs_target
|
| 411 |
+
|
| 412 |
+
for new_dir, target_slash in product([True, False], [True, False]):
|
| 413 |
+
fs.mkdir(target)
|
| 414 |
+
|
| 415 |
+
t = fs_join(target, "newdir") if new_dir else target
|
| 416 |
+
t = t + "/" if target_slash else t
|
| 417 |
+
|
| 418 |
+
fs.put(local_join(source, path), t, recursive=recursive, maxdepth=maxdepth)
|
| 419 |
+
|
| 420 |
+
output = fs.find(target)
|
| 421 |
+
if new_dir:
|
| 422 |
+
prefixed_expected = [
|
| 423 |
+
fs_sanitize_path(fs_join(target, "newdir", p)) for p in expected
|
| 424 |
+
]
|
| 425 |
+
else:
|
| 426 |
+
prefixed_expected = [
|
| 427 |
+
fs_sanitize_path(fs_join(target, p)) for p in expected
|
| 428 |
+
]
|
| 429 |
+
assert sorted(output) == sorted(prefixed_expected)
|
| 430 |
+
|
| 431 |
+
try:
|
| 432 |
+
fs.rm(target, recursive=True)
|
| 433 |
+
except FileNotFoundError:
|
| 434 |
+
pass
|
| 435 |
+
|
| 436 |
+
def test_put_list_of_files_to_existing_directory(
|
| 437 |
+
self,
|
| 438 |
+
fs,
|
| 439 |
+
fs_join,
|
| 440 |
+
fs_target,
|
| 441 |
+
local_join,
|
| 442 |
+
local_bulk_operations_scenario_0,
|
| 443 |
+
supports_empty_directories,
|
| 444 |
+
):
|
| 445 |
+
# Copy scenario 2a
|
| 446 |
+
source = local_bulk_operations_scenario_0
|
| 447 |
+
|
| 448 |
+
target = fs_target
|
| 449 |
+
fs.mkdir(target)
|
| 450 |
+
if not supports_empty_directories:
|
| 451 |
+
# Force target directory to exist by adding a dummy file
|
| 452 |
+
dummy = fs_join(target, "dummy")
|
| 453 |
+
fs.touch(dummy)
|
| 454 |
+
assert fs.isdir(target)
|
| 455 |
+
|
| 456 |
+
source_files = [
|
| 457 |
+
local_join(source, "file1"),
|
| 458 |
+
local_join(source, "file2"),
|
| 459 |
+
local_join(source, "subdir", "subfile1"),
|
| 460 |
+
]
|
| 461 |
+
|
| 462 |
+
for target_slash in [False, True]:
|
| 463 |
+
t = target + "/" if target_slash else target
|
| 464 |
+
|
| 465 |
+
fs.put(source_files, t)
|
| 466 |
+
assert fs.isfile(fs_join(target, "file1"))
|
| 467 |
+
assert fs.isfile(fs_join(target, "file2"))
|
| 468 |
+
assert fs.isfile(fs_join(target, "subfile1"))
|
| 469 |
+
|
| 470 |
+
fs.rm(
|
| 471 |
+
[
|
| 472 |
+
fs_join(target, "file1"),
|
| 473 |
+
fs_join(target, "file2"),
|
| 474 |
+
fs_join(target, "subfile1"),
|
| 475 |
+
],
|
| 476 |
+
recursive=True,
|
| 477 |
+
)
|
| 478 |
+
assert fs.ls(target, detail=False) == (
|
| 479 |
+
[] if supports_empty_directories else [dummy]
|
| 480 |
+
)
|
| 481 |
+
|
| 482 |
+
def test_put_list_of_files_to_new_directory(
|
| 483 |
+
self, fs, fs_join, fs_target, local_join, local_bulk_operations_scenario_0
|
| 484 |
+
):
|
| 485 |
+
# Copy scenario 2b
|
| 486 |
+
source = local_bulk_operations_scenario_0
|
| 487 |
+
|
| 488 |
+
target = fs_target
|
| 489 |
+
fs.mkdir(target)
|
| 490 |
+
|
| 491 |
+
source_files = [
|
| 492 |
+
local_join(source, "file1"),
|
| 493 |
+
local_join(source, "file2"),
|
| 494 |
+
local_join(source, "subdir", "subfile1"),
|
| 495 |
+
]
|
| 496 |
+
|
| 497 |
+
fs.put(source_files, fs_join(target, "newdir") + "/") # Note trailing slash
|
| 498 |
+
assert fs.isdir(fs_join(target, "newdir"))
|
| 499 |
+
assert fs.isfile(fs_join(target, "newdir", "file1"))
|
| 500 |
+
assert fs.isfile(fs_join(target, "newdir", "file2"))
|
| 501 |
+
assert fs.isfile(fs_join(target, "newdir", "subfile1"))
|
| 502 |
+
|
| 503 |
+
def test_put_directory_recursive(
|
| 504 |
+
self, fs, fs_join, fs_target, local_fs, local_join, local_path
|
| 505 |
+
):
|
| 506 |
+
# https://github.com/fsspec/filesystem_spec/issues/1062
|
| 507 |
+
# Recursive cp/get/put of source directory into non-existent target directory.
|
| 508 |
+
src = local_join(local_path, "src")
|
| 509 |
+
src_file = local_join(src, "file")
|
| 510 |
+
local_fs.mkdir(src)
|
| 511 |
+
local_fs.touch(src_file)
|
| 512 |
+
|
| 513 |
+
target = fs_target
|
| 514 |
+
|
| 515 |
+
# put without slash
|
| 516 |
+
assert not fs.exists(target)
|
| 517 |
+
for loop in range(2):
|
| 518 |
+
fs.put(src, target, recursive=True)
|
| 519 |
+
assert fs.isdir(target)
|
| 520 |
+
|
| 521 |
+
if loop == 0:
|
| 522 |
+
assert fs.isfile(fs_join(target, "file"))
|
| 523 |
+
assert not fs.exists(fs_join(target, "src"))
|
| 524 |
+
else:
|
| 525 |
+
assert fs.isfile(fs_join(target, "file"))
|
| 526 |
+
assert fs.isdir(fs_join(target, "src"))
|
| 527 |
+
assert fs.isfile(fs_join(target, "src", "file"))
|
| 528 |
+
|
| 529 |
+
fs.rm(target, recursive=True)
|
| 530 |
+
|
| 531 |
+
# put with slash
|
| 532 |
+
assert not fs.exists(target)
|
| 533 |
+
for loop in range(2):
|
| 534 |
+
fs.put(src + "/", target, recursive=True)
|
| 535 |
+
assert fs.isdir(target)
|
| 536 |
+
assert fs.isfile(fs_join(target, "file"))
|
| 537 |
+
assert not fs.exists(fs_join(target, "src"))
|
| 538 |
+
|
| 539 |
+
def test_put_directory_without_files_with_same_name_prefix(
|
| 540 |
+
self,
|
| 541 |
+
fs,
|
| 542 |
+
fs_join,
|
| 543 |
+
fs_target,
|
| 544 |
+
local_join,
|
| 545 |
+
local_dir_and_file_with_same_name_prefix,
|
| 546 |
+
supports_empty_directories,
|
| 547 |
+
):
|
| 548 |
+
# Create the test dirs
|
| 549 |
+
source = local_dir_and_file_with_same_name_prefix
|
| 550 |
+
target = fs_target
|
| 551 |
+
|
| 552 |
+
# Test without glob
|
| 553 |
+
fs.put(local_join(source, "subdir"), fs_target, recursive=True)
|
| 554 |
+
|
| 555 |
+
assert fs.isfile(fs_join(fs_target, "subfile.txt"))
|
| 556 |
+
assert not fs.isfile(fs_join(fs_target, "subdir.txt"))
|
| 557 |
+
|
| 558 |
+
fs.rm([fs_join(target, "subfile.txt")])
|
| 559 |
+
if supports_empty_directories:
|
| 560 |
+
assert fs.ls(target) == []
|
| 561 |
+
else:
|
| 562 |
+
assert not fs.exists(target)
|
| 563 |
+
|
| 564 |
+
# Test with glob
|
| 565 |
+
fs.put(local_join(source, "subdir*"), fs_target, recursive=True)
|
| 566 |
+
|
| 567 |
+
assert fs.isdir(fs_join(fs_target, "subdir"))
|
| 568 |
+
assert fs.isfile(fs_join(fs_target, "subdir", "subfile.txt"))
|
| 569 |
+
assert fs.isfile(fs_join(fs_target, "subdir.txt"))
|
| 570 |
+
|
| 571 |
+
def test_copy_with_source_and_destination_as_list(
|
| 572 |
+
self, fs, fs_target, fs_join, local_join, local_10_files_with_hashed_names
|
| 573 |
+
):
|
| 574 |
+
# Create the test dir
|
| 575 |
+
source = local_10_files_with_hashed_names
|
| 576 |
+
target = fs_target
|
| 577 |
+
|
| 578 |
+
# Create list of files for source and destination
|
| 579 |
+
source_files = []
|
| 580 |
+
destination_files = []
|
| 581 |
+
for i in range(10):
|
| 582 |
+
hashed_i = md5(str(i).encode("utf-8")).hexdigest()
|
| 583 |
+
source_files.append(local_join(source, f"{hashed_i}.txt"))
|
| 584 |
+
destination_files.append(fs_join(target, f"{hashed_i}.txt"))
|
| 585 |
+
|
| 586 |
+
# Copy and assert order was kept
|
| 587 |
+
fs.put(lpath=source_files, rpath=destination_files)
|
| 588 |
+
|
| 589 |
+
for i in range(10):
|
| 590 |
+
file_content = fs.cat(destination_files[i]).decode("utf-8")
|
| 591 |
+
assert file_content == str(i)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
from torch._functorch.deprecated import (
|
| 9 |
+
combine_state_for_ensemble,
|
| 10 |
+
functionalize,
|
| 11 |
+
grad,
|
| 12 |
+
grad_and_value,
|
| 13 |
+
hessian,
|
| 14 |
+
jacfwd,
|
| 15 |
+
jacrev,
|
| 16 |
+
jvp,
|
| 17 |
+
make_functional,
|
| 18 |
+
make_functional_with_buffers,
|
| 19 |
+
vjp,
|
| 20 |
+
vmap,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# utilities. Maybe these should go in their own namespace in the future?
|
| 24 |
+
from torch._functorch.make_functional import (
|
| 25 |
+
FunctionalModule,
|
| 26 |
+
FunctionalModuleWithBuffers,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
# Top-level APIs. Please think carefully before adding something to the
|
| 30 |
+
# top-level namespace:
|
| 31 |
+
# - private helper functions should go into torch._functorch
|
| 32 |
+
# - very experimental things should go into functorch.experimental
|
| 33 |
+
# - compilation related things should go into functorch.compile
|
| 34 |
+
|
| 35 |
+
# Was never documented
|
| 36 |
+
from torch._functorch.python_key import make_fx
|
| 37 |
+
|
| 38 |
+
__version__ = torch.__version__
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/control_flow.cpython-311.pyc
ADDED
|
Binary file (534 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/__pycache__/ops.cpython-311.pyc
ADDED
|
Binary file (286 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/experimental/control_flow.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch import cond # noqa: F401
|
| 2 |
+
from torch._higher_order_ops.cond import UnsupportedAliasMutationException # noqa: F401
|
| 3 |
+
|
| 4 |
+
from torch._higher_order_ops.map import ( # noqa: F401
|
| 5 |
+
_stack_pytree,
|
| 6 |
+
_unstack_pytree,
|
| 7 |
+
map,
|
| 8 |
+
)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAPROFILERTYPEDEFS_H
|
| 51 |
+
#define CUDAPROFILERTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
#include <cudaProfiler.h>
|
| 54 |
+
|
| 55 |
+
#ifdef __cplusplus
|
| 56 |
+
extern "C" {
|
| 57 |
+
#endif // __cplusplus
|
| 58 |
+
|
| 59 |
+
/*
|
| 60 |
+
* Macros for the latest version for each driver function in cudaProfiler.h
|
| 61 |
+
*/
|
| 62 |
+
#define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000
|
| 63 |
+
#define PFN_cuProfilerStart PFN_cuProfilerStart_v4000
|
| 64 |
+
#define PFN_cuProfilerStop PFN_cuProfilerStop_v4000
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
/**
|
| 68 |
+
* Type definitions for functions defined in cudaProfiler.h
|
| 69 |
+
*/
|
| 70 |
+
typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
|
| 71 |
+
typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
|
| 72 |
+
typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
|
| 73 |
+
|
| 74 |
+
#ifdef __cplusplus
|
| 75 |
+
}
|
| 76 |
+
#endif // __cplusplus
|
| 77 |
+
|
| 78 |
+
#endif // file guard
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAVDPAU_H
|
| 51 |
+
#define CUDAVDPAU_H
|
| 52 |
+
|
| 53 |
+
#ifdef CUDA_FORCE_API_VERSION
|
| 54 |
+
#error "CUDA_FORCE_API_VERSION is no longer supported."
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
#define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2
|
| 58 |
+
|
| 59 |
+
#ifdef __cplusplus
|
| 60 |
+
extern "C" {
|
| 61 |
+
#endif
|
| 62 |
+
|
| 63 |
+
/**
|
| 64 |
+
* \defgroup CUDA_VDPAU VDPAU Interoperability
|
| 65 |
+
* \ingroup CUDA_DRIVER
|
| 66 |
+
*
|
| 67 |
+
* ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver
|
| 68 |
+
* API (___CURRENT_FILE___) ___ENDMANBRIEF___
|
| 69 |
+
*
|
| 70 |
+
* This section describes the VDPAU interoperability functions of the
|
| 71 |
+
* low-level CUDA driver application programming interface.
|
| 72 |
+
*
|
| 73 |
+
* @{
|
| 74 |
+
*/
|
| 75 |
+
|
| 76 |
+
/**
|
| 77 |
+
* \brief Gets the CUDA device associated with a VDPAU device
|
| 78 |
+
*
|
| 79 |
+
* Returns in \p *pDevice the CUDA device associated with a \p vdpDevice, if
|
| 80 |
+
* applicable.
|
| 81 |
+
*
|
| 82 |
+
* \param pDevice - Device associated with vdpDevice
|
| 83 |
+
* \param vdpDevice - A VdpDevice handle
|
| 84 |
+
* \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
|
| 85 |
+
*
|
| 86 |
+
* \return
|
| 87 |
+
* ::CUDA_SUCCESS,
|
| 88 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 89 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 90 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 91 |
+
* ::CUDA_ERROR_INVALID_VALUE
|
| 92 |
+
* \notefnerr
|
| 93 |
+
*
|
| 94 |
+
* \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
|
| 95 |
+
* ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
|
| 96 |
+
* ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
|
| 97 |
+
* ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
|
| 98 |
+
* ::cudaVDPAUGetDevice
|
| 99 |
+
*/
|
| 100 |
+
CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 101 |
+
|
| 102 |
+
/**
|
| 103 |
+
* \brief Create a CUDA context for interoperability with VDPAU
|
| 104 |
+
*
|
| 105 |
+
* Creates a new CUDA context, initializes VDPAU interoperability, and
|
| 106 |
+
* associates the CUDA context with the calling thread. It must be called
|
| 107 |
+
* before performing any other VDPAU interoperability operations. It may fail
|
| 108 |
+
* if the needed VDPAU driver facilities are not available. For usage of the
|
| 109 |
+
* \p flags parameter, see ::cuCtxCreate().
|
| 110 |
+
*
|
| 111 |
+
* \param pCtx - Returned CUDA context
|
| 112 |
+
* \param flags - Options for CUDA context creation
|
| 113 |
+
* \param device - Device on which to create the context
|
| 114 |
+
* \param vdpDevice - The VdpDevice to interop with
|
| 115 |
+
* \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
|
| 116 |
+
*
|
| 117 |
+
* \return
|
| 118 |
+
* ::CUDA_SUCCESS,
|
| 119 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 120 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 121 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 122 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 123 |
+
* ::CUDA_ERROR_OUT_OF_MEMORY
|
| 124 |
+
* \notefnerr
|
| 125 |
+
*
|
| 126 |
+
* \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
|
| 127 |
+
* ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
|
| 128 |
+
* ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
|
| 129 |
+
* ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
|
| 130 |
+
* ::cuVDPAUGetDevice
|
| 131 |
+
*/
|
| 132 |
+
CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 133 |
+
|
| 134 |
+
/**
|
| 135 |
+
* \brief Registers a VDPAU VdpVideoSurface object
|
| 136 |
+
*
|
| 137 |
+
* Registers the VdpVideoSurface specified by \p vdpSurface for access by
|
| 138 |
+
* CUDA. A handle to the registered object is returned as \p pCudaResource.
|
| 139 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 140 |
+
*
|
| 141 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
|
| 142 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 143 |
+
* read from and written to by CUDA. This is the default value.
|
| 144 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
|
| 145 |
+
* will not write to this resource.
|
| 146 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
|
| 147 |
+
* CUDA will not read from this resource and will write over the
|
| 148 |
+
* entire contents of the resource, so none of the data previously
|
| 149 |
+
* stored in the resource will be preserved.
|
| 150 |
+
*
|
| 151 |
+
* The VdpVideoSurface is presented as an array of subresources that may be
|
| 152 |
+
* accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
|
| 153 |
+
* The exact number of valid \p arrayIndex values depends on the VDPAU surface
|
| 154 |
+
* format. The mapping is shown in the table below. \p mipLevel must be 0.
|
| 155 |
+
*
|
| 156 |
+
* \htmlonly
|
| 157 |
+
* <table>
|
| 158 |
+
* <tr><th>VdpChromaType </th><th>arrayIndex</th><th>Size </th><th>Format</th><th>Content </th></tr>
|
| 159 |
+
* <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
|
| 160 |
+
* <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
|
| 161 |
+
* <tr> <td>2 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Top-field chroma </td></tr>
|
| 162 |
+
* <tr> <td>3 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
|
| 163 |
+
* <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
|
| 164 |
+
* <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
|
| 165 |
+
* <tr> <td>2 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Top-field chroma </td></tr>
|
| 166 |
+
* <tr> <td>3 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
|
| 167 |
+
* </table>
|
| 168 |
+
* \endhtmlonly
|
| 169 |
+
*
|
| 170 |
+
* \latexonly
|
| 171 |
+
* \begin{tabular}{|l|l|l|l|l|}
|
| 172 |
+
* \hline
|
| 173 |
+
* VdpChromaType & arrayIndex & Size & Format & Content \\
|
| 174 |
+
* \hline
|
| 175 |
+
* VDP\_CHROMA\_TYPE\_420 & 0 & w x h/2 & R8 & Top-field luma \\
|
| 176 |
+
* & 1 & w x h/2 & R8 & Bottom-field luma \\
|
| 177 |
+
* & 2 & w/2 x h/4 & R8G8 & Top-field chroma \\
|
| 178 |
+
* & 3 & w/2 x h/4 & R8G8 & Bottom-field chroma \\
|
| 179 |
+
* \hline
|
| 180 |
+
* VDP\_CHROMA\_TYPE\_422 & 0 & w x h/2 & R8 & Top-field luma \\
|
| 181 |
+
* & 1 & w x h/2 & R8 & Bottom-field luma \\
|
| 182 |
+
* & 2 & w/2 x h/2 & R8G8 & Top-field chroma \\
|
| 183 |
+
* & 3 & w/2 x h/2 & R8G8 & Bottom-field chroma \\
|
| 184 |
+
* \hline
|
| 185 |
+
* \end{tabular}
|
| 186 |
+
* \endlatexonly
|
| 187 |
+
*
|
| 188 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 189 |
+
* \param vdpSurface - The VdpVideoSurface to be registered
|
| 190 |
+
* \param flags - Map flags
|
| 191 |
+
*
|
| 192 |
+
* \return
|
| 193 |
+
* ::CUDA_SUCCESS,
|
| 194 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 195 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 196 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 197 |
+
* \notefnerr
|
| 198 |
+
*
|
| 199 |
+
* \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
|
| 200 |
+
* ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
|
| 201 |
+
* ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
|
| 202 |
+
* ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
|
| 203 |
+
* ::cuVDPAUGetDevice,
|
| 204 |
+
* ::cudaGraphicsVDPAURegisterVideoSurface
|
| 205 |
+
*/
|
| 206 |
+
CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
|
| 207 |
+
|
| 208 |
+
/**
|
| 209 |
+
* \brief Registers a VDPAU VdpOutputSurface object
|
| 210 |
+
*
|
| 211 |
+
* Registers the VdpOutputSurface specified by \p vdpSurface for access by
|
| 212 |
+
* CUDA. A handle to the registered object is returned as \p pCudaResource.
|
| 213 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 214 |
+
*
|
| 215 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
|
| 216 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 217 |
+
* read from and written to by CUDA. This is the default value.
|
| 218 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
|
| 219 |
+
* will not write to this resource.
|
| 220 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
|
| 221 |
+
* CUDA will not read from this resource and will write over the
|
| 222 |
+
* entire contents of the resource, so none of the data previously
|
| 223 |
+
* stored in the resource will be preserved.
|
| 224 |
+
*
|
| 225 |
+
* The VdpOutputSurface is presented as an array of subresources that may be
|
| 226 |
+
* accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
|
| 227 |
+
* The exact number of valid \p arrayIndex values depends on the VDPAU surface
|
| 228 |
+
* format. The mapping is shown in the table below. \p mipLevel must be 0.
|
| 229 |
+
*
|
| 230 |
+
* \htmlonly
|
| 231 |
+
* <table>
|
| 232 |
+
* <tr><th>VdpRGBAFormat </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content </th></tr>
|
| 233 |
+
* <tr><td>VDP_RGBA_FORMAT_B8G8R8A8 </td><td>0 </td><td>w x h</td><td>ARGB8 </td><td>Entire surface</td></tr>
|
| 234 |
+
* <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0 </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
|
| 235 |
+
* </table>
|
| 236 |
+
* \endhtmlonly
|
| 237 |
+
*
|
| 238 |
+
* \latexonly
|
| 239 |
+
* \begin{tabular}{|l|l|l|l|l|}
|
| 240 |
+
* \hline
|
| 241 |
+
* VdpRGBAFormat & arrayIndex & Size & Format & Content \\
|
| 242 |
+
* \hline
|
| 243 |
+
* VDP\_RGBA\_FORMAT\_B8G8R8A8 & 0 & w x h & ARGB8 & Entire surface \\
|
| 244 |
+
* VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0 & w x h & A2BGR10 & Entire surface \\
|
| 245 |
+
* \hline
|
| 246 |
+
* \end{tabular}
|
| 247 |
+
* \endlatexonly
|
| 248 |
+
*
|
| 249 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 250 |
+
* \param vdpSurface - The VdpOutputSurface to be registered
|
| 251 |
+
* \param flags - Map flags
|
| 252 |
+
*
|
| 253 |
+
* \return
|
| 254 |
+
* ::CUDA_SUCCESS,
|
| 255 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 256 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 257 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 258 |
+
* \notefnerr
|
| 259 |
+
*
|
| 260 |
+
* \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
|
| 261 |
+
* ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
|
| 262 |
+
* ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
|
| 263 |
+
* ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
|
| 264 |
+
* ::cuVDPAUGetDevice,
|
| 265 |
+
* ::cudaGraphicsVDPAURegisterOutputSurface
|
| 266 |
+
*/
|
| 267 |
+
CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
|
| 268 |
+
|
| 269 |
+
/** @} */ /* END CUDA_VDPAU */
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 273 |
+
#undef cuVDPAUCtxCreate
|
| 274 |
+
|
| 275 |
+
CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 276 |
+
#endif /* __CUDA_API_VERSION_INTERNAL */
|
| 277 |
+
|
| 278 |
+
#ifdef __cplusplus
|
| 279 |
+
};
|
| 280 |
+
#endif
|
| 281 |
+
|
| 282 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.h
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_20_ATOMIC_FUNCTIONS_H__)
|
| 51 |
+
#define __SM_20_ATOMIC_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
|
| 55 |
+
#else /* __CUDACC_RTC__ */
|
| 56 |
+
#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
/*******************************************************************************
|
| 62 |
+
* *
|
| 63 |
+
* *
|
| 64 |
+
* *
|
| 65 |
+
*******************************************************************************/
|
| 66 |
+
|
| 67 |
+
#include "cuda_runtime_api.h"
|
| 68 |
+
|
| 69 |
+
#ifndef __CUDA_ARCH__
|
| 70 |
+
#define __DEF_IF_HOST { }
|
| 71 |
+
#else /* !__CUDA_ARCH__ */
|
| 72 |
+
#define __DEF_IF_HOST ;
|
| 73 |
+
#endif /* __CUDA_ARCH__ */
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
#ifdef __CUDA_ARCH__
|
| 77 |
+
extern "C"
|
| 78 |
+
{
|
| 79 |
+
extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
|
| 80 |
+
}
|
| 81 |
+
#endif /* __CUDA_ARCH__ */
|
| 82 |
+
|
| 83 |
+
/*******************************************************************************
|
| 84 |
+
* *
|
| 85 |
+
* *
|
| 86 |
+
* *
|
| 87 |
+
*******************************************************************************/
|
| 88 |
+
|
| 89 |
+
__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST
|
| 90 |
+
|
| 91 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 92 |
+
|
| 93 |
+
#undef __DEF_IF_HOST
|
| 94 |
+
#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
|
| 95 |
+
|
| 96 |
+
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
|
| 97 |
+
#include "sm_20_atomic_functions.hpp"
|
| 98 |
+
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
|
| 99 |
+
|
| 100 |
+
#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
|
| 51 |
+
#define __SM_20_ATOMIC_FUNCTIONS_HPP__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
|
| 55 |
+
#else /* __CUDACC_RTC__ */
|
| 56 |
+
#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
/*******************************************************************************
|
| 62 |
+
* *
|
| 63 |
+
* *
|
| 64 |
+
* *
|
| 65 |
+
*******************************************************************************/
|
| 66 |
+
|
| 67 |
+
#include "cuda_runtime_api.h"
|
| 68 |
+
|
| 69 |
+
/*******************************************************************************
|
| 70 |
+
* *
|
| 71 |
+
* *
|
| 72 |
+
* *
|
| 73 |
+
*******************************************************************************/
|
| 74 |
+
|
| 75 |
+
__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
|
| 76 |
+
{
|
| 77 |
+
return __fAtomicAdd(address, val);
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 81 |
+
|
| 82 |
+
#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
|
| 83 |
+
|
| 84 |
+
#endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
|
| 85 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
|
| 3 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 4 |
+
|
| 5 |
+
*
|
| 6 |
+
|
| 7 |
+
* NOTICE TO LICENSEE:
|
| 8 |
+
|
| 9 |
+
*
|
| 10 |
+
|
| 11 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 12 |
+
|
| 13 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 14 |
+
|
| 15 |
+
* international Copyright laws.
|
| 16 |
+
|
| 17 |
+
*
|
| 18 |
+
|
| 19 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 20 |
+
|
| 21 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 22 |
+
|
| 23 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 24 |
+
|
| 25 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 26 |
+
|
| 27 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 28 |
+
|
| 29 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 30 |
+
|
| 31 |
+
* of the Licensed Deliverables to any third party without the express
|
| 32 |
+
|
| 33 |
+
* written consent of NVIDIA is prohibited.
|
| 34 |
+
|
| 35 |
+
*
|
| 36 |
+
|
| 37 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 38 |
+
|
| 39 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 40 |
+
|
| 41 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 42 |
+
|
| 43 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 44 |
+
|
| 45 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 46 |
+
|
| 47 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 48 |
+
|
| 49 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 50 |
+
|
| 51 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 52 |
+
|
| 53 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 54 |
+
|
| 55 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 56 |
+
|
| 57 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 58 |
+
|
| 59 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 60 |
+
|
| 61 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 62 |
+
|
| 63 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 64 |
+
|
| 65 |
+
*
|
| 66 |
+
|
| 67 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 68 |
+
|
| 69 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 70 |
+
|
| 71 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 72 |
+
|
| 73 |
+
* computer software documentation" as such terms are used in 48
|
| 74 |
+
|
| 75 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 76 |
+
|
| 77 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 78 |
+
|
| 79 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 80 |
+
|
| 81 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 82 |
+
|
| 83 |
+
* only those rights set forth herein.
|
| 84 |
+
|
| 85 |
+
*
|
| 86 |
+
|
| 87 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 88 |
+
|
| 89 |
+
* software must include, in the user documentation and internal
|
| 90 |
+
|
| 91 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 92 |
+
|
| 93 |
+
* Users Notice.
|
| 94 |
+
|
| 95 |
+
*/
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
#if !defined(__SM_35_INTRINSICS_H__)
|
| 100 |
+
|
| 101 |
+
#define __SM_35_INTRINSICS_H__
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
/**********************************************************************************
|
| 106 |
+
|
| 107 |
+
* All sm_35 intrinsics are supported by sm_32 so simply include its header file *
|
| 108 |
+
|
| 109 |
+
**********************************************************************************/
|
| 110 |
+
|
| 111 |
+
#include "sm_32_intrinsics.h"
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
#endif /* !__SM_35_INTRINSICS_H__ */
|
| 116 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__VECTOR_FUNCTIONS_HPP__)
|
| 51 |
+
#define __VECTOR_FUNCTIONS_HPP__
|
| 52 |
+
|
| 53 |
+
/*******************************************************************************
|
| 54 |
+
* *
|
| 55 |
+
* *
|
| 56 |
+
* *
|
| 57 |
+
*******************************************************************************/
|
| 58 |
+
|
| 59 |
+
#include "cuda_runtime_api.h"
|
| 60 |
+
|
| 61 |
+
#if defined(__CUDACC_RTC__)
|
| 62 |
+
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
|
| 63 |
+
#else /* !__CUDACC_RTC__ */
|
| 64 |
+
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
|
| 65 |
+
#endif /* __CUDACC_RTC__ */
|
| 66 |
+
|
| 67 |
+
/*******************************************************************************
|
| 68 |
+
* *
|
| 69 |
+
* *
|
| 70 |
+
* *
|
| 71 |
+
*******************************************************************************/
|
| 72 |
+
|
| 73 |
+
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
|
| 74 |
+
{
|
| 75 |
+
char1 t; t.x = x; return t;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
|
| 79 |
+
{
|
| 80 |
+
uchar1 t; t.x = x; return t;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
|
| 84 |
+
{
|
| 85 |
+
char2 t; t.x = x; t.y = y; return t;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
|
| 89 |
+
{
|
| 90 |
+
uchar2 t; t.x = x; t.y = y; return t;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
|
| 94 |
+
{
|
| 95 |
+
char3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
|
| 99 |
+
{
|
| 100 |
+
uchar3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
|
| 104 |
+
{
|
| 105 |
+
char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
|
| 109 |
+
{
|
| 110 |
+
uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
|
| 114 |
+
{
|
| 115 |
+
short1 t; t.x = x; return t;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
|
| 119 |
+
{
|
| 120 |
+
ushort1 t; t.x = x; return t;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
|
| 124 |
+
{
|
| 125 |
+
short2 t; t.x = x; t.y = y; return t;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
|
| 129 |
+
{
|
| 130 |
+
ushort2 t; t.x = x; t.y = y; return t;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
|
| 134 |
+
{
|
| 135 |
+
short3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
|
| 139 |
+
{
|
| 140 |
+
ushort3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
|
| 144 |
+
{
|
| 145 |
+
short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
|
| 149 |
+
{
|
| 150 |
+
ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
|
| 154 |
+
{
|
| 155 |
+
int1 t; t.x = x; return t;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
|
| 159 |
+
{
|
| 160 |
+
uint1 t; t.x = x; return t;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
|
| 164 |
+
{
|
| 165 |
+
int2 t; t.x = x; t.y = y; return t;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
|
| 169 |
+
{
|
| 170 |
+
uint2 t; t.x = x; t.y = y; return t;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
|
| 174 |
+
{
|
| 175 |
+
int3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
|
| 179 |
+
{
|
| 180 |
+
uint3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
|
| 184 |
+
{
|
| 185 |
+
int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
|
| 189 |
+
{
|
| 190 |
+
uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
|
| 194 |
+
{
|
| 195 |
+
long1 t; t.x = x; return t;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
|
| 199 |
+
{
|
| 200 |
+
ulong1 t; t.x = x; return t;
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
|
| 204 |
+
{
|
| 205 |
+
long2 t; t.x = x; t.y = y; return t;
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
|
| 209 |
+
{
|
| 210 |
+
ulong2 t; t.x = x; t.y = y; return t;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
|
| 214 |
+
{
|
| 215 |
+
long3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
|
| 219 |
+
{
|
| 220 |
+
ulong3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
|
| 224 |
+
{
|
| 225 |
+
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
|
| 229 |
+
{
|
| 230 |
+
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
|
| 234 |
+
{
|
| 235 |
+
float1 t; t.x = x; return t;
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
|
| 239 |
+
{
|
| 240 |
+
float2 t; t.x = x; t.y = y; return t;
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
|
| 244 |
+
{
|
| 245 |
+
float3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
|
| 249 |
+
{
|
| 250 |
+
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
|
| 254 |
+
{
|
| 255 |
+
longlong1 t; t.x = x; return t;
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
|
| 259 |
+
{
|
| 260 |
+
ulonglong1 t; t.x = x; return t;
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
|
| 264 |
+
{
|
| 265 |
+
longlong2 t; t.x = x; t.y = y; return t;
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
|
| 269 |
+
{
|
| 270 |
+
ulonglong2 t; t.x = x; t.y = y; return t;
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
|
| 274 |
+
{
|
| 275 |
+
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
|
| 279 |
+
{
|
| 280 |
+
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
|
| 284 |
+
{
|
| 285 |
+
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
|
| 289 |
+
{
|
| 290 |
+
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
|
| 294 |
+
{
|
| 295 |
+
double1 t; t.x = x; return t;
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
|
| 299 |
+
{
|
| 300 |
+
double2 t; t.x = x; t.y = y; return t;
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
|
| 304 |
+
{
|
| 305 |
+
double3 t; t.x = x; t.y = y; t.z = z; return t;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
|
| 309 |
+
{
|
| 310 |
+
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
#undef __VECTOR_FUNCTIONS_DECL__
|
| 314 |
+
|
| 315 |
+
#endif /* !__VECTOR_FUNCTIONS_HPP__ */
|
| 316 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (213 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (221 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_train.h
ADDED
|
@@ -0,0 +1,540 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/* cudnn_adv_train : cuDNN's advanced and experimental features.
|
| 51 |
+
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUDNN_ADV_TRAIN_H_)
|
| 55 |
+
#define CUDNN_ADV_TRAIN_H_
|
| 56 |
+
|
| 57 |
+
#include <cuda_runtime.h>
|
| 58 |
+
#include <stdint.h>
|
| 59 |
+
|
| 60 |
+
#include "cudnn_version.h"
|
| 61 |
+
#include "cudnn_ops_infer.h"
|
| 62 |
+
#include "cudnn_ops_train.h"
|
| 63 |
+
#include "cudnn_adv_infer.h"
|
| 64 |
+
|
| 65 |
+
/* These version numbers are autogenerated, do not edit manually. */
|
| 66 |
+
#define CUDNN_ADV_TRAIN_MAJOR 8
|
| 67 |
+
#define CUDNN_ADV_TRAIN_MINOR 7
|
| 68 |
+
#define CUDNN_ADV_TRAIN_PATCH 0
|
| 69 |
+
|
| 70 |
+
#if (CUDNN_ADV_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_TRAIN_MINOR != CUDNN_MINOR) || \
|
| 71 |
+
(CUDNN_ADV_TRAIN_PATCH != CUDNN_PATCHLEVEL)
|
| 72 |
+
#error Version mismatch in cuDNN ADV TRAIN!!!
|
| 73 |
+
#endif
|
| 74 |
+
|
| 75 |
+
#if defined(__cplusplus)
|
| 76 |
+
extern "C" {
|
| 77 |
+
#endif
|
| 78 |
+
|
| 79 |
+
typedef enum {
|
| 80 |
+
CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
|
| 81 |
+
CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
|
| 82 |
+
} cudnnWgradMode_t;
|
| 83 |
+
|
| 84 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 85 |
+
cudnnRNNForwardTraining(cudnnHandle_t handle,
|
| 86 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 87 |
+
const int seqLength,
|
| 88 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 89 |
+
const void *x,
|
| 90 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 91 |
+
const void *hx,
|
| 92 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 93 |
+
const void *cx,
|
| 94 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 95 |
+
const void *w,
|
| 96 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 97 |
+
void *y,
|
| 98 |
+
const cudnnTensorDescriptor_t hyDesc,
|
| 99 |
+
void *hy,
|
| 100 |
+
const cudnnTensorDescriptor_t cyDesc,
|
| 101 |
+
void *cy,
|
| 102 |
+
void *workSpace,
|
| 103 |
+
size_t workSpaceSizeInBytes,
|
| 104 |
+
void *reserveSpace,
|
| 105 |
+
size_t reserveSpaceSizeInBytes);
|
| 106 |
+
|
| 107 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 108 |
+
cudnnRNNBackwardData(cudnnHandle_t handle,
|
| 109 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 110 |
+
const int seqLength,
|
| 111 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 112 |
+
const void *y,
|
| 113 |
+
const cudnnTensorDescriptor_t *dyDesc,
|
| 114 |
+
const void *dy,
|
| 115 |
+
const cudnnTensorDescriptor_t dhyDesc,
|
| 116 |
+
const void *dhy,
|
| 117 |
+
const cudnnTensorDescriptor_t dcyDesc,
|
| 118 |
+
const void *dcy,
|
| 119 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 120 |
+
const void *w,
|
| 121 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 122 |
+
const void *hx,
|
| 123 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 124 |
+
const void *cx,
|
| 125 |
+
const cudnnTensorDescriptor_t *dxDesc,
|
| 126 |
+
void *dx,
|
| 127 |
+
const cudnnTensorDescriptor_t dhxDesc,
|
| 128 |
+
void *dhx,
|
| 129 |
+
const cudnnTensorDescriptor_t dcxDesc,
|
| 130 |
+
void *dcx,
|
| 131 |
+
void *workSpace,
|
| 132 |
+
size_t workSpaceSizeInBytes,
|
| 133 |
+
void *reserveSpace,
|
| 134 |
+
size_t reserveSpaceSizeInBytes);
|
| 135 |
+
|
| 136 |
+
cudnnStatus_t CUDNNWINAPI
|
| 137 |
+
cudnnRNNBackwardData_v8(cudnnHandle_t handle,
|
| 138 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 139 |
+
const int32_t devSeqLengths[],
|
| 140 |
+
cudnnRNNDataDescriptor_t yDesc,
|
| 141 |
+
const void *y,
|
| 142 |
+
const void *dy,
|
| 143 |
+
cudnnRNNDataDescriptor_t xDesc,
|
| 144 |
+
void *dx,
|
| 145 |
+
cudnnTensorDescriptor_t hDesc,
|
| 146 |
+
const void *hx,
|
| 147 |
+
const void *dhy,
|
| 148 |
+
void *dhx,
|
| 149 |
+
cudnnTensorDescriptor_t cDesc,
|
| 150 |
+
const void *cx,
|
| 151 |
+
const void *dcy,
|
| 152 |
+
void *dcx,
|
| 153 |
+
size_t weightSpaceSize,
|
| 154 |
+
const void *weightSpace,
|
| 155 |
+
size_t workSpaceSize,
|
| 156 |
+
void *workSpace,
|
| 157 |
+
size_t reserveSpaceSize,
|
| 158 |
+
void *reserveSpace);
|
| 159 |
+
|
| 160 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 161 |
+
cudnnRNNBackwardWeights(cudnnHandle_t handle,
|
| 162 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 163 |
+
const int seqLength,
|
| 164 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 165 |
+
const void *x,
|
| 166 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 167 |
+
const void *hx,
|
| 168 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 169 |
+
const void *y,
|
| 170 |
+
const void *workSpace,
|
| 171 |
+
size_t workSpaceSizeInBytes,
|
| 172 |
+
const cudnnFilterDescriptor_t dwDesc,
|
| 173 |
+
void *dw,
|
| 174 |
+
const void *reserveSpace,
|
| 175 |
+
size_t reserveSpaceSizeInBytes);
|
| 176 |
+
|
| 177 |
+
cudnnStatus_t CUDNNWINAPI
|
| 178 |
+
cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
|
| 179 |
+
cudnnRNNDescriptor_t rnnDesc,
|
| 180 |
+
cudnnWgradMode_t addGrad,
|
| 181 |
+
const int32_t devSeqLengths[],
|
| 182 |
+
cudnnRNNDataDescriptor_t xDesc,
|
| 183 |
+
const void *x,
|
| 184 |
+
cudnnTensorDescriptor_t hDesc,
|
| 185 |
+
const void *hx,
|
| 186 |
+
cudnnRNNDataDescriptor_t yDesc,
|
| 187 |
+
const void *y,
|
| 188 |
+
size_t weightSpaceSize,
|
| 189 |
+
void *dweightSpace,
|
| 190 |
+
size_t workSpaceSize,
|
| 191 |
+
void *workSpace,
|
| 192 |
+
size_t reserveSpaceSize,
|
| 193 |
+
void *reserveSpace);
|
| 194 |
+
|
| 195 |
+
/* RNN EX API */
|
| 196 |
+
|
| 197 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 198 |
+
cudnnRNNForwardTrainingEx(cudnnHandle_t handle,
|
| 199 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 200 |
+
const cudnnRNNDataDescriptor_t xDesc,
|
| 201 |
+
const void *x,
|
| 202 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 203 |
+
const void *hx,
|
| 204 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 205 |
+
const void *cx,
|
| 206 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 207 |
+
const void *w,
|
| 208 |
+
const cudnnRNNDataDescriptor_t yDesc,
|
| 209 |
+
void *y,
|
| 210 |
+
const cudnnTensorDescriptor_t hyDesc,
|
| 211 |
+
void *hy,
|
| 212 |
+
const cudnnTensorDescriptor_t cyDesc,
|
| 213 |
+
void *cy,
|
| 214 |
+
const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
|
| 215 |
+
const void *keys, /* reserved, should pass NULL */
|
| 216 |
+
const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
|
| 217 |
+
void *cAttn, /* reserved, should pass NULL */
|
| 218 |
+
const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
|
| 219 |
+
void *iAttn, /* reserved, should pass NULL */
|
| 220 |
+
const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
|
| 221 |
+
void *queries, /* reserved, should pass NULL */
|
| 222 |
+
void *workSpace,
|
| 223 |
+
size_t workSpaceSizeInBytes,
|
| 224 |
+
void *reserveSpace,
|
| 225 |
+
size_t reserveSpaceSizeInBytes);
|
| 226 |
+
|
| 227 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 228 |
+
cudnnRNNBackwardDataEx(cudnnHandle_t handle,
|
| 229 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 230 |
+
const cudnnRNNDataDescriptor_t yDesc,
|
| 231 |
+
const void *y,
|
| 232 |
+
const cudnnRNNDataDescriptor_t dyDesc,
|
| 233 |
+
const void *dy,
|
| 234 |
+
const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */
|
| 235 |
+
const void *dcAttn, /* reserved, should pass NULL */
|
| 236 |
+
const cudnnTensorDescriptor_t dhyDesc,
|
| 237 |
+
const void *dhy,
|
| 238 |
+
const cudnnTensorDescriptor_t dcyDesc,
|
| 239 |
+
const void *dcy,
|
| 240 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 241 |
+
const void *w,
|
| 242 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 243 |
+
const void *hx,
|
| 244 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 245 |
+
const void *cx,
|
| 246 |
+
const cudnnRNNDataDescriptor_t dxDesc,
|
| 247 |
+
void *dx,
|
| 248 |
+
const cudnnTensorDescriptor_t dhxDesc,
|
| 249 |
+
void *dhx,
|
| 250 |
+
const cudnnTensorDescriptor_t dcxDesc,
|
| 251 |
+
void *dcx,
|
| 252 |
+
const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */
|
| 253 |
+
void *dkeys, /* reserved, should pass NULL */
|
| 254 |
+
void *workSpace,
|
| 255 |
+
size_t workSpaceSizeInBytes,
|
| 256 |
+
void *reserveSpace,
|
| 257 |
+
size_t reserveSpaceSizeInBytes);
|
| 258 |
+
|
| 259 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 260 |
+
cudnnRNNBackwardWeightsEx(cudnnHandle_t handle,
|
| 261 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 262 |
+
const cudnnRNNDataDescriptor_t xDesc,
|
| 263 |
+
const void *x,
|
| 264 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 265 |
+
const void *hx,
|
| 266 |
+
const cudnnRNNDataDescriptor_t yDesc,
|
| 267 |
+
const void *y,
|
| 268 |
+
void *workSpace,
|
| 269 |
+
size_t workSpaceSizeInBytes,
|
| 270 |
+
const cudnnFilterDescriptor_t dwDesc,
|
| 271 |
+
void *dw,
|
| 272 |
+
void *reserveSpace,
|
| 273 |
+
size_t reserveSpaceSizeInBytes);
|
| 274 |
+
|
| 275 |
+
/* RNN FIND API */
|
| 276 |
+
|
| 277 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 278 |
+
cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
|
| 279 |
+
|
| 280 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 281 |
+
cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle,
|
| 282 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 283 |
+
const int seqLength,
|
| 284 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 285 |
+
const void *x,
|
| 286 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 287 |
+
const void *hx,
|
| 288 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 289 |
+
const void *cx,
|
| 290 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 291 |
+
const void *w,
|
| 292 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 293 |
+
void *y,
|
| 294 |
+
const cudnnTensorDescriptor_t hyDesc,
|
| 295 |
+
void *hy,
|
| 296 |
+
const cudnnTensorDescriptor_t cyDesc,
|
| 297 |
+
void *cy,
|
| 298 |
+
const float findIntensity,
|
| 299 |
+
const int requestedAlgoCount,
|
| 300 |
+
int *returnedAlgoCount,
|
| 301 |
+
cudnnAlgorithmPerformance_t *perfResults,
|
| 302 |
+
void *workspace,
|
| 303 |
+
size_t workSpaceSizeInBytes,
|
| 304 |
+
void *reserveSpace,
|
| 305 |
+
size_t reserveSpaceSizeInBytes);
|
| 306 |
+
|
| 307 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 308 |
+
cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
|
| 309 |
+
|
| 310 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 311 |
+
cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle,
|
| 312 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 313 |
+
const int seqLength,
|
| 314 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 315 |
+
const void *y,
|
| 316 |
+
const cudnnTensorDescriptor_t *dyDesc,
|
| 317 |
+
const void *dy,
|
| 318 |
+
const cudnnTensorDescriptor_t dhyDesc,
|
| 319 |
+
const void *dhy,
|
| 320 |
+
const cudnnTensorDescriptor_t dcyDesc,
|
| 321 |
+
const void *dcy,
|
| 322 |
+
const cudnnFilterDescriptor_t wDesc,
|
| 323 |
+
const void *w,
|
| 324 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 325 |
+
const void *hx,
|
| 326 |
+
const cudnnTensorDescriptor_t cxDesc,
|
| 327 |
+
const void *cx,
|
| 328 |
+
const cudnnTensorDescriptor_t *dxDesc,
|
| 329 |
+
void *dx,
|
| 330 |
+
const cudnnTensorDescriptor_t dhxDesc,
|
| 331 |
+
void *dhx,
|
| 332 |
+
const cudnnTensorDescriptor_t dcxDesc,
|
| 333 |
+
void *dcx,
|
| 334 |
+
const float findIntensity,
|
| 335 |
+
const int requestedAlgoCount,
|
| 336 |
+
int *returnedAlgoCount,
|
| 337 |
+
cudnnAlgorithmPerformance_t *perfResults,
|
| 338 |
+
void *workspace,
|
| 339 |
+
size_t workSpaceSizeInBytes,
|
| 340 |
+
void *reserveSpace,
|
| 341 |
+
size_t reserveSpaceSizeInBytes);
|
| 342 |
+
|
| 343 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 344 |
+
cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
|
| 345 |
+
|
| 346 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 347 |
+
cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle,
|
| 348 |
+
const cudnnRNNDescriptor_t rnnDesc,
|
| 349 |
+
const int seqLength,
|
| 350 |
+
const cudnnTensorDescriptor_t *xDesc,
|
| 351 |
+
const void *x,
|
| 352 |
+
const cudnnTensorDescriptor_t hxDesc,
|
| 353 |
+
const void *hx,
|
| 354 |
+
const cudnnTensorDescriptor_t *yDesc,
|
| 355 |
+
const void *y,
|
| 356 |
+
const float findIntensity,
|
| 357 |
+
const int requestedAlgoCount,
|
| 358 |
+
int *returnedAlgoCount,
|
| 359 |
+
cudnnAlgorithmPerformance_t *perfResults,
|
| 360 |
+
const void *workspace,
|
| 361 |
+
size_t workSpaceSizeInBytes,
|
| 362 |
+
const cudnnFilterDescriptor_t dwDesc,
|
| 363 |
+
void *dw,
|
| 364 |
+
const void *reserveSpace,
|
| 365 |
+
size_t reserveSpaceSizeInBytes);
|
| 366 |
+
|
| 367 |
+
cudnnStatus_t CUDNNWINAPI
|
| 368 |
+
cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
|
| 369 |
+
const cudnnAttnDescriptor_t attnDesc,
|
| 370 |
+
const int loWinIdx[],
|
| 371 |
+
const int hiWinIdx[],
|
| 372 |
+
const int devSeqLengthsDQDO[],
|
| 373 |
+
const int devSeqLengthsDKDV[],
|
| 374 |
+
const cudnnSeqDataDescriptor_t doDesc,
|
| 375 |
+
const void *dout,
|
| 376 |
+
const cudnnSeqDataDescriptor_t dqDesc,
|
| 377 |
+
void *dqueries,
|
| 378 |
+
const void *queries,
|
| 379 |
+
const cudnnSeqDataDescriptor_t dkDesc,
|
| 380 |
+
void *dkeys,
|
| 381 |
+
const void *keys,
|
| 382 |
+
const cudnnSeqDataDescriptor_t dvDesc,
|
| 383 |
+
void *dvalues,
|
| 384 |
+
const void *values,
|
| 385 |
+
size_t weightSizeInBytes,
|
| 386 |
+
const void *weights,
|
| 387 |
+
size_t workSpaceSizeInBytes,
|
| 388 |
+
void *workSpace,
|
| 389 |
+
size_t reserveSpaceSizeInBytes,
|
| 390 |
+
void *reserveSpace);
|
| 391 |
+
|
| 392 |
+
cudnnStatus_t CUDNNWINAPI
|
| 393 |
+
cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
|
| 394 |
+
const cudnnAttnDescriptor_t attnDesc,
|
| 395 |
+
cudnnWgradMode_t addGrad,
|
| 396 |
+
const cudnnSeqDataDescriptor_t qDesc,
|
| 397 |
+
const void *queries,
|
| 398 |
+
const cudnnSeqDataDescriptor_t kDesc,
|
| 399 |
+
const void *keys,
|
| 400 |
+
const cudnnSeqDataDescriptor_t vDesc,
|
| 401 |
+
const void *values,
|
| 402 |
+
const cudnnSeqDataDescriptor_t doDesc,
|
| 403 |
+
const void *dout,
|
| 404 |
+
size_t weightSizeInBytes,
|
| 405 |
+
const void *weights,
|
| 406 |
+
void *dweights,
|
| 407 |
+
size_t workSpaceSizeInBytes,
|
| 408 |
+
void *workSpace,
|
| 409 |
+
size_t reserveSpaceSizeInBytes,
|
| 410 |
+
void *reserveSpace);
|
| 411 |
+
|
| 412 |
+
/*
|
| 413 |
+
* CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
|
| 414 |
+
*/
|
| 415 |
+
/* Input normalization mode for loss function */
|
| 416 |
+
typedef enum {
|
| 417 |
+
CUDNN_LOSS_NORMALIZATION_NONE = 0,
|
| 418 |
+
CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
|
| 419 |
+
} cudnnLossNormalizationMode_t;
|
| 420 |
+
|
| 421 |
+
cudnnStatus_t CUDNNWINAPI
|
| 422 |
+
cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
|
| 423 |
+
|
| 424 |
+
cudnnStatus_t CUDNNWINAPI
|
| 425 |
+
cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
|
| 426 |
+
|
| 427 |
+
cudnnStatus_t CUDNNWINAPI
|
| 428 |
+
cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 429 |
+
cudnnDataType_t compType,
|
| 430 |
+
cudnnLossNormalizationMode_t normMode,
|
| 431 |
+
cudnnNanPropagation_t gradMode);
|
| 432 |
+
|
| 433 |
+
cudnnStatus_t CUDNNWINAPI
|
| 434 |
+
cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 435 |
+
cudnnDataType_t compType,
|
| 436 |
+
cudnnLossNormalizationMode_t normMode,
|
| 437 |
+
cudnnNanPropagation_t gradMode,
|
| 438 |
+
int maxLabelLength);
|
| 439 |
+
|
| 440 |
+
cudnnStatus_t CUDNNWINAPI
|
| 441 |
+
cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
|
| 442 |
+
|
| 443 |
+
cudnnStatus_t CUDNNWINAPI
|
| 444 |
+
cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 445 |
+
cudnnDataType_t *compType,
|
| 446 |
+
cudnnLossNormalizationMode_t *normMode,
|
| 447 |
+
cudnnNanPropagation_t *gradMode);
|
| 448 |
+
|
| 449 |
+
cudnnStatus_t CUDNNWINAPI
|
| 450 |
+
cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 451 |
+
cudnnDataType_t *compType,
|
| 452 |
+
cudnnLossNormalizationMode_t *normMode,
|
| 453 |
+
cudnnNanPropagation_t *gradMode,
|
| 454 |
+
int *maxLabelLength);
|
| 455 |
+
|
| 456 |
+
cudnnStatus_t CUDNNWINAPI
|
| 457 |
+
cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
|
| 458 |
+
|
| 459 |
+
/* return the ctc costs and gradients, given the probabilities and labels */
|
| 460 |
+
cudnnStatus_t CUDNNWINAPI
|
| 461 |
+
cudnnCTCLoss(
|
| 462 |
+
cudnnHandle_t handle,
|
| 463 |
+
const cudnnTensorDescriptor_t
|
| 464 |
+
probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
|
| 465 |
+
mini batch size, A is the alphabet size) */
|
| 466 |
+
const void *probs, /* probabilities after softmax, in GPU memory */
|
| 467 |
+
const int hostLabels[], /* labels, in CPU memory */
|
| 468 |
+
const int hostLabelLengths[], /* the length of each label, in CPU memory */
|
| 469 |
+
const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
|
| 470 |
+
void *costs, /* the returned costs of CTC, in GPU memory */
|
| 471 |
+
const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
|
| 472 |
+
void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
|
| 473 |
+
cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
|
| 474 |
+
cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 475 |
+
void *workspace, /* pointer to the workspace, in GPU memory */
|
| 476 |
+
size_t workSpaceSizeInBytes); /* size of the workspace */
|
| 477 |
+
|
| 478 |
+
/* return the ctc costs and gradients, given the probabilities and labels */
|
| 479 |
+
cudnnStatus_t CUDNNWINAPI
|
| 480 |
+
cudnnCTCLoss_v8(
|
| 481 |
+
cudnnHandle_t handle,
|
| 482 |
+
cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
|
| 483 |
+
cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 484 |
+
const cudnnTensorDescriptor_t
|
| 485 |
+
probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
|
| 486 |
+
mini batch size, A is the alphabet size) */
|
| 487 |
+
const void *probs, /* probabilities after softmax, in GPU memory */
|
| 488 |
+
const int labels[], /* labels, in GPU memory */
|
| 489 |
+
const int labelLengths[], /* the length of each label, in GPU memory */
|
| 490 |
+
const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
|
| 491 |
+
void *costs, /* the returned costs of CTC, in GPU memory */
|
| 492 |
+
const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
|
| 493 |
+
void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
|
| 494 |
+
size_t workSpaceSizeInBytes, /* size of the workspace */
|
| 495 |
+
void *workspace); /* pointer to the workspace, in GPU memory */
|
| 496 |
+
|
| 497 |
+
/* return the workspace size needed for ctc */
|
| 498 |
+
cudnnStatus_t CUDNNWINAPI
|
| 499 |
+
cudnnGetCTCLossWorkspaceSize(
|
| 500 |
+
cudnnHandle_t handle,
|
| 501 |
+
const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
|
| 502 |
+
timing steps, N is the mini batch size, A is the alphabet size) */
|
| 503 |
+
const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
|
| 504 |
+
dimensions are T,N,A. To compute costs
|
| 505 |
+
only, set it to NULL */
|
| 506 |
+
const int *labels, /* labels, in CPU memory */
|
| 507 |
+
const int *labelLengths, /* the length of each label, in CPU memory */
|
| 508 |
+
const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
|
| 509 |
+
cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
|
| 510 |
+
cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 511 |
+
size_t *sizeInBytes); /* pointer to the returned workspace size */
|
| 512 |
+
|
| 513 |
+
/* return the workspace size needed for ctc */
|
| 514 |
+
cudnnStatus_t CUDNNWINAPI
|
| 515 |
+
cudnnGetCTCLossWorkspaceSize_v8(
|
| 516 |
+
cudnnHandle_t handle,
|
| 517 |
+
cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
|
| 518 |
+
cudnnCTCLossDescriptor_t ctcLossDesc,
|
| 519 |
+
const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
|
| 520 |
+
timing steps, N is the mini batch size, A is the alphabet size) */
|
| 521 |
+
const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
|
| 522 |
+
dimensions are T,N,A. To compute costs
|
| 523 |
+
only, set it to NULL */
|
| 524 |
+
size_t *sizeInBytes); /* pointer to the returned workspace size */
|
| 525 |
+
|
| 526 |
+
/*
|
| 527 |
+
* \brief Cross-library version checker.
|
| 528 |
+
* This function is implemented differently in each sub-library. Each sublib
|
| 529 |
+
* checks whether its own version matches that of its dependencies.
|
| 530 |
+
* \returns CUDNN_STATUS_SUCCESS if the version check passes,
|
| 531 |
+
* CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
|
| 532 |
+
*/
|
| 533 |
+
cudnnStatus_t CUDNNWINAPI
|
| 534 |
+
cudnnAdvTrainVersionCheck(void);
|
| 535 |
+
|
| 536 |
+
#if defined(__cplusplus)
|
| 537 |
+
}
|
| 538 |
+
#endif
|
| 539 |
+
|
| 540 |
+
#endif /* CUDNN_ADV_TRAIN_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend.h
ADDED
|
@@ -0,0 +1,600 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDNN_BACKEND_H_
|
| 51 |
+
#define _CUDNN_BACKEND_H_
|
| 52 |
+
|
| 53 |
+
/*
|
| 54 |
+
* The content in this header file is under development to be included in cudnn.h in the future
|
| 55 |
+
* Production code should have all include of this header file remove.
|
| 56 |
+
*/
|
| 57 |
+
|
| 58 |
+
#include "cudnn_ops_infer.h"
|
| 59 |
+
#include "cudnn_cnn_infer.h"
|
| 60 |
+
|
| 61 |
+
/* NOTE: definition in extern "C" to be copied later to public header */
|
| 62 |
+
#if defined(__cplusplus)
|
| 63 |
+
extern "C" {
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
typedef void *cudnnBackendDescriptor_t;
|
| 67 |
+
|
| 68 |
+
typedef struct cudnnFractionStruct {
|
| 69 |
+
int64_t numerator;
|
| 70 |
+
int64_t denominator;
|
| 71 |
+
} cudnnFraction_t;
|
| 72 |
+
|
| 73 |
+
typedef enum {
|
| 74 |
+
CUDNN_POINTWISE_ADD = 0,
|
| 75 |
+
CUDNN_POINTWISE_ADD_SQUARE = 5,
|
| 76 |
+
CUDNN_POINTWISE_DIV = 6,
|
| 77 |
+
CUDNN_POINTWISE_MAX = 3,
|
| 78 |
+
CUDNN_POINTWISE_MIN = 2,
|
| 79 |
+
CUDNN_POINTWISE_MOD = 7,
|
| 80 |
+
CUDNN_POINTWISE_MUL = 1,
|
| 81 |
+
CUDNN_POINTWISE_POW = 8,
|
| 82 |
+
CUDNN_POINTWISE_SUB = 9,
|
| 83 |
+
|
| 84 |
+
CUDNN_POINTWISE_ABS = 10,
|
| 85 |
+
CUDNN_POINTWISE_CEIL = 11,
|
| 86 |
+
CUDNN_POINTWISE_COS = 12,
|
| 87 |
+
CUDNN_POINTWISE_EXP = 13,
|
| 88 |
+
CUDNN_POINTWISE_FLOOR = 14,
|
| 89 |
+
CUDNN_POINTWISE_LOG = 15,
|
| 90 |
+
CUDNN_POINTWISE_NEG = 16,
|
| 91 |
+
CUDNN_POINTWISE_RSQRT = 17,
|
| 92 |
+
CUDNN_POINTWISE_SIN = 18,
|
| 93 |
+
CUDNN_POINTWISE_SQRT = 4,
|
| 94 |
+
CUDNN_POINTWISE_TAN = 19,
|
| 95 |
+
CUDNN_POINTWISE_ERF = 20,
|
| 96 |
+
CUDNN_POINTWISE_IDENTITY = 21,
|
| 97 |
+
|
| 98 |
+
CUDNN_POINTWISE_RELU_FWD = 100,
|
| 99 |
+
CUDNN_POINTWISE_TANH_FWD = 101,
|
| 100 |
+
CUDNN_POINTWISE_SIGMOID_FWD = 102,
|
| 101 |
+
CUDNN_POINTWISE_ELU_FWD = 103,
|
| 102 |
+
CUDNN_POINTWISE_GELU_FWD = 104,
|
| 103 |
+
CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
|
| 104 |
+
CUDNN_POINTWISE_SWISH_FWD = 106,
|
| 105 |
+
CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
|
| 106 |
+
|
| 107 |
+
CUDNN_POINTWISE_RELU_BWD = 200,
|
| 108 |
+
CUDNN_POINTWISE_TANH_BWD = 201,
|
| 109 |
+
CUDNN_POINTWISE_SIGMOID_BWD = 202,
|
| 110 |
+
CUDNN_POINTWISE_ELU_BWD = 203,
|
| 111 |
+
CUDNN_POINTWISE_GELU_BWD = 204,
|
| 112 |
+
CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
|
| 113 |
+
CUDNN_POINTWISE_SWISH_BWD = 206,
|
| 114 |
+
CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
|
| 115 |
+
|
| 116 |
+
CUDNN_POINTWISE_CMP_EQ = 300,
|
| 117 |
+
CUDNN_POINTWISE_CMP_NEQ = 301,
|
| 118 |
+
CUDNN_POINTWISE_CMP_GT = 302,
|
| 119 |
+
CUDNN_POINTWISE_CMP_GE = 303,
|
| 120 |
+
CUDNN_POINTWISE_CMP_LT = 304,
|
| 121 |
+
CUDNN_POINTWISE_CMP_LE = 305,
|
| 122 |
+
|
| 123 |
+
CUDNN_POINTWISE_LOGICAL_AND = 400,
|
| 124 |
+
CUDNN_POINTWISE_LOGICAL_OR = 401,
|
| 125 |
+
CUDNN_POINTWISE_LOGICAL_NOT = 402,
|
| 126 |
+
|
| 127 |
+
CUDNN_POINTWISE_GEN_INDEX = 501,
|
| 128 |
+
|
| 129 |
+
CUDNN_POINTWISE_BINARY_SELECT = 601,
|
| 130 |
+
} cudnnPointwiseMode_t;
|
| 131 |
+
|
| 132 |
+
typedef enum {
|
| 133 |
+
CUDNN_RESAMPLE_NEAREST = 0,
|
| 134 |
+
CUDNN_RESAMPLE_BILINEAR = 1,
|
| 135 |
+
CUDNN_RESAMPLE_AVGPOOL = 2,
|
| 136 |
+
CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
|
| 137 |
+
CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
|
| 138 |
+
CUDNN_RESAMPLE_MAXPOOL = 3,
|
| 139 |
+
} cudnnResampleMode_t;
|
| 140 |
+
|
| 141 |
+
typedef enum {
|
| 142 |
+
CUDNN_SIGNAL_SET = 0,
|
| 143 |
+
CUDNN_SIGNAL_WAIT = 1,
|
| 144 |
+
} cudnnSignalMode_t;
|
| 145 |
+
|
| 146 |
+
typedef enum {
|
| 147 |
+
CUDNN_GENSTATS_SUM_SQSUM = 0,
|
| 148 |
+
} cudnnGenStatsMode_t;
|
| 149 |
+
|
| 150 |
+
typedef enum {
|
| 151 |
+
CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
|
| 152 |
+
CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
|
| 153 |
+
} cudnnBnFinalizeStatsMode_t;
|
| 154 |
+
|
| 155 |
+
typedef enum {
|
| 156 |
+
CUDNN_RNG_DISTRIBUTION_BERNOULLI,
|
| 157 |
+
CUDNN_RNG_DISTRIBUTION_UNIFORM,
|
| 158 |
+
CUDNN_RNG_DISTRIBUTION_NORMAL,
|
| 159 |
+
} cudnnRngDistribution_t;
|
| 160 |
+
|
| 161 |
+
typedef enum {
|
| 162 |
+
CUDNN_ATTR_POINTWISE_MODE = 0,
|
| 163 |
+
CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
|
| 164 |
+
CUDNN_ATTR_POINTWISE_NAN_PROPAGATION = 2,
|
| 165 |
+
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
|
| 166 |
+
CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
|
| 167 |
+
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
|
| 168 |
+
CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
|
| 169 |
+
CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
|
| 170 |
+
CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
|
| 171 |
+
CUDNN_ATTR_POINTWISE_AXIS = 9,
|
| 172 |
+
|
| 173 |
+
CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
|
| 174 |
+
CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
|
| 175 |
+
CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
|
| 176 |
+
CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
|
| 177 |
+
CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
|
| 178 |
+
CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
|
| 179 |
+
CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
|
| 180 |
+
|
| 181 |
+
CUDNN_ATTR_ENGINEHEUR_MODE = 200,
|
| 182 |
+
CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
|
| 183 |
+
CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
|
| 184 |
+
|
| 185 |
+
CUDNN_ATTR_ENGINECFG_ENGINE = 300,
|
| 186 |
+
CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
|
| 187 |
+
CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
|
| 188 |
+
|
| 189 |
+
CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400,
|
| 190 |
+
CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
|
| 191 |
+
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
|
| 192 |
+
CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
|
| 193 |
+
CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
|
| 194 |
+
CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
|
| 195 |
+
|
| 196 |
+
CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
|
| 197 |
+
CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
|
| 198 |
+
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
|
| 199 |
+
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
|
| 200 |
+
|
| 201 |
+
CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
|
| 202 |
+
CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
|
| 203 |
+
|
| 204 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
|
| 205 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
|
| 206 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
|
| 207 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
|
| 208 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
|
| 209 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
|
| 210 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
|
| 211 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
|
| 212 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
|
| 213 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
|
| 214 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
|
| 215 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
|
| 216 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
|
| 217 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
|
| 218 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
|
| 219 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
|
| 220 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
|
| 221 |
+
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
|
| 222 |
+
|
| 223 |
+
CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
|
| 224 |
+
CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
|
| 225 |
+
CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
|
| 226 |
+
CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
|
| 227 |
+
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
|
| 228 |
+
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
|
| 229 |
+
CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
|
| 230 |
+
CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
|
| 231 |
+
CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
|
| 232 |
+
|
| 233 |
+
CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
|
| 234 |
+
CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
|
| 235 |
+
CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
|
| 236 |
+
CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
|
| 237 |
+
CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
|
| 238 |
+
|
| 239 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
|
| 240 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
|
| 241 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
|
| 242 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
|
| 243 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
|
| 244 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
|
| 245 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
|
| 246 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
|
| 247 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
|
| 248 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
|
| 249 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
|
| 250 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
|
| 251 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
|
| 252 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
|
| 253 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
|
| 254 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
|
| 255 |
+
CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
|
| 256 |
+
|
| 257 |
+
CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800,
|
| 258 |
+
CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
|
| 259 |
+
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
|
| 260 |
+
|
| 261 |
+
CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
|
| 262 |
+
CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
|
| 263 |
+
CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
|
| 264 |
+
CUDNN_ATTR_TENSOR_STRIDES = 903,
|
| 265 |
+
CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
|
| 266 |
+
CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
|
| 267 |
+
CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
|
| 268 |
+
CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
|
| 269 |
+
CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
|
| 270 |
+
CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
|
| 271 |
+
|
| 272 |
+
CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
|
| 273 |
+
CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
|
| 274 |
+
CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
|
| 275 |
+
CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
|
| 276 |
+
|
| 277 |
+
CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
|
| 278 |
+
CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
|
| 279 |
+
|
| 280 |
+
CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
|
| 281 |
+
CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
|
| 282 |
+
CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
|
| 283 |
+
CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
|
| 284 |
+
|
| 285 |
+
CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
|
| 286 |
+
CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
|
| 287 |
+
CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
|
| 288 |
+
CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
|
| 289 |
+
CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
|
| 290 |
+
CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
|
| 291 |
+
|
| 292 |
+
CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
|
| 293 |
+
|
| 294 |
+
CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
|
| 295 |
+
CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
|
| 296 |
+
CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
|
| 297 |
+
CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
|
| 298 |
+
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524,
|
| 299 |
+
CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
|
| 300 |
+
CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
|
| 301 |
+
CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
|
| 302 |
+
|
| 303 |
+
CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
|
| 304 |
+
CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
|
| 305 |
+
|
| 306 |
+
CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
|
| 307 |
+
CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
|
| 308 |
+
CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
|
| 309 |
+
|
| 310 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
|
| 311 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
|
| 312 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
|
| 313 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
|
| 314 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
|
| 315 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
|
| 316 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
|
| 317 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
|
| 318 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
|
| 319 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
|
| 320 |
+
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
|
| 321 |
+
|
| 322 |
+
CUDNN_ATTR_RESAMPLE_MODE = 1700,
|
| 323 |
+
CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
|
| 324 |
+
CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
|
| 325 |
+
CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
|
| 326 |
+
CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
|
| 327 |
+
CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
|
| 328 |
+
CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
|
| 329 |
+
CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
|
| 330 |
+
CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
|
| 331 |
+
|
| 332 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
|
| 333 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
|
| 334 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
|
| 335 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA = 1713,
|
| 336 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA = 1714,
|
| 337 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
|
| 338 |
+
|
| 339 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
|
| 340 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
|
| 341 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
|
| 342 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA = 1723,
|
| 343 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA = 1724,
|
| 344 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
|
| 345 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
|
| 346 |
+
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
|
| 347 |
+
|
| 348 |
+
CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
|
| 349 |
+
CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
|
| 350 |
+
CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
|
| 351 |
+
CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
|
| 352 |
+
|
| 353 |
+
CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
|
| 354 |
+
CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
|
| 355 |
+
CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
|
| 356 |
+
CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
|
| 357 |
+
CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
|
| 358 |
+
|
| 359 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
|
| 360 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
|
| 361 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
|
| 362 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
|
| 363 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
|
| 364 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
|
| 365 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
|
| 366 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
|
| 367 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
|
| 368 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
|
| 369 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
|
| 370 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
|
| 371 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
|
| 372 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
|
| 373 |
+
CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
|
| 374 |
+
|
| 375 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
|
| 376 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
|
| 377 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
|
| 378 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
|
| 379 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
|
| 380 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
|
| 381 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
|
| 382 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
|
| 383 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
|
| 384 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
|
| 385 |
+
CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
|
| 386 |
+
|
| 387 |
+
CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
|
| 388 |
+
CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
|
| 389 |
+
|
| 390 |
+
CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
|
| 391 |
+
CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
|
| 392 |
+
CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
|
| 393 |
+
CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
|
| 394 |
+
CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
|
| 395 |
+
CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
|
| 396 |
+
|
| 397 |
+
CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
|
| 398 |
+
CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
|
| 399 |
+
CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
|
| 400 |
+
|
| 401 |
+
} cudnnBackendAttributeName_t;
|
| 402 |
+
|
| 403 |
+
typedef enum {
|
| 404 |
+
CUDNN_TYPE_HANDLE = 0,
|
| 405 |
+
CUDNN_TYPE_DATA_TYPE,
|
| 406 |
+
CUDNN_TYPE_BOOLEAN,
|
| 407 |
+
CUDNN_TYPE_INT64,
|
| 408 |
+
CUDNN_TYPE_FLOAT,
|
| 409 |
+
CUDNN_TYPE_DOUBLE,
|
| 410 |
+
CUDNN_TYPE_VOID_PTR,
|
| 411 |
+
CUDNN_TYPE_CONVOLUTION_MODE,
|
| 412 |
+
CUDNN_TYPE_HEUR_MODE,
|
| 413 |
+
CUDNN_TYPE_KNOB_TYPE,
|
| 414 |
+
CUDNN_TYPE_NAN_PROPOGATION,
|
| 415 |
+
CUDNN_TYPE_NUMERICAL_NOTE,
|
| 416 |
+
CUDNN_TYPE_LAYOUT_TYPE,
|
| 417 |
+
CUDNN_TYPE_ATTRIB_NAME,
|
| 418 |
+
CUDNN_TYPE_POINTWISE_MODE,
|
| 419 |
+
CUDNN_TYPE_BACKEND_DESCRIPTOR,
|
| 420 |
+
CUDNN_TYPE_GENSTATS_MODE,
|
| 421 |
+
CUDNN_TYPE_BN_FINALIZE_STATS_MODE,
|
| 422 |
+
CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,
|
| 423 |
+
CUDNN_TYPE_BEHAVIOR_NOTE,
|
| 424 |
+
CUDNN_TYPE_TENSOR_REORDERING_MODE,
|
| 425 |
+
CUDNN_TYPE_RESAMPLE_MODE,
|
| 426 |
+
CUDNN_TYPE_PADDING_MODE,
|
| 427 |
+
CUDNN_TYPE_INT32,
|
| 428 |
+
CUDNN_TYPE_CHAR,
|
| 429 |
+
CUDNN_TYPE_SIGNAL_MODE,
|
| 430 |
+
CUDNN_TYPE_FRACTION,
|
| 431 |
+
CUDNN_TYPE_NORM_MODE,
|
| 432 |
+
CUDNN_TYPE_NORM_FWD_PHASE,
|
| 433 |
+
CUDNN_TYPE_RNG_DISTRIBUTION
|
| 434 |
+
} cudnnBackendAttributeType_t;
|
| 435 |
+
|
| 436 |
+
typedef enum {
|
| 437 |
+
CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
|
| 438 |
+
CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR,
|
| 439 |
+
CUDNN_BACKEND_ENGINE_DESCRIPTOR,
|
| 440 |
+
CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
|
| 441 |
+
CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
|
| 442 |
+
CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
|
| 443 |
+
CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR,
|
| 444 |
+
CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR,
|
| 445 |
+
CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR,
|
| 446 |
+
CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR,
|
| 447 |
+
CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
|
| 448 |
+
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
|
| 449 |
+
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
|
| 450 |
+
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR,
|
| 451 |
+
CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR,
|
| 452 |
+
CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
|
| 453 |
+
CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR,
|
| 454 |
+
CUDNN_BACKEND_TENSOR_DESCRIPTOR,
|
| 455 |
+
CUDNN_BACKEND_MATMUL_DESCRIPTOR,
|
| 456 |
+
CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
|
| 457 |
+
CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR,
|
| 458 |
+
CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
|
| 459 |
+
CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
|
| 460 |
+
CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR,
|
| 461 |
+
CUDNN_BACKEND_RESAMPLE_DESCRIPTOR,
|
| 462 |
+
CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR,
|
| 463 |
+
CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR,
|
| 464 |
+
CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR,
|
| 465 |
+
CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR,
|
| 466 |
+
CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR,
|
| 467 |
+
CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR,
|
| 468 |
+
CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR,
|
| 469 |
+
CUDNN_BACKEND_RNG_DESCRIPTOR,
|
| 470 |
+
CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR
|
| 471 |
+
} cudnnBackendDescriptorType_t;
|
| 472 |
+
|
| 473 |
+
typedef enum {
|
| 474 |
+
CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
|
| 475 |
+
CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS,
|
| 476 |
+
CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION,
|
| 477 |
+
CUDNN_NUMERICAL_NOTE_FFT,
|
| 478 |
+
CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC,
|
| 479 |
+
CUDNN_NUMERICAL_NOTE_WINOGRAD,
|
| 480 |
+
CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4,
|
| 481 |
+
CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6,
|
| 482 |
+
CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13,
|
| 483 |
+
CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
|
| 484 |
+
} cudnnBackendNumericalNote_t;
|
| 485 |
+
|
| 486 |
+
typedef enum {
|
| 487 |
+
CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
|
| 488 |
+
CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
|
| 489 |
+
CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
|
| 490 |
+
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
|
| 491 |
+
} cudnnBackendBehaviorNote_t;
|
| 492 |
+
|
| 493 |
+
typedef enum {
|
| 494 |
+
CUDNN_KNOB_TYPE_SPLIT_K = 0,
|
| 495 |
+
CUDNN_KNOB_TYPE_SWIZZLE = 1,
|
| 496 |
+
CUDNN_KNOB_TYPE_TILE_SIZE = 2,
|
| 497 |
+
CUDNN_KNOB_TYPE_USE_TEX = 3,
|
| 498 |
+
CUDNN_KNOB_TYPE_EDGE = 4,
|
| 499 |
+
CUDNN_KNOB_TYPE_KBLOCK = 5,
|
| 500 |
+
CUDNN_KNOB_TYPE_LDGA = 6,
|
| 501 |
+
CUDNN_KNOB_TYPE_LDGB = 7,
|
| 502 |
+
CUDNN_KNOB_TYPE_CHUNK_K = 8,
|
| 503 |
+
CUDNN_KNOB_TYPE_SPLIT_H = 9,
|
| 504 |
+
CUDNN_KNOB_TYPE_WINO_TILE = 10,
|
| 505 |
+
CUDNN_KNOB_TYPE_MULTIPLY = 11,
|
| 506 |
+
CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
|
| 507 |
+
CUDNN_KNOB_TYPE_TILEK = 13,
|
| 508 |
+
CUDNN_KNOB_TYPE_STAGES = 14,
|
| 509 |
+
CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
|
| 510 |
+
CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16,
|
| 511 |
+
CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
|
| 512 |
+
CUDNN_KNOB_TYPE_IDX_MODE = 18,
|
| 513 |
+
CUDNN_KNOB_TYPE_SLICED = 19,
|
| 514 |
+
CUDNN_KNOB_TYPE_SPLIT_RS = 20,
|
| 515 |
+
CUDNN_KNOB_TYPE_SINGLEBUFFER = 21,
|
| 516 |
+
CUDNN_KNOB_TYPE_LDGC = 22,
|
| 517 |
+
CUDNN_KNOB_TYPE_SPECFILT = 23,
|
| 518 |
+
CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
|
| 519 |
+
CUDNN_KNOB_TYPE_WORKSPACE = 25,
|
| 520 |
+
CUDNN_KNOB_TYPE_TILE_CGA = 26,
|
| 521 |
+
CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
|
| 522 |
+
CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
|
| 523 |
+
|
| 524 |
+
CUDNN_KNOB_TYPE_COUNTS,
|
| 525 |
+
} cudnnBackendKnobType_t;
|
| 526 |
+
|
| 527 |
+
typedef enum {
|
| 528 |
+
CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
|
| 529 |
+
CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
|
| 530 |
+
CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
|
| 531 |
+
CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
|
| 532 |
+
CUDNN_LAYOUT_TYPE_COUNT = 4,
|
| 533 |
+
} cudnnBackendLayoutType_t;
|
| 534 |
+
|
| 535 |
+
typedef enum {
|
| 536 |
+
CUDNN_HEUR_MODE_INSTANT = 0,
|
| 537 |
+
CUDNN_HEUR_MODE_B = 1,
|
| 538 |
+
CUDNN_HEUR_MODE_FALLBACK = 2,
|
| 539 |
+
CUDNN_HEUR_MODE_A = 3,
|
| 540 |
+
CUDNN_HEUR_MODES_COUNT = 4,
|
| 541 |
+
} cudnnBackendHeurMode_t;
|
| 542 |
+
|
| 543 |
+
typedef enum {
|
| 544 |
+
CUDNN_TENSOR_REORDERING_NONE = 0,
|
| 545 |
+
CUDNN_TENSOR_REORDERING_INT8x32 = 1,
|
| 546 |
+
} cudnnBackendTensorReordering_t;
|
| 547 |
+
|
| 548 |
+
typedef enum {
|
| 549 |
+
CUDNN_ZERO_PAD = 0,
|
| 550 |
+
CUDNN_NEG_INF_PAD = 1,
|
| 551 |
+
CUDNN_EDGE_VAL_PAD = 2,
|
| 552 |
+
} cudnnPaddingMode_t;
|
| 553 |
+
|
| 554 |
+
typedef enum {
|
| 555 |
+
CUDNN_LAYER_NORM = 0,
|
| 556 |
+
CUDNN_INSTANCE_NORM = 1,
|
| 557 |
+
CUDNN_BATCH_NORM = 2,
|
| 558 |
+
CUDNN_GROUP_NORM = 3,
|
| 559 |
+
} cudnnBackendNormMode_t;
|
| 560 |
+
|
| 561 |
+
typedef enum {
|
| 562 |
+
CUDNN_NORM_FWD_INFERENCE = 0,
|
| 563 |
+
CUDNN_NORM_FWD_TRAINING = 1,
|
| 564 |
+
} cudnnBackendNormFwdPhase_t;
|
| 565 |
+
|
| 566 |
+
cudnnStatus_t CUDNNWINAPI
|
| 567 |
+
cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
|
| 568 |
+
|
| 569 |
+
cudnnStatus_t CUDNNWINAPI
|
| 570 |
+
cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
|
| 571 |
+
|
| 572 |
+
cudnnStatus_t CUDNNWINAPI
|
| 573 |
+
cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
|
| 574 |
+
|
| 575 |
+
cudnnStatus_t CUDNNWINAPI
|
| 576 |
+
cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
|
| 577 |
+
|
| 578 |
+
cudnnStatus_t CUDNNWINAPI
|
| 579 |
+
cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
|
| 580 |
+
cudnnBackendAttributeName_t attributeName,
|
| 581 |
+
cudnnBackendAttributeType_t attributeType,
|
| 582 |
+
int64_t elementCount,
|
| 583 |
+
const void *arrayOfElements);
|
| 584 |
+
|
| 585 |
+
cudnnStatus_t CUDNNWINAPI
|
| 586 |
+
cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
|
| 587 |
+
cudnnBackendAttributeName_t attributeName,
|
| 588 |
+
cudnnBackendAttributeType_t attributeType,
|
| 589 |
+
int64_t requestedElementCount,
|
| 590 |
+
int64_t *elementCount,
|
| 591 |
+
void *arrayOfElements);
|
| 592 |
+
|
| 593 |
+
cudnnStatus_t CUDNNWINAPI
|
| 594 |
+
cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
|
| 595 |
+
|
| 596 |
+
#if defined(__cplusplus)
|
| 597 |
+
}
|
| 598 |
+
#endif
|
| 599 |
+
|
| 600 |
+
#endif /* _CUDNN_BACKEND_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer.h
ADDED
|
@@ -0,0 +1,1183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*
|
| 51 |
+
* cudnn_ops_infer : cuDNN's basic definitions and inference operations.
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUDNN_OPS_INFER_H_)
|
| 55 |
+
#define CUDNN_OPS_INFER_H_
|
| 56 |
+
|
| 57 |
+
#include <cuda_runtime.h>
|
| 58 |
+
#include <stdint.h>
|
| 59 |
+
|
| 60 |
+
#include "cudnn_version.h"
|
| 61 |
+
|
| 62 |
+
/* These version numbers are autogenerated, do not edit manually. */
|
| 63 |
+
#define CUDNN_OPS_INFER_MAJOR 8
|
| 64 |
+
#define CUDNN_OPS_INFER_MINOR 7
|
| 65 |
+
#define CUDNN_OPS_INFER_PATCH 0
|
| 66 |
+
|
| 67 |
+
#if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \
|
| 68 |
+
(CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL)
|
| 69 |
+
#error Version mismatch in cuDNN OPS INFER!!!
|
| 70 |
+
#endif
|
| 71 |
+
|
| 72 |
+
#ifndef CUDNNWINAPI
|
| 73 |
+
#ifdef _WIN32
|
| 74 |
+
#define CUDNNWINAPI __stdcall
|
| 75 |
+
#else
|
| 76 |
+
#define CUDNNWINAPI
|
| 77 |
+
#endif
|
| 78 |
+
#endif
|
| 79 |
+
|
| 80 |
+
/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
|
| 81 |
+
#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
|
| 82 |
+
/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
|
| 83 |
+
#define CUDNN_DEPRECATED __attribute__((deprecated))
|
| 84 |
+
#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
|
| 85 |
+
/* Microsoft Visual C++ */
|
| 86 |
+
#define CUDNN_DEPRECATED __declspec(deprecated)
|
| 87 |
+
#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
|
| 88 |
+
/* C++14 compilers */
|
| 89 |
+
#define CUDNN_DEPRECATED [[deprecated]]
|
| 90 |
+
#else
|
| 91 |
+
/* No support for the deprecated attribute */
|
| 92 |
+
#define CUDNN_DEPRECATED
|
| 93 |
+
#endif
|
| 94 |
+
|
| 95 |
+
#if defined(__cplusplus)
|
| 96 |
+
extern "C" {
|
| 97 |
+
#endif
|
| 98 |
+
|
| 99 |
+
struct cudnnContext;
|
| 100 |
+
typedef struct cudnnContext *cudnnHandle_t;
|
| 101 |
+
|
| 102 |
+
size_t CUDNNWINAPI
|
| 103 |
+
cudnnGetVersion(void);
|
| 104 |
+
|
| 105 |
+
size_t CUDNNWINAPI
|
| 106 |
+
cudnnGetMaxDeviceVersion(void);
|
| 107 |
+
|
| 108 |
+
/* Returns CUDA Runtime version statically linked against cudnn */
|
| 109 |
+
size_t CUDNNWINAPI
|
| 110 |
+
cudnnGetCudartVersion(void);
|
| 111 |
+
|
| 112 |
+
/*
|
| 113 |
+
* CUDNN return codes
|
| 114 |
+
*/
|
| 115 |
+
typedef enum {
|
| 116 |
+
CUDNN_STATUS_SUCCESS = 0,
|
| 117 |
+
CUDNN_STATUS_NOT_INITIALIZED = 1,
|
| 118 |
+
CUDNN_STATUS_ALLOC_FAILED = 2,
|
| 119 |
+
CUDNN_STATUS_BAD_PARAM = 3,
|
| 120 |
+
CUDNN_STATUS_INTERNAL_ERROR = 4,
|
| 121 |
+
CUDNN_STATUS_INVALID_VALUE = 5,
|
| 122 |
+
CUDNN_STATUS_ARCH_MISMATCH = 6,
|
| 123 |
+
CUDNN_STATUS_MAPPING_ERROR = 7,
|
| 124 |
+
CUDNN_STATUS_EXECUTION_FAILED = 8,
|
| 125 |
+
CUDNN_STATUS_NOT_SUPPORTED = 9,
|
| 126 |
+
CUDNN_STATUS_LICENSE_ERROR = 10,
|
| 127 |
+
CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11,
|
| 128 |
+
CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12,
|
| 129 |
+
CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13,
|
| 130 |
+
CUDNN_STATUS_VERSION_MISMATCH = 14,
|
| 131 |
+
} cudnnStatus_t;
|
| 132 |
+
|
| 133 |
+
/* human-readable error messages */
|
| 134 |
+
const char *CUDNNWINAPI
|
| 135 |
+
cudnnGetErrorString(cudnnStatus_t status);
|
| 136 |
+
|
| 137 |
+
/* Forward definition in this version only */
|
| 138 |
+
typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t;
|
| 139 |
+
|
| 140 |
+
typedef enum {
|
| 141 |
+
CUDNN_ERRQUERY_RAWCODE = 0,
|
| 142 |
+
CUDNN_ERRQUERY_NONBLOCKING = 1,
|
| 143 |
+
CUDNN_ERRQUERY_BLOCKING = 2,
|
| 144 |
+
} cudnnErrQueryMode_t;
|
| 145 |
+
|
| 146 |
+
cudnnStatus_t CUDNNWINAPI
|
| 147 |
+
cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
|
| 148 |
+
|
| 149 |
+
#ifndef __LIBRARY_TYPES_H__
|
| 150 |
+
|
| 151 |
+
typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType;
|
| 152 |
+
|
| 153 |
+
#endif
|
| 154 |
+
|
| 155 |
+
cudnnStatus_t CUDNNWINAPI
|
| 156 |
+
cudnnGetProperty(libraryPropertyType type, int *value);
|
| 157 |
+
|
| 158 |
+
cudnnStatus_t CUDNNWINAPI
|
| 159 |
+
cudnnCreate(cudnnHandle_t *handle);
|
| 160 |
+
cudnnStatus_t CUDNNWINAPI
|
| 161 |
+
cudnnDestroy(cudnnHandle_t handle);
|
| 162 |
+
cudnnStatus_t CUDNNWINAPI
|
| 163 |
+
cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
|
| 164 |
+
cudnnStatus_t CUDNNWINAPI
|
| 165 |
+
cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
|
| 166 |
+
|
| 167 |
+
/* Data structures to represent Image/Filter and the Neural Network Layer */
|
| 168 |
+
typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
|
| 169 |
+
typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t;
|
| 170 |
+
typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t;
|
| 171 |
+
typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
|
| 172 |
+
typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t;
|
| 173 |
+
typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
|
| 174 |
+
typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t;
|
| 175 |
+
typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t;
|
| 176 |
+
typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
|
| 177 |
+
typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t;
|
| 178 |
+
/*
|
| 179 |
+
* CUDNN data type
|
| 180 |
+
*/
|
| 181 |
+
typedef enum {
|
| 182 |
+
CUDNN_DATA_FLOAT = 0,
|
| 183 |
+
CUDNN_DATA_DOUBLE = 1,
|
| 184 |
+
CUDNN_DATA_HALF = 2,
|
| 185 |
+
CUDNN_DATA_INT8 = 3,
|
| 186 |
+
CUDNN_DATA_INT32 = 4,
|
| 187 |
+
CUDNN_DATA_INT8x4 = 5,
|
| 188 |
+
CUDNN_DATA_UINT8 = 6,
|
| 189 |
+
CUDNN_DATA_UINT8x4 = 7,
|
| 190 |
+
CUDNN_DATA_INT8x32 = 8,
|
| 191 |
+
CUDNN_DATA_BFLOAT16 = 9,
|
| 192 |
+
CUDNN_DATA_INT64 = 10,
|
| 193 |
+
CUDNN_DATA_BOOLEAN = 11,
|
| 194 |
+
CUDNN_DATA_FP8_E4M3 = 12,
|
| 195 |
+
CUDNN_DATA_FP8_E5M2 = 13,
|
| 196 |
+
CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
|
| 197 |
+
} cudnnDataType_t;
|
| 198 |
+
|
| 199 |
+
/*
|
| 200 |
+
* CUDNN math type
|
| 201 |
+
*/
|
| 202 |
+
typedef enum {
|
| 203 |
+
CUDNN_DEFAULT_MATH = 0,
|
| 204 |
+
CUDNN_TENSOR_OP_MATH = 1,
|
| 205 |
+
CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
|
| 206 |
+
CUDNN_FMA_MATH = 3,
|
| 207 |
+
} cudnnMathType_t;
|
| 208 |
+
|
| 209 |
+
/*
|
| 210 |
+
* CUDNN propagate Nan
|
| 211 |
+
*/
|
| 212 |
+
typedef enum {
|
| 213 |
+
CUDNN_NOT_PROPAGATE_NAN = 0,
|
| 214 |
+
CUDNN_PROPAGATE_NAN = 1,
|
| 215 |
+
} cudnnNanPropagation_t;
|
| 216 |
+
|
| 217 |
+
/*
|
| 218 |
+
* CUDNN Determinism
|
| 219 |
+
*/
|
| 220 |
+
typedef enum {
|
| 221 |
+
CUDNN_NON_DETERMINISTIC = 0,
|
| 222 |
+
CUDNN_DETERMINISTIC = 1,
|
| 223 |
+
} cudnnDeterminism_t;
|
| 224 |
+
|
| 225 |
+
/* Maximum supported number of tensor dimensions */
|
| 226 |
+
#define CUDNN_DIM_MAX 8
|
| 227 |
+
|
| 228 |
+
/* Create an instance of a generic Tensor descriptor */
|
| 229 |
+
cudnnStatus_t CUDNNWINAPI
|
| 230 |
+
cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
|
| 231 |
+
|
| 232 |
+
typedef enum {
|
| 233 |
+
CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
|
| 234 |
+
CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
|
| 235 |
+
CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
|
| 236 |
+
} cudnnTensorFormat_t;
|
| 237 |
+
|
| 238 |
+
cudnnStatus_t CUDNNWINAPI
|
| 239 |
+
cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
|
| 240 |
+
cudnnTensorFormat_t format,
|
| 241 |
+
cudnnDataType_t dataType, /* image data type */
|
| 242 |
+
int n, /* number of inputs (batch size) */
|
| 243 |
+
int c, /* number of input feature maps */
|
| 244 |
+
int h, /* height of input section */
|
| 245 |
+
int w); /* width of input section */
|
| 246 |
+
|
| 247 |
+
cudnnStatus_t CUDNNWINAPI
|
| 248 |
+
cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
|
| 249 |
+
cudnnDataType_t dataType, /* image data type */
|
| 250 |
+
int n, /* number of inputs (batch size) */
|
| 251 |
+
int c, /* number of input feature maps */
|
| 252 |
+
int h, /* height of input section */
|
| 253 |
+
int w, /* width of input section */
|
| 254 |
+
int nStride,
|
| 255 |
+
int cStride,
|
| 256 |
+
int hStride,
|
| 257 |
+
int wStride);
|
| 258 |
+
|
| 259 |
+
cudnnStatus_t CUDNNWINAPI
|
| 260 |
+
cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
|
| 261 |
+
cudnnDataType_t *dataType, /* image data type */
|
| 262 |
+
int *n, /* number of inputs (batch size) */
|
| 263 |
+
int *c, /* number of input feature maps */
|
| 264 |
+
int *h, /* height of input section */
|
| 265 |
+
int *w, /* width of input section */
|
| 266 |
+
int *nStride,
|
| 267 |
+
int *cStride,
|
| 268 |
+
int *hStride,
|
| 269 |
+
int *wStride);
|
| 270 |
+
|
| 271 |
+
cudnnStatus_t CUDNNWINAPI
|
| 272 |
+
cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
|
| 273 |
+
cudnnDataType_t dataType,
|
| 274 |
+
int nbDims,
|
| 275 |
+
const int dimA[],
|
| 276 |
+
const int strideA[]);
|
| 277 |
+
|
| 278 |
+
cudnnStatus_t CUDNNWINAPI
|
| 279 |
+
cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
|
| 280 |
+
cudnnTensorFormat_t format,
|
| 281 |
+
cudnnDataType_t dataType,
|
| 282 |
+
int nbDims,
|
| 283 |
+
const int dimA[]);
|
| 284 |
+
|
| 285 |
+
cudnnStatus_t CUDNNWINAPI
|
| 286 |
+
cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
|
| 287 |
+
int nbDimsRequested,
|
| 288 |
+
cudnnDataType_t *dataType,
|
| 289 |
+
int *nbDims,
|
| 290 |
+
int dimA[],
|
| 291 |
+
int strideA[]);
|
| 292 |
+
|
| 293 |
+
cudnnStatus_t CUDNNWINAPI
|
| 294 |
+
cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
|
| 295 |
+
|
| 296 |
+
/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
|
| 297 |
+
|
| 298 |
+
1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
|
| 299 |
+
input_stride : c x h x h_stride
|
| 300 |
+
feature_stride : h x h_stride
|
| 301 |
+
h_stride : >= w ( h_stride = w if no padding)
|
| 302 |
+
w_stride : 1
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
2)Example of all images in row major with features maps interleaved
|
| 306 |
+
input_stride : c x h x h_stride
|
| 307 |
+
feature_stride : 1
|
| 308 |
+
h_stride : w x c
|
| 309 |
+
w_stride : c
|
| 310 |
+
|
| 311 |
+
3)Example of all images in column major order one batch of features after the other (with optional padding on column)
|
| 312 |
+
input_stride : c x w x w_stride
|
| 313 |
+
feature_stride : w x w_stride
|
| 314 |
+
h_stride : 1
|
| 315 |
+
w_stride : >= h
|
| 316 |
+
|
| 317 |
+
*/
|
| 318 |
+
|
| 319 |
+
/* Destroy an instance of Tensor4d descriptor */
|
| 320 |
+
cudnnStatus_t CUDNNWINAPI
|
| 321 |
+
cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
|
| 322 |
+
|
| 323 |
+
/* Fold/unfold transforms */
|
| 324 |
+
typedef enum {
|
| 325 |
+
CUDNN_TRANSFORM_FOLD = 0U,
|
| 326 |
+
CUDNN_TRANSFORM_UNFOLD = 1U,
|
| 327 |
+
} cudnnFoldingDirection_t;
|
| 328 |
+
|
| 329 |
+
/** Create a destination descriptor for cudnnTransformTensor */
|
| 330 |
+
cudnnStatus_t CUDNNWINAPI
|
| 331 |
+
cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
|
| 332 |
+
const cudnnTensorDescriptor_t srcDesc,
|
| 333 |
+
cudnnTensorDescriptor_t destDesc,
|
| 334 |
+
size_t *destSizeInBytes);
|
| 335 |
+
|
| 336 |
+
/** Create an empty tensor transform descriptor */
|
| 337 |
+
cudnnStatus_t CUDNNWINAPI
|
| 338 |
+
cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
|
| 339 |
+
|
| 340 |
+
/** Initialize a previously created tensor transform descriptor. */
|
| 341 |
+
cudnnStatus_t CUDNNWINAPI
|
| 342 |
+
cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
|
| 343 |
+
const uint32_t nbDims,
|
| 344 |
+
const cudnnTensorFormat_t destFormat,
|
| 345 |
+
const int32_t padBeforeA[],
|
| 346 |
+
const int32_t padAfterA[],
|
| 347 |
+
const uint32_t foldA[],
|
| 348 |
+
const cudnnFoldingDirection_t direction);
|
| 349 |
+
|
| 350 |
+
/**
|
| 351 |
+
* Retrieves the values stored in a previously initialized tensor transform
|
| 352 |
+
* descriptor.
|
| 353 |
+
*/
|
| 354 |
+
cudnnStatus_t CUDNNWINAPI
|
| 355 |
+
cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
|
| 356 |
+
uint32_t nbDimsRequested,
|
| 357 |
+
cudnnTensorFormat_t *destFormat,
|
| 358 |
+
int32_t padBeforeA[],
|
| 359 |
+
int32_t padAfterA[],
|
| 360 |
+
uint32_t foldA[],
|
| 361 |
+
cudnnFoldingDirection_t *direction);
|
| 362 |
+
|
| 363 |
+
/**
|
| 364 |
+
* Destroys a previously created tensor transform descriptor.
|
| 365 |
+
*/
|
| 366 |
+
cudnnStatus_t CUDNNWINAPI
|
| 367 |
+
cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
|
| 368 |
+
|
| 369 |
+
/* Tensor layout conversion helper (y = alpha * x + beta * y) */
|
| 370 |
+
cudnnStatus_t CUDNNWINAPI
|
| 371 |
+
cudnnTransformTensor(cudnnHandle_t handle,
|
| 372 |
+
const void *alpha,
|
| 373 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 374 |
+
const void *x,
|
| 375 |
+
const void *beta,
|
| 376 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 377 |
+
void *y);
|
| 378 |
+
|
| 379 |
+
cudnnStatus_t CUDNNWINAPI
|
| 380 |
+
cudnnTransformTensorEx(cudnnHandle_t handle,
|
| 381 |
+
const cudnnTensorTransformDescriptor_t transDesc,
|
| 382 |
+
const void *alpha,
|
| 383 |
+
const cudnnTensorDescriptor_t srcDesc,
|
| 384 |
+
const void *srcData,
|
| 385 |
+
const void *beta,
|
| 386 |
+
const cudnnTensorDescriptor_t destDesc,
|
| 387 |
+
void *destData);
|
| 388 |
+
|
| 389 |
+
/* Tensor Bias addition : C = alpha * A + beta * C */
|
| 390 |
+
cudnnStatus_t CUDNNWINAPI
|
| 391 |
+
cudnnAddTensor(cudnnHandle_t handle,
|
| 392 |
+
const void *alpha,
|
| 393 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 394 |
+
const void *A,
|
| 395 |
+
const void *beta,
|
| 396 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 397 |
+
void *C);
|
| 398 |
+
|
| 399 |
+
/*
|
| 400 |
+
* CUDNN OpTensor op type
|
| 401 |
+
*/
|
| 402 |
+
typedef enum {
|
| 403 |
+
CUDNN_OP_TENSOR_ADD = 0,
|
| 404 |
+
CUDNN_OP_TENSOR_MUL = 1,
|
| 405 |
+
CUDNN_OP_TENSOR_MIN = 2,
|
| 406 |
+
CUDNN_OP_TENSOR_MAX = 3,
|
| 407 |
+
CUDNN_OP_TENSOR_SQRT = 4,
|
| 408 |
+
CUDNN_OP_TENSOR_NOT = 5,
|
| 409 |
+
} cudnnOpTensorOp_t;
|
| 410 |
+
|
| 411 |
+
cudnnStatus_t CUDNNWINAPI
|
| 412 |
+
cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
|
| 413 |
+
|
| 414 |
+
cudnnStatus_t CUDNNWINAPI
|
| 415 |
+
cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
|
| 416 |
+
cudnnOpTensorOp_t opTensorOp,
|
| 417 |
+
cudnnDataType_t opTensorCompType,
|
| 418 |
+
cudnnNanPropagation_t opTensorNanOpt);
|
| 419 |
+
|
| 420 |
+
cudnnStatus_t CUDNNWINAPI
|
| 421 |
+
cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
|
| 422 |
+
cudnnOpTensorOp_t *opTensorOp,
|
| 423 |
+
cudnnDataType_t *opTensorCompType,
|
| 424 |
+
cudnnNanPropagation_t *opTensorNanOpt);
|
| 425 |
+
|
| 426 |
+
cudnnStatus_t CUDNNWINAPI
|
| 427 |
+
cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
|
| 428 |
+
|
| 429 |
+
/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
|
| 430 |
+
/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
|
| 431 |
+
cudnnStatus_t CUDNNWINAPI
|
| 432 |
+
cudnnOpTensor(cudnnHandle_t handle,
|
| 433 |
+
const cudnnOpTensorDescriptor_t opTensorDesc,
|
| 434 |
+
const void *alpha1,
|
| 435 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 436 |
+
const void *A,
|
| 437 |
+
const void *alpha2,
|
| 438 |
+
const cudnnTensorDescriptor_t bDesc,
|
| 439 |
+
const void *B,
|
| 440 |
+
const void *beta,
|
| 441 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 442 |
+
void *C);
|
| 443 |
+
|
| 444 |
+
/*
|
| 445 |
+
* CUDNN ReduceTensor op type
|
| 446 |
+
*/
|
| 447 |
+
typedef enum {
|
| 448 |
+
CUDNN_REDUCE_TENSOR_ADD = 0,
|
| 449 |
+
CUDNN_REDUCE_TENSOR_MUL = 1,
|
| 450 |
+
CUDNN_REDUCE_TENSOR_MIN = 2,
|
| 451 |
+
CUDNN_REDUCE_TENSOR_MAX = 3,
|
| 452 |
+
CUDNN_REDUCE_TENSOR_AMAX = 4,
|
| 453 |
+
CUDNN_REDUCE_TENSOR_AVG = 5,
|
| 454 |
+
CUDNN_REDUCE_TENSOR_NORM1 = 6,
|
| 455 |
+
CUDNN_REDUCE_TENSOR_NORM2 = 7,
|
| 456 |
+
CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
|
| 457 |
+
} cudnnReduceTensorOp_t;
|
| 458 |
+
|
| 459 |
+
/*
|
| 460 |
+
* CUDNN ReduceTensor indices type
|
| 461 |
+
*/
|
| 462 |
+
typedef enum {
|
| 463 |
+
CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
|
| 464 |
+
CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
|
| 465 |
+
} cudnnReduceTensorIndices_t;
|
| 466 |
+
|
| 467 |
+
/*
|
| 468 |
+
* CUDNN tensor indices type size (all unsigned)
|
| 469 |
+
* Currently not supported, default is 32 bit unsigned.
|
| 470 |
+
*/
|
| 471 |
+
typedef enum {
|
| 472 |
+
CUDNN_32BIT_INDICES = 0,
|
| 473 |
+
CUDNN_64BIT_INDICES = 1,
|
| 474 |
+
CUDNN_16BIT_INDICES = 2,
|
| 475 |
+
CUDNN_8BIT_INDICES = 3,
|
| 476 |
+
} cudnnIndicesType_t;
|
| 477 |
+
|
| 478 |
+
cudnnStatus_t CUDNNWINAPI
|
| 479 |
+
cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
|
| 480 |
+
|
| 481 |
+
cudnnStatus_t CUDNNWINAPI
|
| 482 |
+
cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 483 |
+
cudnnReduceTensorOp_t reduceTensorOp,
|
| 484 |
+
cudnnDataType_t reduceTensorCompType,
|
| 485 |
+
cudnnNanPropagation_t reduceTensorNanOpt,
|
| 486 |
+
cudnnReduceTensorIndices_t reduceTensorIndices,
|
| 487 |
+
cudnnIndicesType_t reduceTensorIndicesType);
|
| 488 |
+
|
| 489 |
+
cudnnStatus_t CUDNNWINAPI
|
| 490 |
+
cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 491 |
+
cudnnReduceTensorOp_t *reduceTensorOp,
|
| 492 |
+
cudnnDataType_t *reduceTensorCompType,
|
| 493 |
+
cudnnNanPropagation_t *reduceTensorNanOpt,
|
| 494 |
+
cudnnReduceTensorIndices_t *reduceTensorIndices,
|
| 495 |
+
cudnnIndicesType_t *reduceTensorIndicesType);
|
| 496 |
+
|
| 497 |
+
cudnnStatus_t CUDNNWINAPI
|
| 498 |
+
cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
|
| 499 |
+
|
| 500 |
+
/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
|
| 501 |
+
* output tensors */
|
| 502 |
+
cudnnStatus_t CUDNNWINAPI
|
| 503 |
+
cudnnGetReductionIndicesSize(cudnnHandle_t handle,
|
| 504 |
+
const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 505 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 506 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 507 |
+
size_t *sizeInBytes);
|
| 508 |
+
|
| 509 |
+
/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
|
| 510 |
+
* tensors */
|
| 511 |
+
cudnnStatus_t CUDNNWINAPI
|
| 512 |
+
cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
|
| 513 |
+
const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 514 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 515 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 516 |
+
size_t *sizeInBytes);
|
| 517 |
+
|
| 518 |
+
/* Tensor operation : C = reduce op( alpha * A ) + beta * C */
|
| 519 |
+
/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
|
| 520 |
+
/* The indices space is ignored for reduce ops other than min or max. */
|
| 521 |
+
cudnnStatus_t CUDNNWINAPI
|
| 522 |
+
cudnnReduceTensor(cudnnHandle_t handle,
|
| 523 |
+
const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 524 |
+
void *indices,
|
| 525 |
+
size_t indicesSizeInBytes,
|
| 526 |
+
void *workspace,
|
| 527 |
+
size_t workspaceSizeInBytes,
|
| 528 |
+
const void *alpha,
|
| 529 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 530 |
+
const void *A,
|
| 531 |
+
const void *beta,
|
| 532 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 533 |
+
void *C);
|
| 534 |
+
|
| 535 |
+
/* Set all values of a tensor to a given value : y[i] = value[0] */
|
| 536 |
+
cudnnStatus_t CUDNNWINAPI
|
| 537 |
+
cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
|
| 538 |
+
|
| 539 |
+
/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
|
| 540 |
+
cudnnStatus_t CUDNNWINAPI
|
| 541 |
+
cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
|
| 542 |
+
|
| 543 |
+
/* Create an instance of FilterStruct */
|
| 544 |
+
cudnnStatus_t CUDNNWINAPI
|
| 545 |
+
cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
|
| 546 |
+
|
| 547 |
+
cudnnStatus_t CUDNNWINAPI
|
| 548 |
+
cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
|
| 549 |
+
cudnnDataType_t dataType, /* image data type */
|
| 550 |
+
cudnnTensorFormat_t format,
|
| 551 |
+
int k, /* number of output feature maps */
|
| 552 |
+
int c, /* number of input feature maps */
|
| 553 |
+
int h, /* height of each input filter */
|
| 554 |
+
int w); /* width of each input filter */
|
| 555 |
+
|
| 556 |
+
cudnnStatus_t CUDNNWINAPI
|
| 557 |
+
cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
|
| 558 |
+
cudnnDataType_t *dataType, /* image data type */
|
| 559 |
+
cudnnTensorFormat_t *format,
|
| 560 |
+
int *k, /* number of output feature maps */
|
| 561 |
+
int *c, /* number of input feature maps */
|
| 562 |
+
int *h, /* height of each input filter */
|
| 563 |
+
int *w); /* width of each input filter */
|
| 564 |
+
|
| 565 |
+
cudnnStatus_t CUDNNWINAPI
|
| 566 |
+
cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
|
| 567 |
+
cudnnDataType_t dataType, /* image data type */
|
| 568 |
+
cudnnTensorFormat_t format,
|
| 569 |
+
int nbDims,
|
| 570 |
+
const int filterDimA[]);
|
| 571 |
+
|
| 572 |
+
cudnnStatus_t CUDNNWINAPI
|
| 573 |
+
cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
|
| 574 |
+
int nbDimsRequested,
|
| 575 |
+
cudnnDataType_t *dataType, /* image data type */
|
| 576 |
+
cudnnTensorFormat_t *format,
|
| 577 |
+
int *nbDims,
|
| 578 |
+
int filterDimA[]);
|
| 579 |
+
cudnnStatus_t CUDNNWINAPI
|
| 580 |
+
cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
|
| 581 |
+
|
| 582 |
+
cudnnStatus_t CUDNNWINAPI
|
| 583 |
+
cudnnTransformFilter(cudnnHandle_t handle,
|
| 584 |
+
const cudnnTensorTransformDescriptor_t transDesc,
|
| 585 |
+
const void *alpha,
|
| 586 |
+
const cudnnFilterDescriptor_t srcDesc,
|
| 587 |
+
const void *srcData,
|
| 588 |
+
const void *beta,
|
| 589 |
+
const cudnnFilterDescriptor_t destDesc,
|
| 590 |
+
void *destData);
|
| 591 |
+
|
| 592 |
+
cudnnStatus_t CUDNNWINAPI
|
| 593 |
+
cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
|
| 594 |
+
|
| 595 |
+
/*
|
| 596 |
+
* softmax algorithm
|
| 597 |
+
*/
|
| 598 |
+
typedef enum {
|
| 599 |
+
CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
|
| 600 |
+
CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
|
| 601 |
+
CUDNN_SOFTMAX_LOG = 2
|
| 602 |
+
} cudnnSoftmaxAlgorithm_t;
|
| 603 |
+
|
| 604 |
+
typedef enum {
|
| 605 |
+
CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
|
| 606 |
+
CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
|
| 607 |
+
} cudnnSoftmaxMode_t;
|
| 608 |
+
|
| 609 |
+
/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
|
| 610 |
+
|
| 611 |
+
/* Function to perform forward softmax */
|
| 612 |
+
cudnnStatus_t CUDNNWINAPI
|
| 613 |
+
cudnnSoftmaxForward(cudnnHandle_t handle,
|
| 614 |
+
cudnnSoftmaxAlgorithm_t algo,
|
| 615 |
+
cudnnSoftmaxMode_t mode,
|
| 616 |
+
const void *alpha,
|
| 617 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 618 |
+
const void *x,
|
| 619 |
+
const void *beta,
|
| 620 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 621 |
+
void *y);
|
| 622 |
+
|
| 623 |
+
/*
|
| 624 |
+
* pooling mode
|
| 625 |
+
*/
|
| 626 |
+
typedef enum {
|
| 627 |
+
CUDNN_POOLING_MAX = 0,
|
| 628 |
+
CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
|
| 629 |
+
CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
|
| 630 |
+
CUDNN_POOLING_MAX_DETERMINISTIC = 3
|
| 631 |
+
} cudnnPoolingMode_t;
|
| 632 |
+
|
| 633 |
+
/* Create an instance of pooling descriptor */
|
| 634 |
+
cudnnStatus_t CUDNNWINAPI
|
| 635 |
+
cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
|
| 636 |
+
|
| 637 |
+
cudnnStatus_t CUDNNWINAPI
|
| 638 |
+
cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
|
| 639 |
+
cudnnPoolingMode_t mode,
|
| 640 |
+
cudnnNanPropagation_t maxpoolingNanOpt,
|
| 641 |
+
int windowHeight,
|
| 642 |
+
int windowWidth,
|
| 643 |
+
int verticalPadding,
|
| 644 |
+
int horizontalPadding,
|
| 645 |
+
int verticalStride,
|
| 646 |
+
int horizontalStride);
|
| 647 |
+
|
| 648 |
+
cudnnStatus_t CUDNNWINAPI
|
| 649 |
+
cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
|
| 650 |
+
cudnnPoolingMode_t *mode,
|
| 651 |
+
cudnnNanPropagation_t *maxpoolingNanOpt,
|
| 652 |
+
int *windowHeight,
|
| 653 |
+
int *windowWidth,
|
| 654 |
+
int *verticalPadding,
|
| 655 |
+
int *horizontalPadding,
|
| 656 |
+
int *verticalStride,
|
| 657 |
+
int *horizontalStride);
|
| 658 |
+
|
| 659 |
+
cudnnStatus_t CUDNNWINAPI
|
| 660 |
+
cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
|
| 661 |
+
const cudnnPoolingMode_t mode,
|
| 662 |
+
const cudnnNanPropagation_t maxpoolingNanOpt,
|
| 663 |
+
int nbDims,
|
| 664 |
+
const int windowDimA[],
|
| 665 |
+
const int paddingA[],
|
| 666 |
+
const int strideA[]);
|
| 667 |
+
|
| 668 |
+
cudnnStatus_t CUDNNWINAPI
|
| 669 |
+
cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
|
| 670 |
+
int nbDimsRequested,
|
| 671 |
+
cudnnPoolingMode_t *mode,
|
| 672 |
+
cudnnNanPropagation_t *maxpoolingNanOpt,
|
| 673 |
+
int *nbDims,
|
| 674 |
+
int windowDimA[],
|
| 675 |
+
int paddingA[],
|
| 676 |
+
int strideA[]);
|
| 677 |
+
|
| 678 |
+
cudnnStatus_t CUDNNWINAPI
|
| 679 |
+
cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
|
| 680 |
+
const cudnnTensorDescriptor_t inputTensorDesc,
|
| 681 |
+
int nbDims,
|
| 682 |
+
int outputTensorDimA[]);
|
| 683 |
+
|
| 684 |
+
cudnnStatus_t CUDNNWINAPI
|
| 685 |
+
cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
|
| 686 |
+
const cudnnTensorDescriptor_t inputTensorDesc,
|
| 687 |
+
int *n,
|
| 688 |
+
int *c,
|
| 689 |
+
int *h,
|
| 690 |
+
int *w);
|
| 691 |
+
|
| 692 |
+
/* Destroy an instance of pooling descriptor */
|
| 693 |
+
cudnnStatus_t CUDNNWINAPI
|
| 694 |
+
cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
|
| 695 |
+
|
| 696 |
+
/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
|
| 697 |
+
|
| 698 |
+
/* Function to perform forward pooling */
|
| 699 |
+
cudnnStatus_t CUDNNWINAPI
|
| 700 |
+
cudnnPoolingForward(cudnnHandle_t handle,
|
| 701 |
+
const cudnnPoolingDescriptor_t poolingDesc,
|
| 702 |
+
const void *alpha,
|
| 703 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 704 |
+
const void *x,
|
| 705 |
+
const void *beta,
|
| 706 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 707 |
+
void *y);
|
| 708 |
+
|
| 709 |
+
/*
|
| 710 |
+
* activation mode
|
| 711 |
+
*/
|
| 712 |
+
typedef enum {
|
| 713 |
+
CUDNN_ACTIVATION_SIGMOID = 0,
|
| 714 |
+
CUDNN_ACTIVATION_RELU = 1,
|
| 715 |
+
CUDNN_ACTIVATION_TANH = 2,
|
| 716 |
+
CUDNN_ACTIVATION_CLIPPED_RELU = 3,
|
| 717 |
+
CUDNN_ACTIVATION_ELU = 4,
|
| 718 |
+
CUDNN_ACTIVATION_IDENTITY = 5,
|
| 719 |
+
CUDNN_ACTIVATION_SWISH = 6
|
| 720 |
+
} cudnnActivationMode_t;
|
| 721 |
+
|
| 722 |
+
/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
|
| 723 |
+
cudnnStatus_t CUDNNWINAPI
|
| 724 |
+
cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
|
| 725 |
+
|
| 726 |
+
cudnnStatus_t CUDNNWINAPI
|
| 727 |
+
cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
|
| 728 |
+
cudnnActivationMode_t mode,
|
| 729 |
+
cudnnNanPropagation_t reluNanOpt,
|
| 730 |
+
double coef); /* ceiling for clipped RELU, alpha for ELU */
|
| 731 |
+
|
| 732 |
+
cudnnStatus_t CUDNNWINAPI
|
| 733 |
+
cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
|
| 734 |
+
cudnnActivationMode_t *mode,
|
| 735 |
+
cudnnNanPropagation_t *reluNanOpt,
|
| 736 |
+
double *coef); /* ceiling for clipped RELU, alpha for ELU */
|
| 737 |
+
|
| 738 |
+
cudnnStatus_t CUDNNWINAPI
|
| 739 |
+
cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
|
| 740 |
+
|
| 741 |
+
cudnnStatus_t CUDNNWINAPI
|
| 742 |
+
cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
|
| 743 |
+
|
| 744 |
+
cudnnStatus_t CUDNNWINAPI
|
| 745 |
+
cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
|
| 746 |
+
|
| 747 |
+
/* Function to perform forward activation */
|
| 748 |
+
cudnnStatus_t CUDNNWINAPI
|
| 749 |
+
cudnnActivationForward(cudnnHandle_t handle,
|
| 750 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 751 |
+
const void *alpha,
|
| 752 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 753 |
+
const void *x,
|
| 754 |
+
const void *beta,
|
| 755 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 756 |
+
void *y);
|
| 757 |
+
|
| 758 |
+
/*
|
| 759 |
+
* Create an instance of LRN (Local Response Normalization) descriptor
|
| 760 |
+
* Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
|
| 761 |
+
*/
|
| 762 |
+
cudnnStatus_t CUDNNWINAPI
|
| 763 |
+
cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
|
| 764 |
+
|
| 765 |
+
#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
|
| 766 |
+
#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
|
| 767 |
+
#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
|
| 768 |
+
#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
|
| 769 |
+
|
| 770 |
+
/* LRN layer mode */
|
| 771 |
+
typedef enum {
|
| 772 |
+
CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
|
| 773 |
+
} cudnnLRNMode_t;
|
| 774 |
+
|
| 775 |
+
/*
|
| 776 |
+
* Uses a window [center-lookBehind, center+lookAhead], where
|
| 777 |
+
* lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
|
| 778 |
+
* Values of double parameters cast to tensor data type.
|
| 779 |
+
*/
|
| 780 |
+
cudnnStatus_t CUDNNWINAPI
|
| 781 |
+
cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
|
| 782 |
+
/*
|
| 783 |
+
* Retrieve the settings currently stored in an LRN layer descriptor
|
| 784 |
+
* Any of the provided pointers can be NULL (no corresponding value will be returned)
|
| 785 |
+
*/
|
| 786 |
+
cudnnStatus_t CUDNNWINAPI
|
| 787 |
+
cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
|
| 788 |
+
|
| 789 |
+
/* Destroy an instance of LRN descriptor */
|
| 790 |
+
cudnnStatus_t CUDNNWINAPI
|
| 791 |
+
cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
|
| 792 |
+
|
| 793 |
+
/* LRN functions: output = alpha * normalize(x) + beta * old_y */
|
| 794 |
+
|
| 795 |
+
/* LRN cross-channel forward computation. Double parameters cast to tensor data type */
|
| 796 |
+
cudnnStatus_t CUDNNWINAPI
|
| 797 |
+
cudnnLRNCrossChannelForward(cudnnHandle_t handle,
|
| 798 |
+
cudnnLRNDescriptor_t normDesc,
|
| 799 |
+
cudnnLRNMode_t lrnMode,
|
| 800 |
+
const void *alpha,
|
| 801 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 802 |
+
const void *x,
|
| 803 |
+
const void *beta,
|
| 804 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 805 |
+
void *y);
|
| 806 |
+
|
| 807 |
+
typedef enum {
|
| 808 |
+
CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
|
| 809 |
+
} cudnnDivNormMode_t;
|
| 810 |
+
|
| 811 |
+
/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
|
| 812 |
+
cudnnStatus_t CUDNNWINAPI
|
| 813 |
+
cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
|
| 814 |
+
cudnnLRNDescriptor_t normDesc,
|
| 815 |
+
cudnnDivNormMode_t mode,
|
| 816 |
+
const void *alpha,
|
| 817 |
+
const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
|
| 818 |
+
const void *x,
|
| 819 |
+
const void *means, /* if NULL, means are assumed to be zero */
|
| 820 |
+
void *temp,
|
| 821 |
+
void *temp2,
|
| 822 |
+
const void *beta,
|
| 823 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 824 |
+
void *y);
|
| 825 |
+
|
| 826 |
+
typedef enum {
|
| 827 |
+
/* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
|
| 828 |
+
CUDNN_BATCHNORM_PER_ACTIVATION = 0,
|
| 829 |
+
|
| 830 |
+
/* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
|
| 831 |
+
CUDNN_BATCHNORM_SPATIAL = 1,
|
| 832 |
+
|
| 833 |
+
/*
|
| 834 |
+
* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
|
| 835 |
+
* May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
|
| 836 |
+
*/
|
| 837 |
+
CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
|
| 838 |
+
} cudnnBatchNormMode_t;
|
| 839 |
+
|
| 840 |
+
#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
|
| 841 |
+
|
| 842 |
+
/*
|
| 843 |
+
* Derives a tensor descriptor from layer data descriptor for BatchNormalization
|
| 844 |
+
* scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
|
| 845 |
+
* bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
|
| 846 |
+
*/
|
| 847 |
+
cudnnStatus_t CUDNNWINAPI
|
| 848 |
+
cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
|
| 849 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 850 |
+
cudnnBatchNormMode_t mode);
|
| 851 |
+
|
| 852 |
+
typedef enum {
|
| 853 |
+
CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
|
| 854 |
+
CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
|
| 855 |
+
CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
|
| 856 |
+
} cudnnBatchNormOps_t;
|
| 857 |
+
|
| 858 |
+
/*
|
| 859 |
+
* Performs Batch Normalization during Inference:
|
| 860 |
+
* y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
|
| 861 |
+
* with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
|
| 862 |
+
* according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
|
| 863 |
+
* above for notes on function arguments.
|
| 864 |
+
*/
|
| 865 |
+
cudnnStatus_t CUDNNWINAPI
|
| 866 |
+
cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
|
| 867 |
+
cudnnBatchNormMode_t mode,
|
| 868 |
+
const void *alpha, /* alpha[0] = result blend factor */
|
| 869 |
+
const void *beta, /* beta[0] = dest layer blend factor */
|
| 870 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 871 |
+
const void *x, /* NxCxHxW */
|
| 872 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 873 |
+
void *y, /* NxCxHxW */
|
| 874 |
+
const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
|
| 875 |
+
const void *bnScale,
|
| 876 |
+
const void *bnBias,
|
| 877 |
+
const void *estimatedMean,
|
| 878 |
+
const void *estimatedVariance,
|
| 879 |
+
double epsilon);
|
| 880 |
+
|
| 881 |
+
typedef enum {
|
| 882 |
+
/* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
|
| 883 |
+
CUDNN_NORM_PER_ACTIVATION = 0,
|
| 884 |
+
|
| 885 |
+
/* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
|
| 886 |
+
CUDNN_NORM_PER_CHANNEL = 1,
|
| 887 |
+
} cudnnNormMode_t;
|
| 888 |
+
|
| 889 |
+
typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t;
|
| 890 |
+
|
| 891 |
+
/*
|
| 892 |
+
* Derives a tensor descriptor from layer data descriptor for Normalization
|
| 893 |
+
* scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
|
| 894 |
+
* normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
|
| 895 |
+
*/
|
| 896 |
+
cudnnStatus_t CUDNNWINAPI
|
| 897 |
+
cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
|
| 898 |
+
cudnnTensorDescriptor_t derivedNormMeanVarDesc,
|
| 899 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 900 |
+
cudnnNormMode_t mode,
|
| 901 |
+
int groupCnt); /* Place hold for future work, should be set to 1 now*/
|
| 902 |
+
|
| 903 |
+
typedef enum {
|
| 904 |
+
CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
|
| 905 |
+
CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
|
| 906 |
+
CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
|
| 907 |
+
} cudnnNormOps_t;
|
| 908 |
+
|
| 909 |
+
/*
|
| 910 |
+
* Performs Normalization during Inference:
|
| 911 |
+
* y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
|
| 912 |
+
* with normScale, normBias, runningMean, runningInvVariance tensors indexed
|
| 913 |
+
* according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
|
| 914 |
+
* above for notes on function arguments.
|
| 915 |
+
*/
|
| 916 |
+
cudnnStatus_t CUDNNWINAPI
|
| 917 |
+
cudnnNormalizationForwardInference(cudnnHandle_t handle,
|
| 918 |
+
cudnnNormMode_t mode,
|
| 919 |
+
cudnnNormOps_t normOps,
|
| 920 |
+
cudnnNormAlgo_t algo,
|
| 921 |
+
const void *alpha, /* alpha[0] = result blend factor */
|
| 922 |
+
const void *beta, /* beta[0] = dest layer blend factor */
|
| 923 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 924 |
+
const void *x, /* NxCxHxW */
|
| 925 |
+
const cudnnTensorDescriptor_t normScaleBiasDesc,
|
| 926 |
+
const void *normScale,
|
| 927 |
+
const void *normBias,
|
| 928 |
+
const cudnnTensorDescriptor_t normMeanVarDesc,
|
| 929 |
+
const void *estimatedMean,
|
| 930 |
+
const void *estimatedVariance,
|
| 931 |
+
const cudnnTensorDescriptor_t zDesc,
|
| 932 |
+
const void *z,
|
| 933 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 934 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 935 |
+
void *y, /* NxCxHxW */
|
| 936 |
+
double epsilon,
|
| 937 |
+
int groupCnt); /* Place hold for future work*/
|
| 938 |
+
|
| 939 |
+
/* APIs for spatial transformer network*/
|
| 940 |
+
typedef enum {
|
| 941 |
+
CUDNN_SAMPLER_BILINEAR = 0,
|
| 942 |
+
} cudnnSamplerType_t;
|
| 943 |
+
|
| 944 |
+
cudnnStatus_t CUDNNWINAPI
|
| 945 |
+
cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
|
| 946 |
+
|
| 947 |
+
cudnnStatus_t CUDNNWINAPI
|
| 948 |
+
cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
|
| 949 |
+
cudnnSamplerType_t samplerType,
|
| 950 |
+
cudnnDataType_t dataType,
|
| 951 |
+
const int nbDims,
|
| 952 |
+
const int dimA[]);
|
| 953 |
+
|
| 954 |
+
cudnnStatus_t CUDNNWINAPI
|
| 955 |
+
cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
|
| 956 |
+
|
| 957 |
+
cudnnStatus_t CUDNNWINAPI
|
| 958 |
+
cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
|
| 959 |
+
const cudnnSpatialTransformerDescriptor_t stDesc,
|
| 960 |
+
const void *theta,
|
| 961 |
+
void *grid);
|
| 962 |
+
|
| 963 |
+
cudnnStatus_t CUDNNWINAPI
|
| 964 |
+
cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
|
| 965 |
+
cudnnSpatialTransformerDescriptor_t stDesc,
|
| 966 |
+
const void *alpha,
|
| 967 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 968 |
+
const void *x,
|
| 969 |
+
const void *grid,
|
| 970 |
+
const void *beta,
|
| 971 |
+
cudnnTensorDescriptor_t yDesc,
|
| 972 |
+
void *y);
|
| 973 |
+
|
| 974 |
+
typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
|
| 975 |
+
|
| 976 |
+
cudnnStatus_t CUDNNWINAPI
|
| 977 |
+
cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
|
| 978 |
+
|
| 979 |
+
cudnnStatus_t CUDNNWINAPI
|
| 980 |
+
cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
|
| 981 |
+
|
| 982 |
+
/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
|
| 983 |
+
cudnnStatus_t CUDNNWINAPI
|
| 984 |
+
cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
|
| 985 |
+
|
| 986 |
+
/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
|
| 987 |
+
cudnnStatus_t CUDNNWINAPI
|
| 988 |
+
cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
|
| 989 |
+
|
| 990 |
+
cudnnStatus_t CUDNNWINAPI
|
| 991 |
+
cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
|
| 992 |
+
cudnnHandle_t handle,
|
| 993 |
+
float dropout,
|
| 994 |
+
void *states,
|
| 995 |
+
size_t stateSizeInBytes,
|
| 996 |
+
unsigned long long seed);
|
| 997 |
+
|
| 998 |
+
/* Restores the dropout descriptor to a previously saved-off state */
|
| 999 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1000 |
+
cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
|
| 1001 |
+
cudnnHandle_t handle,
|
| 1002 |
+
float dropout,
|
| 1003 |
+
void *states,
|
| 1004 |
+
size_t stateSizeInBytes,
|
| 1005 |
+
unsigned long long seed);
|
| 1006 |
+
|
| 1007 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1008 |
+
cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
|
| 1009 |
+
cudnnHandle_t handle,
|
| 1010 |
+
float *dropout,
|
| 1011 |
+
void **states,
|
| 1012 |
+
unsigned long long *seed);
|
| 1013 |
+
|
| 1014 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1015 |
+
cudnnDropoutForward(cudnnHandle_t handle,
|
| 1016 |
+
const cudnnDropoutDescriptor_t dropoutDesc,
|
| 1017 |
+
const cudnnTensorDescriptor_t xdesc,
|
| 1018 |
+
const void *x,
|
| 1019 |
+
const cudnnTensorDescriptor_t ydesc,
|
| 1020 |
+
void *y,
|
| 1021 |
+
void *reserveSpace,
|
| 1022 |
+
size_t reserveSpaceSizeInBytes);
|
| 1023 |
+
|
| 1024 |
+
/* TODO: remove */
|
| 1025 |
+
|
| 1026 |
+
typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t;
|
| 1027 |
+
typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t;
|
| 1028 |
+
|
| 1029 |
+
/* TODO: move these enums out to the appropriate submodule */
|
| 1030 |
+
typedef enum {
|
| 1031 |
+
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
|
| 1032 |
+
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
|
| 1033 |
+
CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
|
| 1034 |
+
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
|
| 1035 |
+
CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
|
| 1036 |
+
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
|
| 1037 |
+
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
|
| 1038 |
+
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
|
| 1039 |
+
CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
|
| 1040 |
+
} cudnnConvolutionFwdAlgo_t;
|
| 1041 |
+
|
| 1042 |
+
typedef enum {
|
| 1043 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
|
| 1044 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
|
| 1045 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
|
| 1046 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
|
| 1047 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
|
| 1048 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
|
| 1049 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
|
| 1050 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
|
| 1051 |
+
} cudnnConvolutionBwdFilterAlgo_t;
|
| 1052 |
+
|
| 1053 |
+
typedef enum {
|
| 1054 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
|
| 1055 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
|
| 1056 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
|
| 1057 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
|
| 1058 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
|
| 1059 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
|
| 1060 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
|
| 1061 |
+
} cudnnConvolutionBwdDataAlgo_t;
|
| 1062 |
+
|
| 1063 |
+
typedef enum {
|
| 1064 |
+
CUDNN_RNN_ALGO_STANDARD = 0,
|
| 1065 |
+
CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
|
| 1066 |
+
CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
|
| 1067 |
+
CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
|
| 1068 |
+
CUDNN_RNN_ALGO_COUNT = 4,
|
| 1069 |
+
} cudnnRNNAlgo_t;
|
| 1070 |
+
|
| 1071 |
+
typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
|
| 1072 |
+
|
| 1073 |
+
/* TODO: remove */
|
| 1074 |
+
typedef struct cudnnAlgorithmUnionStruct {
|
| 1075 |
+
union Algorithm {
|
| 1076 |
+
cudnnConvolutionFwdAlgo_t convFwdAlgo;
|
| 1077 |
+
cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo;
|
| 1078 |
+
cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo;
|
| 1079 |
+
cudnnRNNAlgo_t RNNAlgo;
|
| 1080 |
+
cudnnCTCLossAlgo_t CTCLossAlgo;
|
| 1081 |
+
} algo;
|
| 1082 |
+
} cudnnAlgorithm_t;
|
| 1083 |
+
|
| 1084 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1085 |
+
cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc);
|
| 1086 |
+
|
| 1087 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1088 |
+
cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm);
|
| 1089 |
+
|
| 1090 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1091 |
+
cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm);
|
| 1092 |
+
|
| 1093 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1094 |
+
cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest);
|
| 1095 |
+
|
| 1096 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1097 |
+
cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc);
|
| 1098 |
+
|
| 1099 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1100 |
+
cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate);
|
| 1101 |
+
|
| 1102 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1103 |
+
cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
|
| 1104 |
+
cudnnAlgorithmDescriptor_t algoDesc,
|
| 1105 |
+
cudnnStatus_t status,
|
| 1106 |
+
float time,
|
| 1107 |
+
size_t memory);
|
| 1108 |
+
|
| 1109 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1110 |
+
cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
|
| 1111 |
+
cudnnAlgorithmDescriptor_t *algoDesc,
|
| 1112 |
+
cudnnStatus_t *status,
|
| 1113 |
+
float *time,
|
| 1114 |
+
size_t *memory);
|
| 1115 |
+
|
| 1116 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1117 |
+
cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy);
|
| 1118 |
+
|
| 1119 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1120 |
+
cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes);
|
| 1121 |
+
|
| 1122 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1123 |
+
cudnnSaveAlgorithm(cudnnHandle_t handle,
|
| 1124 |
+
cudnnAlgorithmDescriptor_t algoDesc,
|
| 1125 |
+
void *algoSpace,
|
| 1126 |
+
size_t algoSpaceSizeInBytes);
|
| 1127 |
+
|
| 1128 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1129 |
+
cudnnRestoreAlgorithm(cudnnHandle_t handle,
|
| 1130 |
+
void *algoSpace,
|
| 1131 |
+
size_t algoSpaceSizeInBytes,
|
| 1132 |
+
cudnnAlgorithmDescriptor_t algoDesc);
|
| 1133 |
+
|
| 1134 |
+
typedef enum {
|
| 1135 |
+
CUDNN_SEV_FATAL = 0,
|
| 1136 |
+
CUDNN_SEV_ERROR = 1,
|
| 1137 |
+
CUDNN_SEV_WARNING = 2,
|
| 1138 |
+
CUDNN_SEV_INFO = 3,
|
| 1139 |
+
} cudnnSeverity_t;
|
| 1140 |
+
|
| 1141 |
+
/* Message masks to be used with cudnnSetCallback() */
|
| 1142 |
+
#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
|
| 1143 |
+
#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
|
| 1144 |
+
#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
|
| 1145 |
+
|
| 1146 |
+
/* struct containing useful informaiton for each API call */
|
| 1147 |
+
typedef struct cudnnDebugStruct {
|
| 1148 |
+
unsigned cudnn_version;
|
| 1149 |
+
cudnnStatus_t cudnnStatus;
|
| 1150 |
+
unsigned time_sec; /* epoch time in seconds */
|
| 1151 |
+
unsigned time_usec; /* microseconds part of epoch time */
|
| 1152 |
+
unsigned time_delta; /* time since start in seconds */
|
| 1153 |
+
cudnnHandle_t handle; /* cudnn handle */
|
| 1154 |
+
cudaStream_t stream; /* cuda stream ID */
|
| 1155 |
+
unsigned long long pid; /* process ID */
|
| 1156 |
+
unsigned long long tid; /* thread ID */
|
| 1157 |
+
int cudaDeviceId; /* CUDA device ID */
|
| 1158 |
+
int reserved[15]; /* reserved for future use */
|
| 1159 |
+
} cudnnDebug_t;
|
| 1160 |
+
|
| 1161 |
+
typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
|
| 1162 |
+
|
| 1163 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1164 |
+
cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
|
| 1165 |
+
|
| 1166 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1167 |
+
cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
|
| 1168 |
+
|
| 1169 |
+
/*
|
| 1170 |
+
* \brief Cross-library version checker.
|
| 1171 |
+
* This function is implemented differently in each sub-library. Each sublib
|
| 1172 |
+
* checks whether its own version matches that of its dependencies.
|
| 1173 |
+
* \returns CUDNN_STATUS_SUCCESS if the version check passes,
|
| 1174 |
+
* CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
|
| 1175 |
+
*/
|
| 1176 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1177 |
+
cudnnOpsInferVersionCheck(void);
|
| 1178 |
+
|
| 1179 |
+
#if defined(__cplusplus)
|
| 1180 |
+
}
|
| 1181 |
+
#endif
|
| 1182 |
+
|
| 1183 |
+
#endif /* CUDNN_OPS_INFER_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_infer_v8.h
ADDED
|
@@ -0,0 +1,1183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*
|
| 51 |
+
* cudnn_ops_infer : cuDNN's basic definitions and inference operations.
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUDNN_OPS_INFER_H_)
|
| 55 |
+
#define CUDNN_OPS_INFER_H_
|
| 56 |
+
|
| 57 |
+
#include <cuda_runtime.h>
|
| 58 |
+
#include <stdint.h>
|
| 59 |
+
|
| 60 |
+
#include "cudnn_version.h"
|
| 61 |
+
|
| 62 |
+
/* These version numbers are autogenerated, do not edit manually. */
|
| 63 |
+
#define CUDNN_OPS_INFER_MAJOR 8
|
| 64 |
+
#define CUDNN_OPS_INFER_MINOR 7
|
| 65 |
+
#define CUDNN_OPS_INFER_PATCH 0
|
| 66 |
+
|
| 67 |
+
#if (CUDNN_OPS_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_INFER_MINOR != CUDNN_MINOR) || \
|
| 68 |
+
(CUDNN_OPS_INFER_PATCH != CUDNN_PATCHLEVEL)
|
| 69 |
+
#error Version mismatch in cuDNN OPS INFER!!!
|
| 70 |
+
#endif
|
| 71 |
+
|
| 72 |
+
#ifndef CUDNNWINAPI
|
| 73 |
+
#ifdef _WIN32
|
| 74 |
+
#define CUDNNWINAPI __stdcall
|
| 75 |
+
#else
|
| 76 |
+
#define CUDNNWINAPI
|
| 77 |
+
#endif
|
| 78 |
+
#endif
|
| 79 |
+
|
| 80 |
+
/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
|
| 81 |
+
#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
|
| 82 |
+
/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
|
| 83 |
+
#define CUDNN_DEPRECATED __attribute__((deprecated))
|
| 84 |
+
#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
|
| 85 |
+
/* Microsoft Visual C++ */
|
| 86 |
+
#define CUDNN_DEPRECATED __declspec(deprecated)
|
| 87 |
+
#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
|
| 88 |
+
/* C++14 compilers */
|
| 89 |
+
#define CUDNN_DEPRECATED [[deprecated]]
|
| 90 |
+
#else
|
| 91 |
+
/* No support for the deprecated attribute */
|
| 92 |
+
#define CUDNN_DEPRECATED
|
| 93 |
+
#endif
|
| 94 |
+
|
| 95 |
+
#if defined(__cplusplus)
|
| 96 |
+
extern "C" {
|
| 97 |
+
#endif
|
| 98 |
+
|
| 99 |
+
struct cudnnContext;
|
| 100 |
+
typedef struct cudnnContext *cudnnHandle_t;
|
| 101 |
+
|
| 102 |
+
size_t CUDNNWINAPI
|
| 103 |
+
cudnnGetVersion(void);
|
| 104 |
+
|
| 105 |
+
size_t CUDNNWINAPI
|
| 106 |
+
cudnnGetMaxDeviceVersion(void);
|
| 107 |
+
|
| 108 |
+
/* Returns CUDA Runtime version statically linked against cudnn */
|
| 109 |
+
size_t CUDNNWINAPI
|
| 110 |
+
cudnnGetCudartVersion(void);
|
| 111 |
+
|
| 112 |
+
/*
|
| 113 |
+
* CUDNN return codes
|
| 114 |
+
*/
|
| 115 |
+
typedef enum {
|
| 116 |
+
CUDNN_STATUS_SUCCESS = 0,
|
| 117 |
+
CUDNN_STATUS_NOT_INITIALIZED = 1,
|
| 118 |
+
CUDNN_STATUS_ALLOC_FAILED = 2,
|
| 119 |
+
CUDNN_STATUS_BAD_PARAM = 3,
|
| 120 |
+
CUDNN_STATUS_INTERNAL_ERROR = 4,
|
| 121 |
+
CUDNN_STATUS_INVALID_VALUE = 5,
|
| 122 |
+
CUDNN_STATUS_ARCH_MISMATCH = 6,
|
| 123 |
+
CUDNN_STATUS_MAPPING_ERROR = 7,
|
| 124 |
+
CUDNN_STATUS_EXECUTION_FAILED = 8,
|
| 125 |
+
CUDNN_STATUS_NOT_SUPPORTED = 9,
|
| 126 |
+
CUDNN_STATUS_LICENSE_ERROR = 10,
|
| 127 |
+
CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11,
|
| 128 |
+
CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12,
|
| 129 |
+
CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13,
|
| 130 |
+
CUDNN_STATUS_VERSION_MISMATCH = 14,
|
| 131 |
+
} cudnnStatus_t;
|
| 132 |
+
|
| 133 |
+
/* human-readable error messages */
|
| 134 |
+
const char *CUDNNWINAPI
|
| 135 |
+
cudnnGetErrorString(cudnnStatus_t status);
|
| 136 |
+
|
| 137 |
+
/* Forward definition in this version only */
|
| 138 |
+
typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t;
|
| 139 |
+
|
| 140 |
+
typedef enum {
|
| 141 |
+
CUDNN_ERRQUERY_RAWCODE = 0,
|
| 142 |
+
CUDNN_ERRQUERY_NONBLOCKING = 1,
|
| 143 |
+
CUDNN_ERRQUERY_BLOCKING = 2,
|
| 144 |
+
} cudnnErrQueryMode_t;
|
| 145 |
+
|
| 146 |
+
cudnnStatus_t CUDNNWINAPI
|
| 147 |
+
cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
|
| 148 |
+
|
| 149 |
+
#ifndef __LIBRARY_TYPES_H__
|
| 150 |
+
|
| 151 |
+
typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType;
|
| 152 |
+
|
| 153 |
+
#endif
|
| 154 |
+
|
| 155 |
+
cudnnStatus_t CUDNNWINAPI
|
| 156 |
+
cudnnGetProperty(libraryPropertyType type, int *value);
|
| 157 |
+
|
| 158 |
+
cudnnStatus_t CUDNNWINAPI
|
| 159 |
+
cudnnCreate(cudnnHandle_t *handle);
|
| 160 |
+
cudnnStatus_t CUDNNWINAPI
|
| 161 |
+
cudnnDestroy(cudnnHandle_t handle);
|
| 162 |
+
cudnnStatus_t CUDNNWINAPI
|
| 163 |
+
cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
|
| 164 |
+
cudnnStatus_t CUDNNWINAPI
|
| 165 |
+
cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
|
| 166 |
+
|
| 167 |
+
/* Data structures to represent Image/Filter and the Neural Network Layer */
typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t;
typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t;
typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t;
typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t;
typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t;
typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t;
|
| 178 |
+
/*
 * CUDNN data type
 */
typedef enum {
    CUDNN_DATA_FLOAT              = 0,
    CUDNN_DATA_DOUBLE             = 1,
    CUDNN_DATA_HALF               = 2,
    CUDNN_DATA_INT8               = 3,
    CUDNN_DATA_INT32              = 4,
    CUDNN_DATA_INT8x4             = 5,
    CUDNN_DATA_UINT8              = 6,
    CUDNN_DATA_UINT8x4            = 7,
    CUDNN_DATA_INT8x32            = 8,
    CUDNN_DATA_BFLOAT16           = 9,
    CUDNN_DATA_INT64              = 10,
    CUDNN_DATA_BOOLEAN            = 11,
    CUDNN_DATA_FP8_E4M3           = 12,
    CUDNN_DATA_FP8_E5M2           = 13,
    CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
} cudnnDataType_t;
|
| 198 |
+
|
| 199 |
+
/*
 * CUDNN math type
 */
typedef enum {
    CUDNN_DEFAULT_MATH                    = 0,
    CUDNN_TENSOR_OP_MATH                  = 1,
    CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
    CUDNN_FMA_MATH                        = 3,
} cudnnMathType_t;

/*
 * CUDNN propagate Nan
 */
typedef enum {
    CUDNN_NOT_PROPAGATE_NAN = 0,
    CUDNN_PROPAGATE_NAN     = 1,
} cudnnNanPropagation_t;

/*
 * CUDNN Determinism
 */
typedef enum {
    CUDNN_NON_DETERMINISTIC = 0,
    CUDNN_DETERMINISTIC     = 1,
} cudnnDeterminism_t;

/* Maximum supported number of tensor dimensions */
#define CUDNN_DIM_MAX 8
|
| 227 |
+
|
| 228 |
+
/* Create an instance of a generic Tensor descriptor */
|
| 229 |
+
cudnnStatus_t CUDNNWINAPI
|
| 230 |
+
cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
|
| 231 |
+
|
| 232 |
+
typedef enum {
|
| 233 |
+
CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
|
| 234 |
+
CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
|
| 235 |
+
CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
|
| 236 |
+
} cudnnTensorFormat_t;
|
| 237 |
+
|
| 238 |
+
cudnnStatus_t CUDNNWINAPI
|
| 239 |
+
cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
|
| 240 |
+
cudnnTensorFormat_t format,
|
| 241 |
+
cudnnDataType_t dataType, /* image data type */
|
| 242 |
+
int n, /* number of inputs (batch size) */
|
| 243 |
+
int c, /* number of input feature maps */
|
| 244 |
+
int h, /* height of input section */
|
| 245 |
+
int w); /* width of input section */
|
| 246 |
+
|
| 247 |
+
cudnnStatus_t CUDNNWINAPI
|
| 248 |
+
cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
|
| 249 |
+
cudnnDataType_t dataType, /* image data type */
|
| 250 |
+
int n, /* number of inputs (batch size) */
|
| 251 |
+
int c, /* number of input feature maps */
|
| 252 |
+
int h, /* height of input section */
|
| 253 |
+
int w, /* width of input section */
|
| 254 |
+
int nStride,
|
| 255 |
+
int cStride,
|
| 256 |
+
int hStride,
|
| 257 |
+
int wStride);
|
| 258 |
+
|
| 259 |
+
cudnnStatus_t CUDNNWINAPI
|
| 260 |
+
cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
|
| 261 |
+
cudnnDataType_t *dataType, /* image data type */
|
| 262 |
+
int *n, /* number of inputs (batch size) */
|
| 263 |
+
int *c, /* number of input feature maps */
|
| 264 |
+
int *h, /* height of input section */
|
| 265 |
+
int *w, /* width of input section */
|
| 266 |
+
int *nStride,
|
| 267 |
+
int *cStride,
|
| 268 |
+
int *hStride,
|
| 269 |
+
int *wStride);
|
| 270 |
+
|
| 271 |
+
cudnnStatus_t CUDNNWINAPI
|
| 272 |
+
cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
|
| 273 |
+
cudnnDataType_t dataType,
|
| 274 |
+
int nbDims,
|
| 275 |
+
const int dimA[],
|
| 276 |
+
const int strideA[]);
|
| 277 |
+
|
| 278 |
+
cudnnStatus_t CUDNNWINAPI
|
| 279 |
+
cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
|
| 280 |
+
cudnnTensorFormat_t format,
|
| 281 |
+
cudnnDataType_t dataType,
|
| 282 |
+
int nbDims,
|
| 283 |
+
const int dimA[]);
|
| 284 |
+
|
| 285 |
+
cudnnStatus_t CUDNNWINAPI
|
| 286 |
+
cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
|
| 287 |
+
int nbDimsRequested,
|
| 288 |
+
cudnnDataType_t *dataType,
|
| 289 |
+
int *nbDims,
|
| 290 |
+
int dimA[],
|
| 291 |
+
int strideA[]);
|
| 292 |
+
|
| 293 |
+
cudnnStatus_t CUDNNWINAPI
|
| 294 |
+
cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
|
| 295 |
+
|
| 296 |
+
/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
|
| 297 |
+
|
| 298 |
+
1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
|
| 299 |
+
input_stride : c x h x h_stride
|
| 300 |
+
feature_stride : h x h_stride
|
| 301 |
+
h_stride : >= w ( h_stride = w if no padding)
|
| 302 |
+
w_stride : 1
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
2)Example of all images in row major with features maps interleaved
|
| 306 |
+
input_stride : c x h x h_stride
|
| 307 |
+
feature_stride : 1
|
| 308 |
+
h_stride : w x c
|
| 309 |
+
w_stride : c
|
| 310 |
+
|
| 311 |
+
3)Example of all images in column major order one batch of features after the other (with optional padding on column)
|
| 312 |
+
input_stride : c x w x w_stride
|
| 313 |
+
feature_stride : w x w_stride
|
| 314 |
+
h_stride : 1
|
| 315 |
+
w_stride : >= h
|
| 316 |
+
|
| 317 |
+
*/
|
| 318 |
+
|
| 319 |
+
/* Destroy an instance of Tensor4d descriptor */
|
| 320 |
+
cudnnStatus_t CUDNNWINAPI
|
| 321 |
+
cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
|
| 322 |
+
|
| 323 |
+
/* Fold/unfold transforms */
typedef enum {
    CUDNN_TRANSFORM_FOLD   = 0U,
    CUDNN_TRANSFORM_UNFOLD = 1U,
} cudnnFoldingDirection_t;
|
| 328 |
+
|
| 329 |
+
/** Create a destination descriptor for cudnnTransformTensor */
|
| 330 |
+
cudnnStatus_t CUDNNWINAPI
|
| 331 |
+
cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
|
| 332 |
+
const cudnnTensorDescriptor_t srcDesc,
|
| 333 |
+
cudnnTensorDescriptor_t destDesc,
|
| 334 |
+
size_t *destSizeInBytes);
|
| 335 |
+
|
| 336 |
+
/** Create an empty tensor transform descriptor */
|
| 337 |
+
cudnnStatus_t CUDNNWINAPI
|
| 338 |
+
cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
|
| 339 |
+
|
| 340 |
+
/** Initialize a previously created tensor transform descriptor. */
|
| 341 |
+
cudnnStatus_t CUDNNWINAPI
|
| 342 |
+
cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
|
| 343 |
+
const uint32_t nbDims,
|
| 344 |
+
const cudnnTensorFormat_t destFormat,
|
| 345 |
+
const int32_t padBeforeA[],
|
| 346 |
+
const int32_t padAfterA[],
|
| 347 |
+
const uint32_t foldA[],
|
| 348 |
+
const cudnnFoldingDirection_t direction);
|
| 349 |
+
|
| 350 |
+
/**
|
| 351 |
+
* Retrieves the values stored in a previously initialized tensor transform
|
| 352 |
+
* descriptor.
|
| 353 |
+
*/
|
| 354 |
+
cudnnStatus_t CUDNNWINAPI
|
| 355 |
+
cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
|
| 356 |
+
uint32_t nbDimsRequested,
|
| 357 |
+
cudnnTensorFormat_t *destFormat,
|
| 358 |
+
int32_t padBeforeA[],
|
| 359 |
+
int32_t padAfterA[],
|
| 360 |
+
uint32_t foldA[],
|
| 361 |
+
cudnnFoldingDirection_t *direction);
|
| 362 |
+
|
| 363 |
+
/**
|
| 364 |
+
* Destroys a previously created tensor transform descriptor.
|
| 365 |
+
*/
|
| 366 |
+
cudnnStatus_t CUDNNWINAPI
|
| 367 |
+
cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
|
| 368 |
+
|
| 369 |
+
/* Tensor layout conversion helper (y = alpha * x + beta * y) */
|
| 370 |
+
cudnnStatus_t CUDNNWINAPI
|
| 371 |
+
cudnnTransformTensor(cudnnHandle_t handle,
|
| 372 |
+
const void *alpha,
|
| 373 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 374 |
+
const void *x,
|
| 375 |
+
const void *beta,
|
| 376 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 377 |
+
void *y);
|
| 378 |
+
|
| 379 |
+
cudnnStatus_t CUDNNWINAPI
|
| 380 |
+
cudnnTransformTensorEx(cudnnHandle_t handle,
|
| 381 |
+
const cudnnTensorTransformDescriptor_t transDesc,
|
| 382 |
+
const void *alpha,
|
| 383 |
+
const cudnnTensorDescriptor_t srcDesc,
|
| 384 |
+
const void *srcData,
|
| 385 |
+
const void *beta,
|
| 386 |
+
const cudnnTensorDescriptor_t destDesc,
|
| 387 |
+
void *destData);
|
| 388 |
+
|
| 389 |
+
/* Tensor Bias addition : C = alpha * A + beta * C */
|
| 390 |
+
cudnnStatus_t CUDNNWINAPI
|
| 391 |
+
cudnnAddTensor(cudnnHandle_t handle,
|
| 392 |
+
const void *alpha,
|
| 393 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 394 |
+
const void *A,
|
| 395 |
+
const void *beta,
|
| 396 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 397 |
+
void *C);
|
| 398 |
+
|
| 399 |
+
/*
 * CUDNN OpTensor op type
 */
typedef enum {
    CUDNN_OP_TENSOR_ADD  = 0,
    CUDNN_OP_TENSOR_MUL  = 1,
    CUDNN_OP_TENSOR_MIN  = 2,
    CUDNN_OP_TENSOR_MAX  = 3,
    CUDNN_OP_TENSOR_SQRT = 4,
    CUDNN_OP_TENSOR_NOT  = 5,
} cudnnOpTensorOp_t;
|
| 410 |
+
|
| 411 |
+
cudnnStatus_t CUDNNWINAPI
|
| 412 |
+
cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
|
| 413 |
+
|
| 414 |
+
cudnnStatus_t CUDNNWINAPI
|
| 415 |
+
cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
|
| 416 |
+
cudnnOpTensorOp_t opTensorOp,
|
| 417 |
+
cudnnDataType_t opTensorCompType,
|
| 418 |
+
cudnnNanPropagation_t opTensorNanOpt);
|
| 419 |
+
|
| 420 |
+
cudnnStatus_t CUDNNWINAPI
|
| 421 |
+
cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
|
| 422 |
+
cudnnOpTensorOp_t *opTensorOp,
|
| 423 |
+
cudnnDataType_t *opTensorCompType,
|
| 424 |
+
cudnnNanPropagation_t *opTensorNanOpt);
|
| 425 |
+
|
| 426 |
+
cudnnStatus_t CUDNNWINAPI
|
| 427 |
+
cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
|
| 428 |
+
|
| 429 |
+
/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
|
| 430 |
+
/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
|
| 431 |
+
cudnnStatus_t CUDNNWINAPI
|
| 432 |
+
cudnnOpTensor(cudnnHandle_t handle,
|
| 433 |
+
const cudnnOpTensorDescriptor_t opTensorDesc,
|
| 434 |
+
const void *alpha1,
|
| 435 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 436 |
+
const void *A,
|
| 437 |
+
const void *alpha2,
|
| 438 |
+
const cudnnTensorDescriptor_t bDesc,
|
| 439 |
+
const void *B,
|
| 440 |
+
const void *beta,
|
| 441 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 442 |
+
void *C);
|
| 443 |
+
|
| 444 |
+
/*
 * CUDNN ReduceTensor op type
 */
typedef enum {
    CUDNN_REDUCE_TENSOR_ADD          = 0,
    CUDNN_REDUCE_TENSOR_MUL          = 1,
    CUDNN_REDUCE_TENSOR_MIN          = 2,
    CUDNN_REDUCE_TENSOR_MAX          = 3,
    CUDNN_REDUCE_TENSOR_AMAX         = 4,
    CUDNN_REDUCE_TENSOR_AVG          = 5,
    CUDNN_REDUCE_TENSOR_NORM1        = 6,
    CUDNN_REDUCE_TENSOR_NORM2        = 7,
    CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
} cudnnReduceTensorOp_t;

/*
 * CUDNN ReduceTensor indices type
 */
typedef enum {
    CUDNN_REDUCE_TENSOR_NO_INDICES        = 0,
    CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
} cudnnReduceTensorIndices_t;

/*
 * CUDNN tensor indices type size (all unsigned)
 * Currently not supported, default is 32 bit unsigned.
 */
typedef enum {
    CUDNN_32BIT_INDICES = 0,
    CUDNN_64BIT_INDICES = 1,
    CUDNN_16BIT_INDICES = 2,
    CUDNN_8BIT_INDICES  = 3,
} cudnnIndicesType_t;
|
| 477 |
+
|
| 478 |
+
cudnnStatus_t CUDNNWINAPI
|
| 479 |
+
cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
|
| 480 |
+
|
| 481 |
+
cudnnStatus_t CUDNNWINAPI
|
| 482 |
+
cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 483 |
+
cudnnReduceTensorOp_t reduceTensorOp,
|
| 484 |
+
cudnnDataType_t reduceTensorCompType,
|
| 485 |
+
cudnnNanPropagation_t reduceTensorNanOpt,
|
| 486 |
+
cudnnReduceTensorIndices_t reduceTensorIndices,
|
| 487 |
+
cudnnIndicesType_t reduceTensorIndicesType);
|
| 488 |
+
|
| 489 |
+
cudnnStatus_t CUDNNWINAPI
|
| 490 |
+
cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 491 |
+
cudnnReduceTensorOp_t *reduceTensorOp,
|
| 492 |
+
cudnnDataType_t *reduceTensorCompType,
|
| 493 |
+
cudnnNanPropagation_t *reduceTensorNanOpt,
|
| 494 |
+
cudnnReduceTensorIndices_t *reduceTensorIndices,
|
| 495 |
+
cudnnIndicesType_t *reduceTensorIndicesType);
|
| 496 |
+
|
| 497 |
+
cudnnStatus_t CUDNNWINAPI
|
| 498 |
+
cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
|
| 499 |
+
|
| 500 |
+
/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
|
| 501 |
+
* output tensors */
|
| 502 |
+
cudnnStatus_t CUDNNWINAPI
|
| 503 |
+
cudnnGetReductionIndicesSize(cudnnHandle_t handle,
|
| 504 |
+
const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 505 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 506 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 507 |
+
size_t *sizeInBytes);
|
| 508 |
+
|
| 509 |
+
/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
|
| 510 |
+
* tensors */
|
| 511 |
+
cudnnStatus_t CUDNNWINAPI
|
| 512 |
+
cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
|
| 513 |
+
const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 514 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 515 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 516 |
+
size_t *sizeInBytes);
|
| 517 |
+
|
| 518 |
+
/* Tensor operation : C = reduce op( alpha * A ) + beta * C */
|
| 519 |
+
/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
|
| 520 |
+
/* The indices space is ignored for reduce ops other than min or max. */
|
| 521 |
+
cudnnStatus_t CUDNNWINAPI
|
| 522 |
+
cudnnReduceTensor(cudnnHandle_t handle,
|
| 523 |
+
const cudnnReduceTensorDescriptor_t reduceTensorDesc,
|
| 524 |
+
void *indices,
|
| 525 |
+
size_t indicesSizeInBytes,
|
| 526 |
+
void *workspace,
|
| 527 |
+
size_t workspaceSizeInBytes,
|
| 528 |
+
const void *alpha,
|
| 529 |
+
const cudnnTensorDescriptor_t aDesc,
|
| 530 |
+
const void *A,
|
| 531 |
+
const void *beta,
|
| 532 |
+
const cudnnTensorDescriptor_t cDesc,
|
| 533 |
+
void *C);
|
| 534 |
+
|
| 535 |
+
/* Set all values of a tensor to a given value : y[i] = value[0] */
|
| 536 |
+
cudnnStatus_t CUDNNWINAPI
|
| 537 |
+
cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
|
| 538 |
+
|
| 539 |
+
/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
|
| 540 |
+
cudnnStatus_t CUDNNWINAPI
|
| 541 |
+
cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
|
| 542 |
+
|
| 543 |
+
/* Create an instance of FilterStruct */
|
| 544 |
+
cudnnStatus_t CUDNNWINAPI
|
| 545 |
+
cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
|
| 546 |
+
|
| 547 |
+
cudnnStatus_t CUDNNWINAPI
|
| 548 |
+
cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
|
| 549 |
+
cudnnDataType_t dataType, /* image data type */
|
| 550 |
+
cudnnTensorFormat_t format,
|
| 551 |
+
int k, /* number of output feature maps */
|
| 552 |
+
int c, /* number of input feature maps */
|
| 553 |
+
int h, /* height of each input filter */
|
| 554 |
+
int w); /* width of each input filter */
|
| 555 |
+
|
| 556 |
+
cudnnStatus_t CUDNNWINAPI
|
| 557 |
+
cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
|
| 558 |
+
cudnnDataType_t *dataType, /* image data type */
|
| 559 |
+
cudnnTensorFormat_t *format,
|
| 560 |
+
int *k, /* number of output feature maps */
|
| 561 |
+
int *c, /* number of input feature maps */
|
| 562 |
+
int *h, /* height of each input filter */
|
| 563 |
+
int *w); /* width of each input filter */
|
| 564 |
+
|
| 565 |
+
cudnnStatus_t CUDNNWINAPI
|
| 566 |
+
cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
|
| 567 |
+
cudnnDataType_t dataType, /* image data type */
|
| 568 |
+
cudnnTensorFormat_t format,
|
| 569 |
+
int nbDims,
|
| 570 |
+
const int filterDimA[]);
|
| 571 |
+
|
| 572 |
+
cudnnStatus_t CUDNNWINAPI
|
| 573 |
+
cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
|
| 574 |
+
int nbDimsRequested,
|
| 575 |
+
cudnnDataType_t *dataType, /* image data type */
|
| 576 |
+
cudnnTensorFormat_t *format,
|
| 577 |
+
int *nbDims,
|
| 578 |
+
int filterDimA[]);
|
| 579 |
+
cudnnStatus_t CUDNNWINAPI
|
| 580 |
+
cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
|
| 581 |
+
|
| 582 |
+
cudnnStatus_t CUDNNWINAPI
|
| 583 |
+
cudnnTransformFilter(cudnnHandle_t handle,
|
| 584 |
+
const cudnnTensorTransformDescriptor_t transDesc,
|
| 585 |
+
const void *alpha,
|
| 586 |
+
const cudnnFilterDescriptor_t srcDesc,
|
| 587 |
+
const void *srcData,
|
| 588 |
+
const void *beta,
|
| 589 |
+
const cudnnFilterDescriptor_t destDesc,
|
| 590 |
+
void *destData);
|
| 591 |
+
|
| 592 |
+
cudnnStatus_t CUDNNWINAPI
|
| 593 |
+
cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
|
| 594 |
+
|
| 595 |
+
/*
 * softmax algorithm
 */
typedef enum {
    CUDNN_SOFTMAX_FAST     = 0, /* straightforward implementation */
    CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
    CUDNN_SOFTMAX_LOG      = 2
} cudnnSoftmaxAlgorithm_t;

typedef enum {
    CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
    CUDNN_SOFTMAX_MODE_CHANNEL  = 1  /* compute the softmax over all C for each H, W, N */
} cudnnSoftmaxMode_t;
|
| 608 |
+
|
| 609 |
+
/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
|
| 610 |
+
|
| 611 |
+
/* Function to perform forward softmax */
|
| 612 |
+
cudnnStatus_t CUDNNWINAPI
|
| 613 |
+
cudnnSoftmaxForward(cudnnHandle_t handle,
|
| 614 |
+
cudnnSoftmaxAlgorithm_t algo,
|
| 615 |
+
cudnnSoftmaxMode_t mode,
|
| 616 |
+
const void *alpha,
|
| 617 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 618 |
+
const void *x,
|
| 619 |
+
const void *beta,
|
| 620 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 621 |
+
void *y);
|
| 622 |
+
|
| 623 |
+
/*
 * pooling mode
 */
typedef enum {
    CUDNN_POOLING_MAX                           = 0,
    CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
    CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
    CUDNN_POOLING_MAX_DETERMINISTIC             = 3
} cudnnPoolingMode_t;
|
| 632 |
+
|
| 633 |
+
/* Create an instance of pooling descriptor */
|
| 634 |
+
cudnnStatus_t CUDNNWINAPI
|
| 635 |
+
cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
|
| 636 |
+
|
| 637 |
+
cudnnStatus_t CUDNNWINAPI
|
| 638 |
+
cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
|
| 639 |
+
cudnnPoolingMode_t mode,
|
| 640 |
+
cudnnNanPropagation_t maxpoolingNanOpt,
|
| 641 |
+
int windowHeight,
|
| 642 |
+
int windowWidth,
|
| 643 |
+
int verticalPadding,
|
| 644 |
+
int horizontalPadding,
|
| 645 |
+
int verticalStride,
|
| 646 |
+
int horizontalStride);
|
| 647 |
+
|
| 648 |
+
cudnnStatus_t CUDNNWINAPI
|
| 649 |
+
cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
|
| 650 |
+
cudnnPoolingMode_t *mode,
|
| 651 |
+
cudnnNanPropagation_t *maxpoolingNanOpt,
|
| 652 |
+
int *windowHeight,
|
| 653 |
+
int *windowWidth,
|
| 654 |
+
int *verticalPadding,
|
| 655 |
+
int *horizontalPadding,
|
| 656 |
+
int *verticalStride,
|
| 657 |
+
int *horizontalStride);
|
| 658 |
+
|
| 659 |
+
cudnnStatus_t CUDNNWINAPI
|
| 660 |
+
cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
|
| 661 |
+
const cudnnPoolingMode_t mode,
|
| 662 |
+
const cudnnNanPropagation_t maxpoolingNanOpt,
|
| 663 |
+
int nbDims,
|
| 664 |
+
const int windowDimA[],
|
| 665 |
+
const int paddingA[],
|
| 666 |
+
const int strideA[]);
|
| 667 |
+
|
| 668 |
+
cudnnStatus_t CUDNNWINAPI
|
| 669 |
+
cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
|
| 670 |
+
int nbDimsRequested,
|
| 671 |
+
cudnnPoolingMode_t *mode,
|
| 672 |
+
cudnnNanPropagation_t *maxpoolingNanOpt,
|
| 673 |
+
int *nbDims,
|
| 674 |
+
int windowDimA[],
|
| 675 |
+
int paddingA[],
|
| 676 |
+
int strideA[]);
|
| 677 |
+
|
| 678 |
+
cudnnStatus_t CUDNNWINAPI
|
| 679 |
+
cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
|
| 680 |
+
const cudnnTensorDescriptor_t inputTensorDesc,
|
| 681 |
+
int nbDims,
|
| 682 |
+
int outputTensorDimA[]);
|
| 683 |
+
|
| 684 |
+
cudnnStatus_t CUDNNWINAPI
|
| 685 |
+
cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
|
| 686 |
+
const cudnnTensorDescriptor_t inputTensorDesc,
|
| 687 |
+
int *n,
|
| 688 |
+
int *c,
|
| 689 |
+
int *h,
|
| 690 |
+
int *w);
|
| 691 |
+
|
| 692 |
+
/* Destroy an instance of pooling descriptor */
|
| 693 |
+
cudnnStatus_t CUDNNWINAPI
|
| 694 |
+
cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
|
| 695 |
+
|
| 696 |
+
/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
|
| 697 |
+
|
| 698 |
+
/* Function to perform forward pooling */
|
| 699 |
+
cudnnStatus_t CUDNNWINAPI
|
| 700 |
+
cudnnPoolingForward(cudnnHandle_t handle,
|
| 701 |
+
const cudnnPoolingDescriptor_t poolingDesc,
|
| 702 |
+
const void *alpha,
|
| 703 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 704 |
+
const void *x,
|
| 705 |
+
const void *beta,
|
| 706 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 707 |
+
void *y);
|
| 708 |
+
|
| 709 |
+
/*
 * activation mode
 */
typedef enum {
    CUDNN_ACTIVATION_SIGMOID      = 0,
    CUDNN_ACTIVATION_RELU         = 1,
    CUDNN_ACTIVATION_TANH         = 2,
    CUDNN_ACTIVATION_CLIPPED_RELU = 3,
    CUDNN_ACTIVATION_ELU          = 4,
    CUDNN_ACTIVATION_IDENTITY     = 5,
    CUDNN_ACTIVATION_SWISH        = 6
} cudnnActivationMode_t;
|
| 721 |
+
|
| 722 |
+
/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
|
| 723 |
+
cudnnStatus_t CUDNNWINAPI
|
| 724 |
+
cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
|
| 725 |
+
|
| 726 |
+
cudnnStatus_t CUDNNWINAPI
|
| 727 |
+
cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
|
| 728 |
+
cudnnActivationMode_t mode,
|
| 729 |
+
cudnnNanPropagation_t reluNanOpt,
|
| 730 |
+
double coef); /* ceiling for clipped RELU, alpha for ELU */
|
| 731 |
+
|
| 732 |
+
cudnnStatus_t CUDNNWINAPI
|
| 733 |
+
cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
|
| 734 |
+
cudnnActivationMode_t *mode,
|
| 735 |
+
cudnnNanPropagation_t *reluNanOpt,
|
| 736 |
+
double *coef); /* ceiling for clipped RELU, alpha for ELU */
|
| 737 |
+
|
| 738 |
+
cudnnStatus_t CUDNNWINAPI
|
| 739 |
+
cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
|
| 740 |
+
|
| 741 |
+
cudnnStatus_t CUDNNWINAPI
|
| 742 |
+
cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
|
| 743 |
+
|
| 744 |
+
cudnnStatus_t CUDNNWINAPI
|
| 745 |
+
cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
|
| 746 |
+
|
| 747 |
+
/* Function to perform forward activation */
|
| 748 |
+
cudnnStatus_t CUDNNWINAPI
|
| 749 |
+
cudnnActivationForward(cudnnHandle_t handle,
|
| 750 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 751 |
+
const void *alpha,
|
| 752 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 753 |
+
const void *x,
|
| 754 |
+
const void *beta,
|
| 755 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 756 |
+
void *y);
|
| 757 |
+
|
| 758 |
+
/*
|
| 759 |
+
* Create an instance of LRN (Local Response Normalization) descriptor
|
| 760 |
+
* Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
|
| 761 |
+
*/
|
| 762 |
+
cudnnStatus_t CUDNNWINAPI
|
| 763 |
+
cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
|
| 764 |
+
|
| 765 |
+
#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
|
| 766 |
+
#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
|
| 767 |
+
#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
|
| 768 |
+
#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
|
| 769 |
+
|
| 770 |
+
/* LRN layer mode */
|
| 771 |
+
typedef enum {
|
| 772 |
+
CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
|
| 773 |
+
} cudnnLRNMode_t;
|
| 774 |
+
|
| 775 |
+
/*
|
| 776 |
+
* Uses a window [center-lookBehind, center+lookAhead], where
|
| 777 |
+
* lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
|
| 778 |
+
* Values of double parameters cast to tensor data type.
|
| 779 |
+
*/
|
| 780 |
+
cudnnStatus_t CUDNNWINAPI
|
| 781 |
+
cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
|
| 782 |
+
/*
|
| 783 |
+
* Retrieve the settings currently stored in an LRN layer descriptor
|
| 784 |
+
* Any of the provided pointers can be NULL (no corresponding value will be returned)
|
| 785 |
+
*/
|
| 786 |
+
cudnnStatus_t CUDNNWINAPI
|
| 787 |
+
cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
|
| 788 |
+
|
| 789 |
+
/* Destroy an instance of LRN descriptor */
|
| 790 |
+
cudnnStatus_t CUDNNWINAPI
|
| 791 |
+
cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
|
| 792 |
+
|
| 793 |
+
/* LRN functions: output = alpha * normalize(x) + beta * old_y */
|
| 794 |
+
|
| 795 |
+
/* LRN cross-channel forward computation. Double parameters cast to tensor data type */
|
| 796 |
+
cudnnStatus_t CUDNNWINAPI
|
| 797 |
+
cudnnLRNCrossChannelForward(cudnnHandle_t handle,
|
| 798 |
+
cudnnLRNDescriptor_t normDesc,
|
| 799 |
+
cudnnLRNMode_t lrnMode,
|
| 800 |
+
const void *alpha,
|
| 801 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 802 |
+
const void *x,
|
| 803 |
+
const void *beta,
|
| 804 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 805 |
+
void *y);
|
| 806 |
+
|
| 807 |
+
typedef enum {
|
| 808 |
+
CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
|
| 809 |
+
} cudnnDivNormMode_t;
|
| 810 |
+
|
| 811 |
+
/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
|
| 812 |
+
cudnnStatus_t CUDNNWINAPI
|
| 813 |
+
cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
|
| 814 |
+
cudnnLRNDescriptor_t normDesc,
|
| 815 |
+
cudnnDivNormMode_t mode,
|
| 816 |
+
const void *alpha,
|
| 817 |
+
const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
|
| 818 |
+
const void *x,
|
| 819 |
+
const void *means, /* if NULL, means are assumed to be zero */
|
| 820 |
+
void *temp,
|
| 821 |
+
void *temp2,
|
| 822 |
+
const void *beta,
|
| 823 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 824 |
+
void *y);
|
| 825 |
+
|
| 826 |
+
typedef enum {
    /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
    CUDNN_BATCHNORM_PER_ACTIVATION = 0,

    /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
    CUDNN_BATCHNORM_SPATIAL = 1,

    /*
     * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
     * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
     */
    CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
} cudnnBatchNormMode_t;

#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
|
| 841 |
+
|
| 842 |
+
/*
|
| 843 |
+
* Derives a tensor descriptor from layer data descriptor for BatchNormalization
|
| 844 |
+
* scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
|
| 845 |
+
* bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
|
| 846 |
+
*/
|
| 847 |
+
cudnnStatus_t CUDNNWINAPI
|
| 848 |
+
cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
|
| 849 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 850 |
+
cudnnBatchNormMode_t mode);
|
| 851 |
+
|
| 852 |
+
typedef enum {
|
| 853 |
+
CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
|
| 854 |
+
CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
|
| 855 |
+
CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
|
| 856 |
+
} cudnnBatchNormOps_t;
|
| 857 |
+
|
| 858 |
+
/*
|
| 859 |
+
* Performs Batch Normalization during Inference:
|
| 860 |
+
* y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
|
| 861 |
+
* with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
|
| 862 |
+
* according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
|
| 863 |
+
* above for notes on function arguments.
|
| 864 |
+
*/
|
| 865 |
+
cudnnStatus_t CUDNNWINAPI
|
| 866 |
+
cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
|
| 867 |
+
cudnnBatchNormMode_t mode,
|
| 868 |
+
const void *alpha, /* alpha[0] = result blend factor */
|
| 869 |
+
const void *beta, /* beta[0] = dest layer blend factor */
|
| 870 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 871 |
+
const void *x, /* NxCxHxW */
|
| 872 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 873 |
+
void *y, /* NxCxHxW */
|
| 874 |
+
const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
|
| 875 |
+
const void *bnScale,
|
| 876 |
+
const void *bnBias,
|
| 877 |
+
const void *estimatedMean,
|
| 878 |
+
const void *estimatedVariance,
|
| 879 |
+
double epsilon);
|
| 880 |
+
|
| 881 |
+
typedef enum {
|
| 882 |
+
/* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
|
| 883 |
+
CUDNN_NORM_PER_ACTIVATION = 0,
|
| 884 |
+
|
| 885 |
+
/* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
|
| 886 |
+
CUDNN_NORM_PER_CHANNEL = 1,
|
| 887 |
+
} cudnnNormMode_t;
|
| 888 |
+
|
| 889 |
+
typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t;
|
| 890 |
+
|
| 891 |
+
/*
|
| 892 |
+
* Derives a tensor descriptor from layer data descriptor for Normalization
|
| 893 |
+
* scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
|
| 894 |
+
* normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
|
| 895 |
+
*/
|
| 896 |
+
cudnnStatus_t CUDNNWINAPI
|
| 897 |
+
cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
|
| 898 |
+
cudnnTensorDescriptor_t derivedNormMeanVarDesc,
|
| 899 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 900 |
+
cudnnNormMode_t mode,
|
| 901 |
+
int groupCnt); /* Place hold for future work, should be set to 1 now*/
|
| 902 |
+
|
| 903 |
+
typedef enum {
|
| 904 |
+
CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
|
| 905 |
+
CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
|
| 906 |
+
CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
|
| 907 |
+
} cudnnNormOps_t;
|
| 908 |
+
|
| 909 |
+
/*
|
| 910 |
+
* Performs Normalization during Inference:
|
| 911 |
+
* y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
|
| 912 |
+
* with normScale, normBias, runningMean, runningInvVariance tensors indexed
|
| 913 |
+
* according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
|
| 914 |
+
* above for notes on function arguments.
|
| 915 |
+
*/
|
| 916 |
+
cudnnStatus_t CUDNNWINAPI
|
| 917 |
+
cudnnNormalizationForwardInference(cudnnHandle_t handle,
|
| 918 |
+
cudnnNormMode_t mode,
|
| 919 |
+
cudnnNormOps_t normOps,
|
| 920 |
+
cudnnNormAlgo_t algo,
|
| 921 |
+
const void *alpha, /* alpha[0] = result blend factor */
|
| 922 |
+
const void *beta, /* beta[0] = dest layer blend factor */
|
| 923 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 924 |
+
const void *x, /* NxCxHxW */
|
| 925 |
+
const cudnnTensorDescriptor_t normScaleBiasDesc,
|
| 926 |
+
const void *normScale,
|
| 927 |
+
const void *normBias,
|
| 928 |
+
const cudnnTensorDescriptor_t normMeanVarDesc,
|
| 929 |
+
const void *estimatedMean,
|
| 930 |
+
const void *estimatedVariance,
|
| 931 |
+
const cudnnTensorDescriptor_t zDesc,
|
| 932 |
+
const void *z,
|
| 933 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 934 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 935 |
+
void *y, /* NxCxHxW */
|
| 936 |
+
double epsilon,
|
| 937 |
+
int groupCnt); /* Place hold for future work*/
|
| 938 |
+
|
| 939 |
+
/* APIs for spatial transformer network*/
|
| 940 |
+
typedef enum {
|
| 941 |
+
CUDNN_SAMPLER_BILINEAR = 0,
|
| 942 |
+
} cudnnSamplerType_t;
|
| 943 |
+
|
| 944 |
+
cudnnStatus_t CUDNNWINAPI
|
| 945 |
+
cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
|
| 946 |
+
|
| 947 |
+
cudnnStatus_t CUDNNWINAPI
|
| 948 |
+
cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
|
| 949 |
+
cudnnSamplerType_t samplerType,
|
| 950 |
+
cudnnDataType_t dataType,
|
| 951 |
+
const int nbDims,
|
| 952 |
+
const int dimA[]);
|
| 953 |
+
|
| 954 |
+
cudnnStatus_t CUDNNWINAPI
|
| 955 |
+
cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
|
| 956 |
+
|
| 957 |
+
cudnnStatus_t CUDNNWINAPI
|
| 958 |
+
cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
|
| 959 |
+
const cudnnSpatialTransformerDescriptor_t stDesc,
|
| 960 |
+
const void *theta,
|
| 961 |
+
void *grid);
|
| 962 |
+
|
| 963 |
+
cudnnStatus_t CUDNNWINAPI
|
| 964 |
+
cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
|
| 965 |
+
cudnnSpatialTransformerDescriptor_t stDesc,
|
| 966 |
+
const void *alpha,
|
| 967 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 968 |
+
const void *x,
|
| 969 |
+
const void *grid,
|
| 970 |
+
const void *beta,
|
| 971 |
+
cudnnTensorDescriptor_t yDesc,
|
| 972 |
+
void *y);
|
| 973 |
+
|
| 974 |
+
typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
|
| 975 |
+
|
| 976 |
+
cudnnStatus_t CUDNNWINAPI
|
| 977 |
+
cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
|
| 978 |
+
|
| 979 |
+
cudnnStatus_t CUDNNWINAPI
|
| 980 |
+
cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
|
| 981 |
+
|
| 982 |
+
/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
|
| 983 |
+
cudnnStatus_t CUDNNWINAPI
|
| 984 |
+
cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
|
| 985 |
+
|
| 986 |
+
/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
|
| 987 |
+
cudnnStatus_t CUDNNWINAPI
|
| 988 |
+
cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
|
| 989 |
+
|
| 990 |
+
cudnnStatus_t CUDNNWINAPI
|
| 991 |
+
cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
|
| 992 |
+
cudnnHandle_t handle,
|
| 993 |
+
float dropout,
|
| 994 |
+
void *states,
|
| 995 |
+
size_t stateSizeInBytes,
|
| 996 |
+
unsigned long long seed);
|
| 997 |
+
|
| 998 |
+
/* Restores the dropout descriptor to a previously saved-off state */
|
| 999 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1000 |
+
cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
|
| 1001 |
+
cudnnHandle_t handle,
|
| 1002 |
+
float dropout,
|
| 1003 |
+
void *states,
|
| 1004 |
+
size_t stateSizeInBytes,
|
| 1005 |
+
unsigned long long seed);
|
| 1006 |
+
|
| 1007 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1008 |
+
cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
|
| 1009 |
+
cudnnHandle_t handle,
|
| 1010 |
+
float *dropout,
|
| 1011 |
+
void **states,
|
| 1012 |
+
unsigned long long *seed);
|
| 1013 |
+
|
| 1014 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1015 |
+
cudnnDropoutForward(cudnnHandle_t handle,
|
| 1016 |
+
const cudnnDropoutDescriptor_t dropoutDesc,
|
| 1017 |
+
const cudnnTensorDescriptor_t xdesc,
|
| 1018 |
+
const void *x,
|
| 1019 |
+
const cudnnTensorDescriptor_t ydesc,
|
| 1020 |
+
void *y,
|
| 1021 |
+
void *reserveSpace,
|
| 1022 |
+
size_t reserveSpaceSizeInBytes);
|
| 1023 |
+
|
| 1024 |
+
/* TODO: remove */
|
| 1025 |
+
|
| 1026 |
+
typedef struct cudnnAlgorithmStruct *cudnnAlgorithmDescriptor_t;
|
| 1027 |
+
typedef struct cudnnAlgorithmPerformanceStruct *cudnnAlgorithmPerformance_t;
|
| 1028 |
+
|
| 1029 |
+
/* TODO: move these enums out to the appropriate submodule */
|
| 1030 |
+
typedef enum {
|
| 1031 |
+
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
|
| 1032 |
+
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
|
| 1033 |
+
CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
|
| 1034 |
+
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
|
| 1035 |
+
CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
|
| 1036 |
+
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
|
| 1037 |
+
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
|
| 1038 |
+
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
|
| 1039 |
+
CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
|
| 1040 |
+
} cudnnConvolutionFwdAlgo_t;
|
| 1041 |
+
|
| 1042 |
+
typedef enum {
|
| 1043 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
|
| 1044 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
|
| 1045 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
|
| 1046 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
|
| 1047 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
|
| 1048 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
|
| 1049 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
|
| 1050 |
+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
|
| 1051 |
+
} cudnnConvolutionBwdFilterAlgo_t;
|
| 1052 |
+
|
| 1053 |
+
typedef enum {
|
| 1054 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
|
| 1055 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
|
| 1056 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
|
| 1057 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
|
| 1058 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
|
| 1059 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
|
| 1060 |
+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
|
| 1061 |
+
} cudnnConvolutionBwdDataAlgo_t;
|
| 1062 |
+
|
| 1063 |
+
typedef enum {
|
| 1064 |
+
CUDNN_RNN_ALGO_STANDARD = 0,
|
| 1065 |
+
CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
|
| 1066 |
+
CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
|
| 1067 |
+
CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
|
| 1068 |
+
CUDNN_RNN_ALGO_COUNT = 4,
|
| 1069 |
+
} cudnnRNNAlgo_t;
|
| 1070 |
+
|
| 1071 |
+
typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
|
| 1072 |
+
|
| 1073 |
+
/* TODO: remove */
|
| 1074 |
+
typedef struct cudnnAlgorithmUnionStruct {
|
| 1075 |
+
union Algorithm {
|
| 1076 |
+
cudnnConvolutionFwdAlgo_t convFwdAlgo;
|
| 1077 |
+
cudnnConvolutionBwdFilterAlgo_t convBwdFilterAlgo;
|
| 1078 |
+
cudnnConvolutionBwdDataAlgo_t convBwdDataAlgo;
|
| 1079 |
+
cudnnRNNAlgo_t RNNAlgo;
|
| 1080 |
+
cudnnCTCLossAlgo_t CTCLossAlgo;
|
| 1081 |
+
} algo;
|
| 1082 |
+
} cudnnAlgorithm_t;
|
| 1083 |
+
|
| 1084 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1085 |
+
cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc);
|
| 1086 |
+
|
| 1087 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1088 |
+
cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm);
|
| 1089 |
+
|
| 1090 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1091 |
+
cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm);
|
| 1092 |
+
|
| 1093 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1094 |
+
cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest);
|
| 1095 |
+
|
| 1096 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1097 |
+
cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc);
|
| 1098 |
+
|
| 1099 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1100 |
+
cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate);
|
| 1101 |
+
|
| 1102 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1103 |
+
cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf,
|
| 1104 |
+
cudnnAlgorithmDescriptor_t algoDesc,
|
| 1105 |
+
cudnnStatus_t status,
|
| 1106 |
+
float time,
|
| 1107 |
+
size_t memory);
|
| 1108 |
+
|
| 1109 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1110 |
+
cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf,
|
| 1111 |
+
cudnnAlgorithmDescriptor_t *algoDesc,
|
| 1112 |
+
cudnnStatus_t *status,
|
| 1113 |
+
float *time,
|
| 1114 |
+
size_t *memory);
|
| 1115 |
+
|
| 1116 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1117 |
+
cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy);
|
| 1118 |
+
|
| 1119 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1120 |
+
cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes);
|
| 1121 |
+
|
| 1122 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1123 |
+
cudnnSaveAlgorithm(cudnnHandle_t handle,
|
| 1124 |
+
cudnnAlgorithmDescriptor_t algoDesc,
|
| 1125 |
+
void *algoSpace,
|
| 1126 |
+
size_t algoSpaceSizeInBytes);
|
| 1127 |
+
|
| 1128 |
+
CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
|
| 1129 |
+
cudnnRestoreAlgorithm(cudnnHandle_t handle,
|
| 1130 |
+
void *algoSpace,
|
| 1131 |
+
size_t algoSpaceSizeInBytes,
|
| 1132 |
+
cudnnAlgorithmDescriptor_t algoDesc);
|
| 1133 |
+
|
| 1134 |
+
typedef enum {
|
| 1135 |
+
CUDNN_SEV_FATAL = 0,
|
| 1136 |
+
CUDNN_SEV_ERROR = 1,
|
| 1137 |
+
CUDNN_SEV_WARNING = 2,
|
| 1138 |
+
CUDNN_SEV_INFO = 3,
|
| 1139 |
+
} cudnnSeverity_t;
|
| 1140 |
+
|
| 1141 |
+
/* Message masks to be used with cudnnSetCallback() */
|
| 1142 |
+
#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
|
| 1143 |
+
#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
|
| 1144 |
+
#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
|
| 1145 |
+
|
| 1146 |
+
/* struct containing useful informaiton for each API call */
|
| 1147 |
+
typedef struct cudnnDebugStruct {
|
| 1148 |
+
unsigned cudnn_version;
|
| 1149 |
+
cudnnStatus_t cudnnStatus;
|
| 1150 |
+
unsigned time_sec; /* epoch time in seconds */
|
| 1151 |
+
unsigned time_usec; /* microseconds part of epoch time */
|
| 1152 |
+
unsigned time_delta; /* time since start in seconds */
|
| 1153 |
+
cudnnHandle_t handle; /* cudnn handle */
|
| 1154 |
+
cudaStream_t stream; /* cuda stream ID */
|
| 1155 |
+
unsigned long long pid; /* process ID */
|
| 1156 |
+
unsigned long long tid; /* thread ID */
|
| 1157 |
+
int cudaDeviceId; /* CUDA device ID */
|
| 1158 |
+
int reserved[15]; /* reserved for future use */
|
| 1159 |
+
} cudnnDebug_t;
|
| 1160 |
+
|
| 1161 |
+
typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
|
| 1162 |
+
|
| 1163 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1164 |
+
cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
|
| 1165 |
+
|
| 1166 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1167 |
+
cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
|
| 1168 |
+
|
| 1169 |
+
/*
|
| 1170 |
+
* \brief Cross-library version checker.
|
| 1171 |
+
* This function is implemented differently in each sub-library. Each sublib
|
| 1172 |
+
* checks whether its own version matches that of its dependencies.
|
| 1173 |
+
* \returns CUDNN_STATUS_SUCCESS if the version check passes,
|
| 1174 |
+
* CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
|
| 1175 |
+
*/
|
| 1176 |
+
cudnnStatus_t CUDNNWINAPI
|
| 1177 |
+
cudnnOpsInferVersionCheck(void);
|
| 1178 |
+
|
| 1179 |
+
#if defined(__cplusplus)
|
| 1180 |
+
}
|
| 1181 |
+
#endif
|
| 1182 |
+
|
| 1183 |
+
#endif /* CUDNN_OPS_INFER_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops_train.h
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*
|
| 51 |
+
* cudnn_ops_train : cuDNN's basic training operations and algorithms.
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#if !defined(CUDNN_OPS_TRAIN_H_)
|
| 55 |
+
#define CUDNN_OPS_TRAIN_H_
|
| 56 |
+
|
| 57 |
+
#include <cuda_runtime.h>
|
| 58 |
+
#include <stdint.h>
|
| 59 |
+
|
| 60 |
+
#include "cudnn_version.h"
|
| 61 |
+
#include "cudnn_ops_infer.h"
|
| 62 |
+
|
| 63 |
+
/* These version numbers are autogenerated, do not edit manually. */
|
| 64 |
+
#define CUDNN_OPS_TRAIN_MAJOR 8
|
| 65 |
+
#define CUDNN_OPS_TRAIN_MINOR 7
|
| 66 |
+
#define CUDNN_OPS_TRAIN_PATCH 0
|
| 67 |
+
|
| 68 |
+
#if (CUDNN_OPS_TRAIN_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_TRAIN_MINOR != CUDNN_MINOR) || \
|
| 69 |
+
(CUDNN_OPS_TRAIN_PATCH != CUDNN_PATCHLEVEL)
|
| 70 |
+
#error Version mismatch in cuDNN OPS TRAIN!!!
|
| 71 |
+
#endif
|
| 72 |
+
|
| 73 |
+
#if defined(__cplusplus)
|
| 74 |
+
extern "C" {
|
| 75 |
+
#endif
|
| 76 |
+
|
| 77 |
+
/* Function to perform backward softmax */
|
| 78 |
+
cudnnStatus_t CUDNNWINAPI
|
| 79 |
+
cudnnSoftmaxBackward(cudnnHandle_t handle,
|
| 80 |
+
cudnnSoftmaxAlgorithm_t algo,
|
| 81 |
+
cudnnSoftmaxMode_t mode,
|
| 82 |
+
const void *alpha,
|
| 83 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 84 |
+
const void *y,
|
| 85 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 86 |
+
const void *dy,
|
| 87 |
+
const void *beta,
|
| 88 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 89 |
+
void *dx);
|
| 90 |
+
|
| 91 |
+
/* Function to perform backward pooling */
|
| 92 |
+
cudnnStatus_t CUDNNWINAPI
|
| 93 |
+
cudnnPoolingBackward(cudnnHandle_t handle,
|
| 94 |
+
const cudnnPoolingDescriptor_t poolingDesc,
|
| 95 |
+
const void *alpha,
|
| 96 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 97 |
+
const void *y,
|
| 98 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 99 |
+
const void *dy,
|
| 100 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 101 |
+
const void *x,
|
| 102 |
+
const void *beta,
|
| 103 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 104 |
+
void *dx);
|
| 105 |
+
|
| 106 |
+
/* Function to perform backward activation */
|
| 107 |
+
cudnnStatus_t CUDNNWINAPI
|
| 108 |
+
cudnnActivationBackward(cudnnHandle_t handle,
|
| 109 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 110 |
+
const void *alpha,
|
| 111 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 112 |
+
const void *y,
|
| 113 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 114 |
+
const void *dy,
|
| 115 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 116 |
+
const void *x,
|
| 117 |
+
const void *beta,
|
| 118 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 119 |
+
void *dx);
|
| 120 |
+
|
| 121 |
+
/* LRN cross-channel backward computation. Double parameters cast to tensor data type */
|
| 122 |
+
cudnnStatus_t CUDNNWINAPI
|
| 123 |
+
cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
|
| 124 |
+
cudnnLRNDescriptor_t normDesc,
|
| 125 |
+
cudnnLRNMode_t lrnMode,
|
| 126 |
+
const void *alpha,
|
| 127 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 128 |
+
const void *y,
|
| 129 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 130 |
+
const void *dy,
|
| 131 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 132 |
+
const void *x,
|
| 133 |
+
const void *beta,
|
| 134 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 135 |
+
void *dx);
|
| 136 |
+
|
| 137 |
+
cudnnStatus_t CUDNNWINAPI
|
| 138 |
+
cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
|
| 139 |
+
cudnnLRNDescriptor_t normDesc,
|
| 140 |
+
cudnnDivNormMode_t mode,
|
| 141 |
+
const void *alpha,
|
| 142 |
+
const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
|
| 143 |
+
const void *x,
|
| 144 |
+
const void *means, /* if NULL, means are assumed to be zero */
|
| 145 |
+
const void *dy,
|
| 146 |
+
void *temp,
|
| 147 |
+
void *temp2,
|
| 148 |
+
const void *beta,
|
| 149 |
+
const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
|
| 150 |
+
void *dx, /* output x differential */
|
| 151 |
+
void *dMeans); /* output means differential, can be NULL */
|
| 152 |
+
|
| 153 |
+
cudnnStatus_t CUDNNWINAPI
|
| 154 |
+
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
|
| 155 |
+
cudnnBatchNormMode_t mode,
|
| 156 |
+
cudnnBatchNormOps_t bnOps,
|
| 157 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 158 |
+
const cudnnTensorDescriptor_t zDesc,
|
| 159 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 160 |
+
const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
|
| 161 |
+
const cudnnActivationDescriptor_t activationDesc,
|
| 162 |
+
size_t *sizeInBytes);
|
| 163 |
+
|
| 164 |
+
cudnnStatus_t CUDNNWINAPI
|
| 165 |
+
cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
|
| 166 |
+
cudnnBatchNormMode_t mode,
|
| 167 |
+
cudnnBatchNormOps_t bnOps,
|
| 168 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 169 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 170 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 171 |
+
const cudnnTensorDescriptor_t dzDesc,
|
| 172 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 173 |
+
const cudnnTensorDescriptor_t dBnScaleBiasDesc,
|
| 174 |
+
const cudnnActivationDescriptor_t activationDesc,
|
| 175 |
+
size_t *sizeInBytes);
|
| 176 |
+
|
| 177 |
+
cudnnStatus_t CUDNNWINAPI
|
| 178 |
+
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
|
| 179 |
+
cudnnBatchNormMode_t mode,
|
| 180 |
+
cudnnBatchNormOps_t bnOps,
|
| 181 |
+
const cudnnActivationDescriptor_t activationDesc,
|
| 182 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 183 |
+
size_t *sizeInBytes);
|
| 184 |
+
|
| 185 |
+
/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
|
| 186 |
+
cudnnStatus_t CUDNNWINAPI
|
| 187 |
+
cudnnBatchNormalizationForwardTraining(
|
| 188 |
+
cudnnHandle_t handle,
|
| 189 |
+
cudnnBatchNormMode_t mode,
|
| 190 |
+
|
| 191 |
+
const void *alpha, /* alpha[0] = result blend factor */
|
| 192 |
+
const void *beta, /* beta[0] = dest layer blend factor */
|
| 193 |
+
|
| 194 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 195 |
+
const void *x, /* NxCxHxW */
|
| 196 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 197 |
+
void *y, /* NxCxHxW */
|
| 198 |
+
|
| 199 |
+
/* Shared desc for the next 6 tensors in the argument list.
|
| 200 |
+
Data type to be set as follows:
|
| 201 |
+
type = (typeOf(x) == double) ? double : float
|
| 202 |
+
Dimensions for this descriptor depend on normalization mode
|
| 203 |
+
- Spatial Normalization : tensors are expected to have dims 1xCx1x1
|
| 204 |
+
(normalization is performed across NxHxW)
|
| 205 |
+
- Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
|
| 206 |
+
(normalization is performed across N) */
|
| 207 |
+
const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
|
| 208 |
+
|
| 209 |
+
/* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
|
| 210 |
+
const void *bnScale,
|
| 211 |
+
const void *bnBias,
|
| 212 |
+
|
| 213 |
+
/* MUST use factor=1 in the very first call of a complete training cycle.
|
| 214 |
+
Use a factor=1/(1+n) at N-th call to the function to get
|
| 215 |
+
Cumulative Moving Average (CMA) behavior
|
| 216 |
+
CMA[n] = (x[1]+...+x[n])/n
|
| 217 |
+
Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
|
| 218 |
+
((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
|
| 219 |
+
CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
|
| 220 |
+
double exponentialAverageFactor,
|
| 221 |
+
|
| 222 |
+
/* Used in Training phase only.
|
| 223 |
+
runningMean = newMean*factor + runningMean*(1-factor) */
|
| 224 |
+
void *resultRunningMean,
|
| 225 |
+
/* Output in training mode, input in inference. Is the moving average
|
| 226 |
+
of variance[x] (factor is applied in the same way as for runningMean) */
|
| 227 |
+
void *resultRunningVariance,
|
| 228 |
+
|
| 229 |
+
/* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
|
| 230 |
+
double epsilon,
|
| 231 |
+
|
| 232 |
+
/* Optionally save intermediate results from the forward pass here
|
| 233 |
+
- can be reused to speed up backward pass. NULL if unused */
|
| 234 |
+
void *resultSaveMean,
|
| 235 |
+
void *resultSaveInvVariance);
|
| 236 |
+
|
| 237 |
+
/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
|
| 238 |
+
cudnnStatus_t CUDNNWINAPI
|
| 239 |
+
cudnnBatchNormalizationForwardTrainingEx(
|
| 240 |
+
cudnnHandle_t handle,
|
| 241 |
+
cudnnBatchNormMode_t mode,
|
| 242 |
+
cudnnBatchNormOps_t bnOps,
|
| 243 |
+
|
| 244 |
+
const void *alpha, /* alpha[0] = result blend factor */
|
| 245 |
+
const void *beta, /* beta[0] = dest layer blend factor */
|
| 246 |
+
|
| 247 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 248 |
+
const void *xData,
|
| 249 |
+
const cudnnTensorDescriptor_t zDesc,
|
| 250 |
+
const void *zData,
|
| 251 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 252 |
+
void *yData,
|
| 253 |
+
|
| 254 |
+
const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
|
| 255 |
+
const void *bnScale,
|
| 256 |
+
const void *bnBias,
|
| 257 |
+
|
| 258 |
+
double exponentialAverageFactor,
|
| 259 |
+
void *resultRunningMean,
|
| 260 |
+
void *resultRunningVariance,
|
| 261 |
+
|
| 262 |
+
/* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
|
| 263 |
+
double epsilon,
|
| 264 |
+
|
| 265 |
+
/* Optionally save intermediate results from the forward pass here
|
| 266 |
+
- can be reused to speed up backward pass. NULL if unused */
|
| 267 |
+
void *resultSaveMean,
|
| 268 |
+
void *resultSaveInvVariance,
|
| 269 |
+
|
| 270 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 271 |
+
void *workspace,
|
| 272 |
+
size_t workSpaceSizeInBytes,
|
| 273 |
+
void *reserveSpace,
|
| 274 |
+
size_t reserveSpaceSizeInBytes);
|
| 275 |
+
|
| 276 |
+
/* Performs backward pass of Batch Normalization layer. Returns x gradient,
|
| 277 |
+
* bnScale gradient and bnBias gradient */
|
| 278 |
+
cudnnStatus_t CUDNNWINAPI
|
| 279 |
+
cudnnBatchNormalizationBackward(cudnnHandle_t handle,
|
| 280 |
+
cudnnBatchNormMode_t mode,
|
| 281 |
+
const void *alphaDataDiff,
|
| 282 |
+
const void *betaDataDiff,
|
| 283 |
+
const void *alphaParamDiff,
|
| 284 |
+
const void *betaParamDiff,
|
| 285 |
+
const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
|
| 286 |
+
const void *x,
|
| 287 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 288 |
+
const void *dy,
|
| 289 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 290 |
+
void *dx,
|
| 291 |
+
/* Shared tensor desc for the 4 tensors below */
|
| 292 |
+
const cudnnTensorDescriptor_t dBnScaleBiasDesc,
|
| 293 |
+
const void *bnScale, /* bnBias doesn't affect backpropagation */
|
| 294 |
+
/* scale and bias diff are not backpropagated below this layer */
|
| 295 |
+
void *dBnScaleResult,
|
| 296 |
+
void *dBnBiasResult,
|
| 297 |
+
/* Same epsilon as forward pass */
|
| 298 |
+
double epsilon,
|
| 299 |
+
|
| 300 |
+
/* Optionally cached intermediate results from
|
| 301 |
+
forward pass */
|
| 302 |
+
const void *savedMean,
|
| 303 |
+
const void *savedInvVariance);
|
| 304 |
+
|
| 305 |
+
cudnnStatus_t CUDNNWINAPI
|
| 306 |
+
cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
|
| 307 |
+
cudnnBatchNormMode_t mode,
|
| 308 |
+
cudnnBatchNormOps_t bnOps,
|
| 309 |
+
|
| 310 |
+
const void *alphaDataDiff,
|
| 311 |
+
const void *betaDataDiff,
|
| 312 |
+
const void *alphaParamDiff,
|
| 313 |
+
const void *betaParamDiff,
|
| 314 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 315 |
+
const void *xData,
|
| 316 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 317 |
+
const void *yData,
|
| 318 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 319 |
+
const void *dyData,
|
| 320 |
+
const cudnnTensorDescriptor_t dzDesc,
|
| 321 |
+
void *dzData,
|
| 322 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 323 |
+
void *dxData,
|
| 324 |
+
|
| 325 |
+
/* Shared tensor desc for the 4 tensors below */
|
| 326 |
+
const cudnnTensorDescriptor_t dBnScaleBiasDesc,
|
| 327 |
+
const void *bnScaleData,
|
| 328 |
+
const void *bnBiasData, /* needed if there is activation */
|
| 329 |
+
void *dBnScaleData,
|
| 330 |
+
void *dBnBiasData,
|
| 331 |
+
double epsilon, /* Same epsilon as forward pass */
|
| 332 |
+
|
| 333 |
+
/* Optionally cached intermediate results from
|
| 334 |
+
forward pass */
|
| 335 |
+
const void *savedMean,
|
| 336 |
+
const void *savedInvVariance,
|
| 337 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 338 |
+
void *workSpace,
|
| 339 |
+
size_t workSpaceSizeInBytes,
|
| 340 |
+
void *reserveSpace,
|
| 341 |
+
size_t reserveSpaceSizeInBytes);
|
| 342 |
+
|
| 343 |
+
cudnnStatus_t CUDNNWINAPI
|
| 344 |
+
cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
|
| 345 |
+
cudnnNormMode_t mode,
|
| 346 |
+
cudnnNormOps_t normOps,
|
| 347 |
+
cudnnNormAlgo_t algo,
|
| 348 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 349 |
+
const cudnnTensorDescriptor_t zDesc,
|
| 350 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 351 |
+
const cudnnTensorDescriptor_t normScaleBiasDesc,
|
| 352 |
+
const cudnnActivationDescriptor_t activationDesc,
|
| 353 |
+
const cudnnTensorDescriptor_t normMeanVarDesc,
|
| 354 |
+
size_t *sizeInBytes,
|
| 355 |
+
int groupCnt); /* Place hold for future work, should be set to 1 now*/
|
| 356 |
+
|
| 357 |
+
cudnnStatus_t CUDNNWINAPI
|
| 358 |
+
cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
|
| 359 |
+
cudnnNormMode_t mode,
|
| 360 |
+
cudnnNormOps_t normOps,
|
| 361 |
+
cudnnNormAlgo_t algo,
|
| 362 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 363 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 364 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 365 |
+
const cudnnTensorDescriptor_t dzDesc,
|
| 366 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 367 |
+
const cudnnTensorDescriptor_t dNormScaleBiasDesc,
|
| 368 |
+
const cudnnActivationDescriptor_t activationDesc,
|
| 369 |
+
const cudnnTensorDescriptor_t normMeanVarDesc,
|
| 370 |
+
size_t *sizeInBytes,
|
| 371 |
+
int groupCnt); /* Place hold for future work, should be set to 1 now*/
|
| 372 |
+
|
| 373 |
+
cudnnStatus_t CUDNNWINAPI
|
| 374 |
+
cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
|
| 375 |
+
cudnnNormMode_t mode,
|
| 376 |
+
cudnnNormOps_t normOps,
|
| 377 |
+
cudnnNormAlgo_t algo,
|
| 378 |
+
const cudnnActivationDescriptor_t activationDesc,
|
| 379 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 380 |
+
size_t *sizeInBytes,
|
| 381 |
+
int groupCnt); /* Place hold for future work, should be set to 1 now*/
|
| 382 |
+
|
| 383 |
+
/* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
|
| 384 |
+
cudnnStatus_t CUDNNWINAPI
|
| 385 |
+
cudnnNormalizationForwardTraining(cudnnHandle_t handle,
|
| 386 |
+
cudnnNormMode_t mode,
|
| 387 |
+
cudnnNormOps_t normOps,
|
| 388 |
+
cudnnNormAlgo_t algo,
|
| 389 |
+
const void *alpha, /* alpha[0] = result blend factor */
|
| 390 |
+
const void *beta, /* beta[0] = dest layer blend factor */
|
| 391 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 392 |
+
const void *xData,
|
| 393 |
+
const cudnnTensorDescriptor_t normScaleBiasDesc,
|
| 394 |
+
const void *normScale,
|
| 395 |
+
const void *normBias,
|
| 396 |
+
double exponentialAverageFactor,
|
| 397 |
+
const cudnnTensorDescriptor_t normMeanVarDesc,
|
| 398 |
+
void *resultRunningMean,
|
| 399 |
+
void *resultRunningVariance,
|
| 400 |
+
/* Has to be >= 0. Should be the same in forward and backward functions. */
|
| 401 |
+
double epsilon,
|
| 402 |
+
/* Optionally save intermediate results from the forward pass here
|
| 403 |
+
- can be reused to speed up backward pass. NULL if unused */
|
| 404 |
+
void *resultSaveMean,
|
| 405 |
+
void *resultSaveInvVariance,
|
| 406 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 407 |
+
const cudnnTensorDescriptor_t zDesc,
|
| 408 |
+
const void *zData,
|
| 409 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 410 |
+
void *yData,
|
| 411 |
+
void *workspace,
|
| 412 |
+
size_t workSpaceSizeInBytes,
|
| 413 |
+
void *reserveSpace,
|
| 414 |
+
size_t reserveSpaceSizeInBytes,
|
| 415 |
+
int groupCnt); /* Place hold for future work, should be set to 1 now*/
|
| 416 |
+
|
| 417 |
+
cudnnStatus_t CUDNNWINAPI
|
| 418 |
+
cudnnNormalizationBackward(cudnnHandle_t handle,
|
| 419 |
+
cudnnNormMode_t mode,
|
| 420 |
+
cudnnNormOps_t normOps,
|
| 421 |
+
cudnnNormAlgo_t algo,
|
| 422 |
+
const void *alphaDataDiff,
|
| 423 |
+
const void *betaDataDiff,
|
| 424 |
+
const void *alphaParamDiff,
|
| 425 |
+
const void *betaParamDiff,
|
| 426 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 427 |
+
const void *xData,
|
| 428 |
+
const cudnnTensorDescriptor_t yDesc,
|
| 429 |
+
const void *yData,
|
| 430 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 431 |
+
const void *dyData,
|
| 432 |
+
const cudnnTensorDescriptor_t dzDesc,
|
| 433 |
+
void *dzData,
|
| 434 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 435 |
+
void *dxData,
|
| 436 |
+
/* Shared tensor desc for the 4 tensors below */
|
| 437 |
+
const cudnnTensorDescriptor_t dNormScaleBiasDesc,
|
| 438 |
+
const void *normScaleData,
|
| 439 |
+
const void *normBiasData, /* needed if there is activation */
|
| 440 |
+
void *dNormScaleData,
|
| 441 |
+
void *dNormBiasData,
|
| 442 |
+
double epsilon, /* Same epsilon as forward pass */
|
| 443 |
+
const cudnnTensorDescriptor_t normMeanVarDesc,
|
| 444 |
+
/* Optionally cached intermediate results from
|
| 445 |
+
forward pass */
|
| 446 |
+
const void *savedMean,
|
| 447 |
+
const void *savedInvVariance,
|
| 448 |
+
cudnnActivationDescriptor_t activationDesc,
|
| 449 |
+
void *workSpace,
|
| 450 |
+
size_t workSpaceSizeInBytes,
|
| 451 |
+
void *reserveSpace,
|
| 452 |
+
size_t reserveSpaceSizeInBytes,
|
| 453 |
+
int groupCnt); /* Place hold for future work, should be set to 1 now*/
|
| 454 |
+
|
| 455 |
+
cudnnStatus_t CUDNNWINAPI
|
| 456 |
+
cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
|
| 457 |
+
const cudnnSpatialTransformerDescriptor_t stDesc,
|
| 458 |
+
const void *dgrid,
|
| 459 |
+
void *dtheta);
|
| 460 |
+
|
| 461 |
+
cudnnStatus_t CUDNNWINAPI
|
| 462 |
+
cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
|
| 463 |
+
cudnnSpatialTransformerDescriptor_t stDesc,
|
| 464 |
+
const void *alpha,
|
| 465 |
+
const cudnnTensorDescriptor_t xDesc,
|
| 466 |
+
const void *x,
|
| 467 |
+
const void *beta,
|
| 468 |
+
const cudnnTensorDescriptor_t dxDesc,
|
| 469 |
+
void *dx,
|
| 470 |
+
const void *alphaDgrid,
|
| 471 |
+
const cudnnTensorDescriptor_t dyDesc,
|
| 472 |
+
const void *dy,
|
| 473 |
+
const void *grid,
|
| 474 |
+
const void *betaDgrid,
|
| 475 |
+
void *dgrid);
|
| 476 |
+
|
| 477 |
+
cudnnStatus_t CUDNNWINAPI
|
| 478 |
+
cudnnDropoutBackward(cudnnHandle_t handle,
|
| 479 |
+
const cudnnDropoutDescriptor_t dropoutDesc,
|
| 480 |
+
const cudnnTensorDescriptor_t dydesc,
|
| 481 |
+
const void *dy,
|
| 482 |
+
const cudnnTensorDescriptor_t dxdesc,
|
| 483 |
+
void *dx,
|
| 484 |
+
void *reserveSpace,
|
| 485 |
+
size_t reserveSpaceSizeInBytes);
|
| 486 |
+
|
| 487 |
+
/*
|
| 488 |
+
* \brief Cross-library version checker.
|
| 489 |
+
* This function is implemented differently in each sub-library. Each sublib
|
| 490 |
+
* checks whether its own version matches that of its dependencies.
|
| 491 |
+
* \returns CUDNN_STATUS_SUCCESS if the version check passes,
|
| 492 |
+
* CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
|
| 493 |
+
*/
|
| 494 |
+
cudnnStatus_t CUDNNWINAPI
|
| 495 |
+
cudnnOpsTrainVersionCheck(void);
|
| 496 |
+
|
| 497 |
+
#if defined(__cplusplus)
|
| 498 |
+
}
|
| 499 |
+
#endif
|
| 500 |
+
|
| 501 |
+
#endif /* CUDNN_OPS_TRAIN_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (217 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 2005-2021 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
/*!
|
| 50 |
+
* \file cufft.h
|
| 51 |
+
* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
|
| 52 |
+
*/
|
| 53 |
+
|
| 54 |
+
#ifndef _CUFFT_H_
|
| 55 |
+
#define _CUFFT_H_
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
#include "cuComplex.h"
|
| 59 |
+
#include "driver_types.h"
|
| 60 |
+
#include "library_types.h"
|
| 61 |
+
|
| 62 |
+
#ifndef CUFFTAPI
|
| 63 |
+
#ifdef _WIN32
|
| 64 |
+
#define CUFFTAPI __stdcall
|
| 65 |
+
#elif __GNUC__ >= 4
|
| 66 |
+
#define CUFFTAPI __attribute__ ((visibility ("default")))
|
| 67 |
+
#else
|
| 68 |
+
#define CUFFTAPI
|
| 69 |
+
#endif
|
| 70 |
+
#endif
|
| 71 |
+
|
| 72 |
+
#ifdef __cplusplus
|
| 73 |
+
extern "C" {
|
| 74 |
+
#endif
|
| 75 |
+
|
| 76 |
+
#define CUFFT_VER_MAJOR 10
|
| 77 |
+
#define CUFFT_VER_MINOR 9
|
| 78 |
+
#define CUFFT_VER_PATCH 0
|
| 79 |
+
#define CUFFT_VER_BUILD 58
|
| 80 |
+
|
| 81 |
+
// cuFFT library version
|
| 82 |
+
//
|
| 83 |
+
// CUFFT_VERSION / 1000 - major version
|
| 84 |
+
// CUFFT_VERSION / 100 % 100 - minor version
|
| 85 |
+
// CUFFT_VERSION % 100 - patch level
|
| 86 |
+
#define CUFFT_VERSION 10900
|
| 87 |
+
|
| 88 |
+
// CUFFT API function return values
|
| 89 |
+
typedef enum cufftResult_t {
|
| 90 |
+
CUFFT_SUCCESS = 0x0,
|
| 91 |
+
CUFFT_INVALID_PLAN = 0x1,
|
| 92 |
+
CUFFT_ALLOC_FAILED = 0x2,
|
| 93 |
+
CUFFT_INVALID_TYPE = 0x3,
|
| 94 |
+
CUFFT_INVALID_VALUE = 0x4,
|
| 95 |
+
CUFFT_INTERNAL_ERROR = 0x5,
|
| 96 |
+
CUFFT_EXEC_FAILED = 0x6,
|
| 97 |
+
CUFFT_SETUP_FAILED = 0x7,
|
| 98 |
+
CUFFT_INVALID_SIZE = 0x8,
|
| 99 |
+
CUFFT_UNALIGNED_DATA = 0x9,
|
| 100 |
+
CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
|
| 101 |
+
CUFFT_INVALID_DEVICE = 0xB,
|
| 102 |
+
CUFFT_PARSE_ERROR = 0xC,
|
| 103 |
+
CUFFT_NO_WORKSPACE = 0xD,
|
| 104 |
+
CUFFT_NOT_IMPLEMENTED = 0xE,
|
| 105 |
+
CUFFT_LICENSE_ERROR = 0x0F,
|
| 106 |
+
CUFFT_NOT_SUPPORTED = 0x10
|
| 107 |
+
|
| 108 |
+
} cufftResult;
|
| 109 |
+
|
| 110 |
+
#define MAX_CUFFT_ERROR 0x11
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
// CUFFT defines and supports the following data types
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
// cufftReal is a single-precision, floating-point real data type.
|
| 117 |
+
// cufftDoubleReal is a double-precision, real data type.
|
| 118 |
+
typedef float cufftReal;
|
| 119 |
+
typedef double cufftDoubleReal;
|
| 120 |
+
|
| 121 |
+
// cufftComplex is a single-precision, floating-point complex data type that
|
| 122 |
+
// consists of interleaved real and imaginary components.
|
| 123 |
+
// cufftDoubleComplex is the double-precision equivalent.
|
| 124 |
+
typedef cuComplex cufftComplex;
|
| 125 |
+
typedef cuDoubleComplex cufftDoubleComplex;
|
| 126 |
+
|
| 127 |
+
// CUFFT transform directions
|
| 128 |
+
#define CUFFT_FORWARD -1 // Forward FFT
|
| 129 |
+
#define CUFFT_INVERSE 1 // Inverse FFT
|
| 130 |
+
|
| 131 |
+
// CUFFT supports the following transform types
|
| 132 |
+
typedef enum cufftType_t {
|
| 133 |
+
CUFFT_R2C = 0x2a, // Real to Complex (interleaved)
|
| 134 |
+
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
|
| 135 |
+
CUFFT_C2C = 0x29, // Complex to Complex, interleaved
|
| 136 |
+
CUFFT_D2Z = 0x6a, // Double to Double-Complex
|
| 137 |
+
CUFFT_Z2D = 0x6c, // Double-Complex to Double
|
| 138 |
+
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
|
| 139 |
+
} cufftType;
|
| 140 |
+
|
| 141 |
+
// CUFFT supports the following data layouts
|
| 142 |
+
typedef enum cufftCompatibility_t {
|
| 143 |
+
CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 // The default value
|
| 144 |
+
} cufftCompatibility;
|
| 145 |
+
|
| 146 |
+
#define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING
|
| 147 |
+
|
| 148 |
+
//
|
| 149 |
+
// structure definition used by the shim between old and new APIs
|
| 150 |
+
//
|
| 151 |
+
#define MAX_SHIM_RANK 3
|
| 152 |
+
|
| 153 |
+
// cufftHandle is a handle type used to store and access CUFFT plans.
|
| 154 |
+
typedef int cufftHandle;
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
|
| 158 |
+
int nx,
|
| 159 |
+
cufftType type,
|
| 160 |
+
int batch);
|
| 161 |
+
|
| 162 |
+
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
|
| 163 |
+
int nx, int ny,
|
| 164 |
+
cufftType type);
|
| 165 |
+
|
| 166 |
+
cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
|
| 167 |
+
int nx, int ny, int nz,
|
| 168 |
+
cufftType type);
|
| 169 |
+
|
| 170 |
+
cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
|
| 171 |
+
int rank,
|
| 172 |
+
int *n,
|
| 173 |
+
int *inembed, int istride, int idist,
|
| 174 |
+
int *onembed, int ostride, int odist,
|
| 175 |
+
cufftType type,
|
| 176 |
+
int batch);
|
| 177 |
+
|
| 178 |
+
cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan,
|
| 179 |
+
int nx,
|
| 180 |
+
cufftType type,
|
| 181 |
+
int batch,
|
| 182 |
+
size_t *workSize);
|
| 183 |
+
|
| 184 |
+
cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan,
|
| 185 |
+
int nx, int ny,
|
| 186 |
+
cufftType type,
|
| 187 |
+
size_t *workSize);
|
| 188 |
+
|
| 189 |
+
cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan,
|
| 190 |
+
int nx, int ny, int nz,
|
| 191 |
+
cufftType type,
|
| 192 |
+
size_t *workSize);
|
| 193 |
+
|
| 194 |
+
cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
|
| 195 |
+
int rank,
|
| 196 |
+
int *n,
|
| 197 |
+
int *inembed, int istride, int idist,
|
| 198 |
+
int *onembed, int ostride, int odist,
|
| 199 |
+
cufftType type,
|
| 200 |
+
int batch,
|
| 201 |
+
size_t *workSize);
|
| 202 |
+
|
| 203 |
+
cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan,
|
| 204 |
+
int rank,
|
| 205 |
+
long long int *n,
|
| 206 |
+
long long int *inembed,
|
| 207 |
+
long long int istride,
|
| 208 |
+
long long int idist,
|
| 209 |
+
long long int *onembed,
|
| 210 |
+
long long int ostride, long long int odist,
|
| 211 |
+
cufftType type,
|
| 212 |
+
long long int batch,
|
| 213 |
+
size_t * workSize);
|
| 214 |
+
|
| 215 |
+
cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
|
| 216 |
+
int rank,
|
| 217 |
+
long long int *n,
|
| 218 |
+
long long int *inembed,
|
| 219 |
+
long long int istride, long long int idist,
|
| 220 |
+
long long int *onembed,
|
| 221 |
+
long long int ostride, long long int odist,
|
| 222 |
+
cufftType type,
|
| 223 |
+
long long int batch,
|
| 224 |
+
size_t *workSize);
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
cufftResult CUFFTAPI cufftEstimate1d(int nx,
|
| 230 |
+
cufftType type,
|
| 231 |
+
int batch,
|
| 232 |
+
size_t *workSize);
|
| 233 |
+
|
| 234 |
+
cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
|
| 235 |
+
cufftType type,
|
| 236 |
+
size_t *workSize);
|
| 237 |
+
|
| 238 |
+
cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz,
|
| 239 |
+
cufftType type,
|
| 240 |
+
size_t *workSize);
|
| 241 |
+
|
| 242 |
+
cufftResult CUFFTAPI cufftEstimateMany(int rank,
|
| 243 |
+
int *n,
|
| 244 |
+
int *inembed, int istride, int idist,
|
| 245 |
+
int *onembed, int ostride, int odist,
|
| 246 |
+
cufftType type,
|
| 247 |
+
int batch,
|
| 248 |
+
size_t *workSize);
|
| 249 |
+
|
| 250 |
+
cufftResult CUFFTAPI cufftCreate(cufftHandle * handle);
|
| 251 |
+
|
| 252 |
+
cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle,
|
| 253 |
+
int nx,
|
| 254 |
+
cufftType type,
|
| 255 |
+
int batch,
|
| 256 |
+
size_t *workSize );
|
| 257 |
+
|
| 258 |
+
cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle,
|
| 259 |
+
int nx, int ny,
|
| 260 |
+
cufftType type,
|
| 261 |
+
size_t *workSize);
|
| 262 |
+
|
| 263 |
+
cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
|
| 264 |
+
int nx, int ny, int nz,
|
| 265 |
+
cufftType type,
|
| 266 |
+
size_t *workSize);
|
| 267 |
+
|
| 268 |
+
cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle,
|
| 269 |
+
int rank, int *n,
|
| 270 |
+
int *inembed, int istride, int idist,
|
| 271 |
+
int *onembed, int ostride, int odist,
|
| 272 |
+
cufftType type, int batch, size_t *workArea);
|
| 273 |
+
|
| 274 |
+
cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize);
|
| 275 |
+
|
| 276 |
+
cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea);
|
| 277 |
+
|
| 278 |
+
cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate);
|
| 279 |
+
|
| 280 |
+
cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
|
| 281 |
+
cufftComplex *idata,
|
| 282 |
+
cufftComplex *odata,
|
| 283 |
+
int direction);
|
| 284 |
+
|
| 285 |
+
cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
|
| 286 |
+
cufftReal *idata,
|
| 287 |
+
cufftComplex *odata);
|
| 288 |
+
|
| 289 |
+
cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
|
| 290 |
+
cufftComplex *idata,
|
| 291 |
+
cufftReal *odata);
|
| 292 |
+
|
| 293 |
+
cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
|
| 294 |
+
cufftDoubleComplex *idata,
|
| 295 |
+
cufftDoubleComplex *odata,
|
| 296 |
+
int direction);
|
| 297 |
+
|
| 298 |
+
cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
|
| 299 |
+
cufftDoubleReal *idata,
|
| 300 |
+
cufftDoubleComplex *odata);
|
| 301 |
+
|
| 302 |
+
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
|
| 303 |
+
cufftDoubleComplex *idata,
|
| 304 |
+
cufftDoubleReal *odata);
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
// utility functions
|
| 308 |
+
cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
|
| 309 |
+
cudaStream_t stream);
|
| 310 |
+
|
| 311 |
+
cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
|
| 312 |
+
|
| 313 |
+
cufftResult CUFFTAPI cufftGetVersion(int *version);
|
| 314 |
+
|
| 315 |
+
cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
|
| 316 |
+
int *value);
|
| 317 |
+
|
| 318 |
+
#ifdef __cplusplus
|
| 319 |
+
}
|
| 320 |
+
#endif
|
| 321 |
+
|
| 322 |
+
#endif /* _CUFFT_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (214 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (222 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#if !defined(CURANDDISCRETE_H_)
|
| 50 |
+
#define CURANDDISCRETE_H_
|
| 51 |
+
|
| 52 |
+
struct curandDistributionShift_st {
|
| 53 |
+
curandDistribution_t probability;
|
| 54 |
+
curandDistribution_t host_probability;
|
| 55 |
+
unsigned int shift;
|
| 56 |
+
unsigned int length;
|
| 57 |
+
unsigned int host_gen;
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
struct curandHistogramM2_st {
|
| 61 |
+
curandHistogramM2V_t V;
|
| 62 |
+
curandHistogramM2V_t host_V;
|
| 63 |
+
curandHistogramM2K_t K;
|
| 64 |
+
curandHistogramM2K_t host_K;
|
| 65 |
+
unsigned int host_gen;
|
| 66 |
+
};
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
struct curandDistributionM2Shift_st {
|
| 70 |
+
curandHistogramM2_t histogram;
|
| 71 |
+
curandHistogramM2_t host_histogram;
|
| 72 |
+
unsigned int shift;
|
| 73 |
+
unsigned int length;
|
| 74 |
+
unsigned int host_gen;
|
| 75 |
+
};
|
| 76 |
+
|
| 77 |
+
struct curandDiscreteDistribution_st {
|
| 78 |
+
curandDiscreteDistribution_t self_host_ptr;
|
| 79 |
+
curandDistributionM2Shift_t M2;
|
| 80 |
+
curandDistributionM2Shift_t host_M2;
|
| 81 |
+
double stddev;
|
| 82 |
+
double mean;
|
| 83 |
+
curandMethod_t method;
|
| 84 |
+
unsigned int host_gen;
|
| 85 |
+
};
|
| 86 |
+
|
| 87 |
+
#endif // !defined(CURANDDISCRETE_H_)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
#if !defined(CURAND_DISCRETE_H_)
|
| 52 |
+
#define CURAND_DISCRETE_H_
|
| 53 |
+
|
| 54 |
+
/**
|
| 55 |
+
* \defgroup DEVICE Device API
|
| 56 |
+
*
|
| 57 |
+
* @{
|
| 58 |
+
*/
|
| 59 |
+
|
| 60 |
+
#ifndef __CUDACC_RTC__
|
| 61 |
+
#include <math.h>
|
| 62 |
+
#endif // __CUDACC_RTC__
|
| 63 |
+
|
| 64 |
+
#include "curand_mrg32k3a.h"
|
| 65 |
+
#include "curand_mtgp32_kernel.h"
|
| 66 |
+
#include "curand_philox4x32_x.h"
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
/*
 * Map a uniform sample x to a discrete-distribution value.
 *
 * Dispatches on the distribution's method: either an exact inversion via the
 * precomputed M2 table, or a normal (Gaussian) approximation using the
 * inverse CDF of x.
 */
template <typename T>
QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){
    /* Exact table-based inversion. */
    if (discrete_distribution->method == CURAND_M2){
        return _curand_M2_double(x, discrete_distribution->M2);
    }
    /* Normal approximation: scale/shift the inverse-CDF sample; the +0.5
       before the truncating cast rounds to the nearest unsigned int. */
    double approx = (discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5;
    return (unsigned int)approx;
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
/*
 * Draw one discrete-distribution value directly from a generator state.
 *
 * Dispatches on the distribution's method: either an exact inversion via the
 * precomputed M2 table, or a normal approximation using one normal draw.
 */
template <typename STATE>
QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){
    /* Exact table-based inversion. */
    if (discrete_distribution->method == CURAND_M2){
        return curand_M2_double(state, discrete_distribution->M2);
    }
    /* Normal approximation; +0.5 before the truncating cast rounds to nearest. */
    double approx = (discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5;
    return (unsigned int)approx;
}
|
| 85 |
+
|
| 86 |
+
/*
 * Draw four discrete-distribution values at once from a generator state.
 *
 * Dispatches on the distribution's method: either four exact M2 table
 * inversions, or one 4-wide normal draw scaled/shifted per lane.
 */
template <typename STATE>
QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){
    /* Exact table-based inversion, four lanes at once. */
    if (discrete_distribution->method == CURAND_M2){
        return curand_M2_double4(state, discrete_distribution->M2);
    }
    /* Normal approximation: one 4-wide normal draw; each lane is scaled,
       shifted, and rounded to nearest via the +0.5 before the cast. */
    double4 normals = curand_normal4_double(state);
    uint4 out;
    out.x = (unsigned int)((discrete_distribution->stddev * normals.x) + discrete_distribution->mean + 0.5);
    out.y = (unsigned int)((discrete_distribution->stddev * normals.y) + discrete_distribution->mean + 0.5);
    out.z = (unsigned int)((discrete_distribution->stddev * normals.z) + discrete_distribution->mean + 0.5);
    out.w = (unsigned int)((discrete_distribution->stddev * normals.w) + discrete_distribution->mean + 0.5);
    return out;
}
|
| 100 |
+
|
| 101 |
+
/*
 * \brief Return a discrete distributed unsigned int from a XORWOW generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the XORWOW generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 117 |
+
|
| 118 |
+
/*
 * \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 134 |
+
|
| 135 |
+
/*
 * \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator.
 *
 * Return four discrete distributed unsigned ints derived from a
 * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
 * increment position of generator by four (one 4-wide draw).
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return uint4 of four unsigned ints distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common 4-wide implementation. */
    return curand__discrete4(state, discrete_distribution);
}
|
| 151 |
+
/*
 * \brief Return a discrete distributed unsigned int from a MRG32k3a generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 167 |
+
|
| 168 |
+
/*
 * \brief Return a discrete distributed unsigned int from a MTGP32 generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the MTGP32 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 184 |
+
|
| 185 |
+
/*
 * \brief Return a discrete distributed unsigned int from a Sobol32 generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the Sobol32 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 201 |
+
|
| 202 |
+
/*
 * \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 218 |
+
|
| 219 |
+
/*
 * \brief Return a discrete distributed unsigned int from a Sobol64 generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the Sobol64 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 235 |
+
|
| 236 |
+
/*
 * \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator.
 *
 * Return a single discrete distributed unsigned int derived from a
 * distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 * \param discrete_distribution - ancillary structure for discrete distribution
 *
 * \return unsigned int distributed by distribution defined by \p discrete_distribution.
 */
QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
{
    /* Thin dispatch to the common implementation (M2 table or normal approximation). */
    return curand__discrete(state, discrete_distribution);
}
|
| 252 |
+
|
| 253 |
+
#endif // !defined(CURAND_DISCRETE_H_)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CURAND_MTGP32_H
|
| 51 |
+
#define CURAND_MTGP32_H
|
| 52 |
+
/*
|
| 53 |
+
* @file curand_mtgp32.h
|
| 54 |
+
*
|
| 55 |
+
* @brief Mersenne Twister for Graphic Processors (mtgp32), which
|
| 56 |
+
* generates 32-bit unsigned integers and single precision floating
|
| 57 |
+
* point numbers based on IEEE 754 format.
|
| 58 |
+
*
|
| 59 |
+
* @author Mutsuo Saito (Hiroshima University)
|
| 60 |
+
* @author Makoto Matsumoto (Hiroshima University)
|
| 61 |
+
*
|
| 62 |
+
*/
|
| 63 |
+
/*
|
| 64 |
+
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
|
| 65 |
+
* University. All rights reserved.
|
| 66 |
+
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
|
| 67 |
+
* University and University of Tokyo. All rights reserved.
|
| 68 |
+
*
|
| 69 |
+
* Redistribution and use in source and binary forms, with or without
|
| 70 |
+
* modification, are permitted provided that the following conditions are
|
| 71 |
+
* met:
|
| 72 |
+
*
|
| 73 |
+
* * Redistributions of source code must retain the above copyright
|
| 74 |
+
* notice, this list of conditions and the following disclaimer.
|
| 75 |
+
* * Redistributions in binary form must reproduce the above
|
| 76 |
+
* copyright notice, this list of conditions and the following
|
| 77 |
+
* disclaimer in the documentation and/or other materials provided
|
| 78 |
+
* with the distribution.
|
| 79 |
+
* * Neither the name of the Hiroshima University nor the names of
|
| 80 |
+
* its contributors may be used to endorse or promote products
|
| 81 |
+
* derived from this software without specific prior written
|
| 82 |
+
* permission.
|
| 83 |
+
*
|
| 84 |
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 85 |
+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 86 |
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 87 |
+
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 88 |
+
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 89 |
+
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 90 |
+
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 91 |
+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 92 |
+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 93 |
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 94 |
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 95 |
+
*/
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#define MTGPDC_MEXP 11213
|
| 99 |
+
#define MTGPDC_N 351
|
| 100 |
+
#define MTGPDC_FLOOR_2P 256
|
| 101 |
+
#define MTGPDC_CEIL_2P 512
|
| 102 |
+
#define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213
|
| 103 |
+
#define MTGP32_STATE_SIZE 1024
|
| 104 |
+
#define MTGP32_STATE_MASK 1023
|
| 105 |
+
#define CURAND_NUM_MTGP32_PARAMS 200
|
| 106 |
+
#define MEXP 11213
|
| 107 |
+
#define THREAD_NUM MTGPDC_FLOOR_2P
|
| 108 |
+
#define LARGE_SIZE (THREAD_NUM * 3)
|
| 109 |
+
#define TBL_SIZE 16
|
| 110 |
+
|
| 111 |
+
/**
|
| 112 |
+
* \addtogroup DEVICE Device API
|
| 113 |
+
*
|
| 114 |
+
* @{
|
| 115 |
+
*/
|
| 116 |
+
|
| 117 |
+
/*
|
| 118 |
+
* \struct MTGP32_PARAMS_FAST_T
|
| 119 |
+
* MTGP32 parameters.
|
| 120 |
+
* Some element is redundant to keep structure simple.
|
| 121 |
+
*
|
| 122 |
+
* \b pos is a pick up position which is selected to have good
|
| 123 |
+
* performance on graphic processors. 3 < \b pos < Q, where Q is a
|
| 124 |
+
* maximum number such that the size of status array - Q is a power of
|
| 125 |
+
* 2. For example, when \b mexp is 44497, size of 32-bit status array
|
| 126 |
+
* is 696, and Q is 184, then \b pos is between 4 and 183. This means
|
| 127 |
+
* 512 parallel calculations is allowed when \b mexp is 44497.
|
| 128 |
+
*
|
| 129 |
+
* \b poly_sha1 is SHA1 digest of the characteristic polynomial of
|
| 130 |
+
* state transition function. SHA1 is calculated based on printing
|
| 131 |
+
* form of the polynomial. This is important when we use parameters
|
| 132 |
+
 * generated by the dynamic creator.
|
| 133 |
+
*
|
| 134 |
+
* \b mask This is a mask to make the dimension of state space have
|
| 135 |
+
* just Mersenne Prime. This is redundant.
|
| 136 |
+
*/
|
| 137 |
+
|
| 138 |
+
struct mtgp32_params_fast;

/* One MTGP32 parameter set (see MTGP32_PARAMS_FAST_T description above). */
struct mtgp32_params_fast {
    int mexp;                      /*< Mersenne exponent. This is redundant. */
    int pos;                       /*< pick up position. */
    int sh1;                       /*< shift value 1. 0 < sh1 < 32. */
    int sh2;                      /*< shift value 2. 0 < sh2 < 32. */
    unsigned int tbl[16];         /*< a small matrix. */
    unsigned int tmp_tbl[16];     /*< a small matrix for tempering. */
    unsigned int flt_tmp_tbl[16]; /*< a small matrix for tempering and
                                      converting to float. */
    unsigned int mask;            /*< This is a mask for state space */
    unsigned char poly_sha1[21];  /*< SHA1 digest */
};
|
| 152 |
+
|
| 153 |
+
/** \cond UNHIDE_TYPEDEFS */
|
| 154 |
+
typedef struct mtgp32_params_fast mtgp32_params_fast_t;
|
| 155 |
+
/** \endcond */
|
| 156 |
+
|
| 157 |
+
/*
|
| 158 |
+
* Generator Parameters.
|
| 159 |
+
*/
|
| 160 |
+
struct mtgp32_kernel_params;
/* Device-side bundle of CURAND_NUM_MTGP32_PARAMS MTGP32 parameter sets,
 * laid out as per-field tables indexed by parameter-set id (bid/pIdx). */
struct mtgp32_kernel_params {
    unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS];                     /*< pick-up position per parameter set */
    unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];         /*< recursion matrix rows (used in para_rec) */
    unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];        /*< tempering matrix rows (used in temper) */
    unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE]; /*< tempering + float-conversion rows (used in temper_single) */
    unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS];                     /*< shift value 1 per parameter set */
    unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS];                     /*< shift value 2 per parameter set */
    unsigned int mask[1];                                               /*< state-space mask (common to all sets) */
};
|
| 170 |
+
|
| 171 |
+
/** \cond UNHIDE_TYPEDEFS */
|
| 172 |
+
typedef struct mtgp32_kernel_params mtgp32_kernel_params_t;
|
| 173 |
+
/** \endcond */
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
/*
|
| 178 |
+
* kernel I/O
|
| 179 |
+
* This structure must be initialized before first use.
|
| 180 |
+
*/
|
| 181 |
+
|
| 182 |
+
/* MTGP (Mersenne Twister) RNG */
|
| 183 |
+
/* This generator uses the Mersenne Twister algorithm of
|
| 184 |
+
* http://arxiv.org/abs/1005.4973v2
|
| 185 |
+
* Has period 2^11213.
|
| 186 |
+
*/
|
| 187 |
+
|
| 188 |
+
/**
|
| 189 |
+
* CURAND MTGP32 state
|
| 190 |
+
*/
|
| 191 |
+
struct curandStateMtgp32;

struct curandStateMtgp32 {
    unsigned int s[MTGP32_STATE_SIZE];  /* state array; presumably indexed modulo
                                           MTGP32_STATE_SIZE via MTGP32_STATE_MASK —
                                           confirm against the generation kernel */
    int offset;                         /* current position within s */
    int pIdx;                           /* which parameter set in *k this state uses */
    mtgp32_kernel_params_t * k;         /* pointer to the shared parameter tables */
};
|
| 199 |
+
|
| 200 |
+
/*
|
| 201 |
+
* CURAND MTGP32 state
|
| 202 |
+
*/
|
| 203 |
+
/** \cond UNHIDE_TYPEDEFS */
|
| 204 |
+
typedef struct curandStateMtgp32 curandStateMtgp32_t;
|
| 205 |
+
/** \endcond */
|
| 206 |
+
|
| 207 |
+
/** @} */
|
| 208 |
+
|
| 209 |
+
#endif
|
| 210 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h
ADDED
|
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*
|
| 51 |
+
* curand_mtgp32_kernel.h
|
| 52 |
+
*
|
| 53 |
+
*
|
| 54 |
+
* MTGP32-11213
|
| 55 |
+
*
|
| 56 |
+
* Mersenne Twister RNG for the GPU
|
| 57 |
+
*
|
| 58 |
+
* The period of generated integers is 2<sup>11213</sup>-1.
|
| 59 |
+
*
|
| 60 |
+
* This code generates 32-bit unsigned integers, and
|
| 61 |
+
* single precision floating point numbers uniformly distributed
|
| 62 |
+
* in the range [1, 2). (float r; 1.0 <= r < 2.0)
|
| 63 |
+
*/
|
| 64 |
+
|
| 65 |
+
/*
|
| 66 |
+
* Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
|
| 67 |
+
* University. All rights reserved.
|
| 68 |
+
* Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
|
| 69 |
+
* University and University of Tokyo. All rights reserved.
|
| 70 |
+
*
|
| 71 |
+
* Redistribution and use in source and binary forms, with or without
|
| 72 |
+
* modification, are permitted provided that the following conditions are
|
| 73 |
+
* met:
|
| 74 |
+
*
|
| 75 |
+
* * Redistributions of source code must retain the above copyright
|
| 76 |
+
* notice, this list of conditions and the following disclaimer.
|
| 77 |
+
* * Redistributions in binary form must reproduce the above
|
| 78 |
+
* copyright notice, this list of conditions and the following
|
| 79 |
+
* disclaimer in the documentation and/or other materials provided
|
| 80 |
+
* with the distribution.
|
| 81 |
+
* * Neither the name of the Hiroshima University nor the names of
|
| 82 |
+
* its contributors may be used to endorse or promote products
|
| 83 |
+
* derived from this software without specific prior written
|
| 84 |
+
* permission.
|
| 85 |
+
*
|
| 86 |
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 87 |
+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 88 |
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 89 |
+
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 90 |
+
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 91 |
+
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 92 |
+
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 93 |
+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 94 |
+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 95 |
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 96 |
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 97 |
+
*/
|
| 98 |
+
#if !defined CURAND_MTGP32_KERNEL_H
|
| 99 |
+
#define CURAND_MTGP32_KERNEL_H
|
| 100 |
+
|
| 101 |
+
#if !defined(QUALIFIERS)
|
| 102 |
+
#define QUALIFIERS static __forceinline__ __device__
|
| 103 |
+
#endif
|
| 104 |
+
|
| 105 |
+
#ifndef __CUDACC_RTC__
|
| 106 |
+
#include <cuda.h>
|
| 107 |
+
#include <stdlib.h>
|
| 108 |
+
#include <memory.h>
|
| 109 |
+
#include <string.h>
|
| 110 |
+
#endif // ifndef __CUDACC_RTC__
|
| 111 |
+
#include "curand.h"
|
| 112 |
+
#include "curand_mtgp32.h"
|
| 113 |
+
|
| 114 |
+
/**
|
| 115 |
+
* \addtogroup DEVICE Device API
|
| 116 |
+
*
|
| 117 |
+
* @{
|
| 118 |
+
*/
|
| 119 |
+
|
| 120 |
+
#ifndef __CUDA_ARCH__
|
| 121 |
+
// define blockDim and threadIdx for host compatibility call
|
| 122 |
+
extern const dim3 blockDim;
|
| 123 |
+
extern const uint3 threadIdx;
|
| 124 |
+
#endif
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
/*
|
| 128 |
+
* The function of the recursion formula calculation.
|
| 129 |
+
*
|
| 130 |
+
* @param[in] X1 the farthest part of state array.
|
| 131 |
+
* @param[in] X2 the second farthest part of state array.
|
| 132 |
+
* @param[in] Y a part of state array.
|
| 133 |
+
* @param[in] bid block id.
|
| 134 |
+
* @return output
|
| 135 |
+
*/
|
| 136 |
+
/*
 * The function of the recursion formula calculation.
 *
 * @param[in] k   generator parameter tables.
 * @param[in] X1  the farthest part of state array.
 * @param[in] X2  the second farthest part of state array.
 * @param[in] Y   a part of state array.
 * @param[in] bid block id (selects the parameter set).
 * @return output
 */
QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
    /* Combine the two oldest words, masking X1 to the effective state space. */
    unsigned int X = (X1 & k->mask[0]) ^ X2;
    /* Two data-dependent shifts from this parameter set... */
    X ^= X << k->sh1_tbl[bid];
    Y = X ^ (Y >> k->sh2_tbl[bid]);
    /* ...then XOR with the matrix row selected by the low nibble of Y. */
    return Y ^ k->param_tbl[bid][Y & 0x0f];
}
|
| 145 |
+
|
| 146 |
+
/*
|
| 147 |
+
* The tempering function.
|
| 148 |
+
*
|
| 149 |
+
* @param[in] V the output value should be tempered.
|
| 150 |
+
* @param[in] T the tempering helper value.
|
| 151 |
+
* @param[in] bid block id.
|
| 152 |
+
* @return the tempered value.
|
| 153 |
+
*/
|
| 154 |
+
/*
 * The tempering function.
 *
 * @param[in] k   generator parameter tables.
 * @param[in] V   the output value to be tempered.
 * @param[in] T   the tempering helper value.
 * @param[in] bid block id (selects the parameter set).
 * @return the tempered value.
 */
QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
    /* Fold T so its low nibble mixes in all of its bytes. */
    T ^= T >> 16;
    T ^= T >> 8;
    /* XOR V with the tempering matrix row selected by that nibble. */
    return V ^ k->temper_tbl[bid][T & 0x0f];
}
|
| 162 |
+
|
| 163 |
+
/*
 * MTGP32 tempering combined with conversion toward IEEE-754 single format.
 *
 * By using the preset single-precision table, tempering and the float
 * conversion are done in one lookup.
 *
 * @param[in] k   kernel parameter tables.
 * @param[in] V   the output value to be tempered.
 * @param[in] T   the tempering helper value.
 * @param[in] bid block id.
 * @return the tempered and converted value (a float bit pattern).
 */
QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
    // Fold the helper value so its low 4 bits pick the table entry.
    T ^= T >> 16;
    T ^= T >> 8;
    // Shift V into the mantissa bits, then XOR in the combined
    // tempering/exponent bits from the single-precision table.
    return (V >> 9) ^ k->single_temper_tbl[bid][T & 0x0f];
}
|
| 183 |
+
|
| 184 |
+
/**
 * \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
 *
 * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
 * increment position of generator by the number of threads in the block.
 * Note the number of threads in the block can not exceed 256.
 *
 * \param state - Pointer to state to update
 *
 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
 */
QUALIFIERS unsigned int curand(curandStateMtgp32_t *state)
{
    const int pos = state->k->pos_tbl[state->pIdx];
    // Total threads in the block and this thread's flattened index.
    const unsigned int nthreads = blockDim.z * blockDim.y * blockDim.x;
    //assert( nthreads <= 256 );
    const unsigned int tid = (blockDim.z * blockDim.y * threadIdx.z)
                           + (blockDim.x * threadIdx.y) + threadIdx.x;

    // Advance this thread's slot of the circular state array.
    unsigned int rec = para_rec(state->k,
                                state->s[(tid + state->offset) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + 1) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + pos) & MTGP32_STATE_MASK],
                                state->pIdx);

    state->s[(tid + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = rec;
    unsigned int out = temper(state->k, rec,
                              state->s[(tid + state->offset + pos - 1) & MTGP32_STATE_MASK],
                              state->pIdx);
#if __CUDA_ARCH__ != 0
    // All threads must finish reading/writing s[] before the offset moves.
    __syncthreads();
#endif
    if (tid == 0)
    {
        state->offset = (state->offset + nthreads) & MTGP32_STATE_MASK;
    }
#if __CUDA_ARCH__ != 0
    // Make the new offset visible to every thread before the next draw.
    __syncthreads();
#endif
    return out;
}
|
| 228 |
+
/**
 * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
 *
 * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
 * increment position of generator by \p n positions, which must be the total number of positions
 * updated in the state by the thread block, for this invocation.
 *
 * Note :
 * Thread indices must range from 0...\p n - 1.
 * The number of positions updated may not exceed 256.
 * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
 *
 * \param state - Pointer to state to update
 * \param index - Index (0..255) of the position within the state to draw from and update
 * \param n - The total number of positions in this state that are being updated by this invocation
 *
 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
 */
QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
{
    const int pos = state->k->pos_tbl[state->pIdx];
    // The caller supplies the position to update rather than deriving it
    // from the thread index.
    const unsigned int tid = index;

    unsigned int rec = para_rec(state->k,
                                state->s[(tid + state->offset) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + 1) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + pos) & MTGP32_STATE_MASK],
                                state->pIdx);

    state->s[(tid + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = rec;
    unsigned int out = temper(state->k, rec,
                              state->s[(tid + state->offset + pos - 1) & MTGP32_STATE_MASK],
                              state->pIdx);
#if __CUDA_ARCH__ != 0
    // All updates to s[] must complete before the offset advances.
    __syncthreads();
#endif
    if (index == 0)
    {
        state->offset = (state->offset + n) & MTGP32_STATE_MASK;
    }
#if __CUDA_ARCH__ != 0
    __syncthreads();
#endif
    return out;
}
|
| 275 |
+
/**
 * \brief Return a uniformly distributed float from a mtgp32 generator.
 *
 * Return a uniformly distributed float between \p 0.0f and \p 1.0f
 * from the mtgp32 generator in \p state, increment position of generator.
 * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
 * point outputs are never returned.
 *
 * Note: This alternate derivation of a uniform float is provided for completeness
 * with the original source
 *
 * \param state - Pointer to state to update
 *
 * \return uniformly distributed float between \p 0.0f and \p 1.0f
 */
QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state)
{
    const int pos = state->k->pos_tbl[state->pIdx];
    unsigned int bits;
    float out;

    // NOTE(review): unlike curand(), the index here is derived as
    // blockDim.z * blockDim.y + threadIdx.x, and the recursion result is
    // stored at s[tid] rather than s[(tid + offset + MTGPDC_N) & mask].
    // Kept byte-for-byte to match the original derivation — confirm against
    // upstream cuRAND sources before changing.
    unsigned int tid = blockDim.z * blockDim.y;
    const unsigned int nthreads = tid * blockDim.x;
    //assert( nthreads <= 256 );
    tid += threadIdx.x;

    unsigned int rec = para_rec(state->k,
                                state->s[(tid + state->offset) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + 1) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + pos) & MTGP32_STATE_MASK],
                                state->pIdx);

    state->s[tid] = rec;
    bits = temper_single(state->k, rec,
                         state->s[(tid + state->offset + pos - 1) & MTGP32_STATE_MASK],
                         state->pIdx);
#if __CUDA_ARCH__ != 0
    __syncthreads();
#endif
    if (threadIdx.x == 0)
    {
        state->offset = (state->offset + nthreads) & MTGP32_STATE_MASK;
    }
#if __CUDA_ARCH__ != 0
    __syncthreads();
#endif
    // Reinterpret the IEEE-754 bit pattern as a float without violating
    // strict aliasing rules.
    memcpy(&out, &bits, sizeof(bits));
    return out;
}
|
| 326 |
+
|
| 327 |
+
/**
 * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
 *
 * Return a uniformly distributed float between \p 0.0f and \p 1.0f
 * from position \p index of the mtgp32 generator in \p state, and
 * increment position of generator by \p n positions, which must be the total number of positions
 * updated in the state by the thread block, for this invocation.
 * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
 * point outputs are never returned.
 *
 * Note 1:
 * Thread indices must range from 0...\p n - 1.
 * The number of positions updated may not exceed 256.
 * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
 *
 * Note 2: This alternate derivation of a uniform float is provided for completeness
 * with the original source
 *
 * \param state - Pointer to state to update
 * \param index - Index (0..255) of the position within the state to draw from and update
 * \param n - The total number of positions in this state that are being updated by this invocation
 *
 * \return uniformly distributed float between \p 0.0f and \p 1.0f
 */
QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
{
    const int pos = state->k->pos_tbl[state->pIdx];
    const unsigned int tid = index;
    unsigned int bits;
    float out;

    unsigned int rec = para_rec(state->k,
                                state->s[(tid + state->offset) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + 1) & MTGP32_STATE_MASK],
                                state->s[(tid + state->offset + pos) & MTGP32_STATE_MASK],
                                state->pIdx);

    // NOTE(review): result is stored at s[tid] (cf. the "+ MTGPDC_N" form in
    // curand_mtgp32_specific); preserved as-is from the original source.
    state->s[tid] = rec;
    bits = temper_single(state->k, rec,
                         state->s[(tid + state->offset + pos - 1) & MTGP32_STATE_MASK],
                         state->pIdx);
#if __CUDA_ARCH__ != 0
    __syncthreads();
#endif
    if (threadIdx.x == 0)
    {
        state->offset = (state->offset + n) & MTGP32_STATE_MASK;
    }
#if __CUDA_ARCH__ != 0
    __syncthreads();
#endif
    // Bit-cast the IEEE-754 pattern to float via memcpy (no aliasing UB).
    memcpy(&out, &bits, sizeof(bits));
    return out;
}
|
| 382 |
+
|
| 383 |
+
/** @} */
|
| 384 |
+
|
| 385 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h
ADDED
|
@@ -0,0 +1,837 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
#if !defined(CURAND_NORMAL_H_)
|
| 52 |
+
#define CURAND_NORMAL_H_
|
| 53 |
+
|
| 54 |
+
/**
|
| 55 |
+
* \defgroup DEVICE Device API
|
| 56 |
+
*
|
| 57 |
+
* @{
|
| 58 |
+
*/
|
| 59 |
+
|
| 60 |
+
#ifndef __CUDACC_RTC__
|
| 61 |
+
#include <math.h>
|
| 62 |
+
#endif // __CUDACC_RTC__
|
| 63 |
+
|
| 64 |
+
#include "curand_mrg32k3a.h"
|
| 65 |
+
#include "curand_mtgp32_kernel.h"
|
| 66 |
+
#include "curand_philox4x32_x.h"
|
| 67 |
+
#include "curand_normal_static.h"
|
| 68 |
+
|
| 69 |
+
/*
 * Box-Muller transform on two raw 32-bit draws: produces a pair of
 * independent standard-normal floats.
 */
QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y)
{
    // Map the draws to a radius term u in (0,1] and an angle v in (0,2pi];
    // the half-interval offsets keep u strictly positive so logf(u) is finite.
    float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV / 2);
    float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI / 2);
    float radius = sqrtf(-2.0f * logf(u));
    float2 result;
#if __CUDA_ARCH__ > 0
    // Fast fused sine/cosine on device.
    __sincosf(v, &result.x, &result.y);
#else
    result.x = sinf(v);
    result.y = cosf(v);
#endif
    result.x *= radius;
    result.y *= radius;
    return result;
}
|
| 86 |
+
|
| 87 |
+
/*
 * Box-Muller transform fed directly from an MRG32k3a generator: returns a
 * pair of independent standard-normal floats, advancing the generator twice.
 */
QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state)
{
    // First draw supplies the radius term, second draw the angle in (0,2pi].
    float u = curand_uniform(state);
    float angle = curand_uniform(state) * CURAND_2PI;
    float radius = sqrtf(-2.0f * logf(u));
    float2 result;
#if __CUDA_ARCH__ > 0
    __sincosf(angle, &result.x, &result.y);
#else
    result.x = sinf(angle);
    result.y = cosf(angle);
#endif
    result.x *= radius;
    result.y *= radius;
    return result;
}
|
| 105 |
+
|
| 106 |
+
/*
 * Double-precision Box-Muller transform on four raw 32-bit draws: produces
 * a pair of independent standard-normal doubles.
 */
QUALIFIERS double2
_curand_box_muller_double(unsigned int x0, unsigned int x1,
                          unsigned int y0, unsigned int y1)
{
    // Combine each pair of 32-bit draws into a 53-bit value (the second word
    // is shifted into the high bits), then map to u in (0,1] and v in (0,2].
    unsigned long long zx = (unsigned long long)x0 ^
                            ((unsigned long long)x1 << (53 - 32));
    unsigned long long zy = (unsigned long long)y0 ^
                            ((unsigned long long)y1 << (53 - 32));
    double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE / 2.0);
    double v = zy * (CURAND_2POW53_INV_DOUBLE * 2.0) + CURAND_2POW53_INV_DOUBLE;
    double radius = sqrt(-2.0 * log(u));
    double2 result;
#if __CUDA_ARCH__ > 0
    // sincospi takes the angle in units of pi, matching v's (0,2] range.
    sincospi(v, &result.x, &result.y);
#else
    result.x = sin(v * CURAND_PI_DOUBLE);
    result.y = cos(v * CURAND_PI_DOUBLE);
#endif
    result.x *= radius;
    result.y *= radius;

    return result;
}
|
| 130 |
+
|
| 131 |
+
/*
 * Double-precision Box-Muller transform fed from an MRG32k3a generator:
 * returns a pair of independent standard-normal doubles.
 */
QUALIFIERS double2
curand_box_muller_mrg_double(curandStateMRG32k3a_t * state)
{
    // First draw is the radius term; the second, scaled to (0,2], is the
    // angle expressed in units of pi.
    double u = curand_uniform_double(state);
    double v = curand_uniform_double(state) * 2.0;
    double radius = sqrt(-2.0 * log(u));
    double2 result;
#if __CUDA_ARCH__ > 0
    sincospi(v, &result.x, &result.y);
#else
    result.x = sin(v * CURAND_PI_DOUBLE);
    result.y = cos(v * CURAND_PI_DOUBLE);
#endif
    result.x *= radius;
    result.y *= radius;
    return result;
}
|
| 150 |
+
|
| 151 |
+
/*
 * Generic Box-Muller pair from any 32-bit generator: two sequential draws
 * feed one transform. The draws are taken in named locals to guarantee
 * their order.
 */
template <typename R>
QUALIFIERS float2 curand_box_muller(R *state)
{
    unsigned int first = curand(state);
    unsigned int second = curand(state);
    return _curand_box_muller(first, second);
}
|
| 160 |
+
|
| 161 |
+
/*
 * Four standard-normal floats from a single four-word draw: each half of
 * the uint4 feeds one Box-Muller transform.
 */
template <typename R>
QUALIFIERS float4 curand_box_muller4(R *state)
{
    uint4 draws = curand4(state);
    float2 pair_a = _curand_box_muller(draws.x, draws.y);
    float2 pair_b = _curand_box_muller(draws.z, draws.w);
    float4 result;
    result.x = pair_a.x;
    result.y = pair_a.y;
    result.z = pair_b.x;
    result.w = pair_b.y;
    return result;
}
|
| 176 |
+
|
| 177 |
+
/*
 * Double-precision Box-Muller pair from any 32-bit generator: four
 * sequential draws supply the two 53-bit inputs of the transform.
 */
template <typename R>
QUALIFIERS double2 curand_box_muller_double(R *state)
{
    unsigned int x0 = curand(state);
    unsigned int x1 = curand(state);
    unsigned int y0 = curand(state);
    unsigned int y1 = curand(state);
    return _curand_box_muller_double(x0, x1, y0, y1);
}
|
| 188 |
+
|
| 189 |
+
/*
 * Double-precision Box-Muller pair from one four-word draw.
 */
template <typename R>
QUALIFIERS double2 curand_box_muller2_double(R *state)
{
    uint4 draws = curand4(state);
    return _curand_box_muller_double(draws.x, draws.y, draws.z, draws.w);
}
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
/*
 * Four standard-normal doubles: two sequential four-word draws each feed
 * one double-precision Box-Muller transform.
 */
template <typename R>
QUALIFIERS double4 curand_box_muller4_double(R *state)
{
    uint4 draws_a = curand4(state);
    uint4 draws_b = curand4(state);
    double2 pair_a = _curand_box_muller_double(draws_a.x, draws_a.y, draws_a.z, draws_a.w);
    double2 pair_b = _curand_box_muller_double(draws_b.x, draws_b.y, draws_b.z, draws_b.w);
    double4 result;
    result.x = pair_a.x;
    result.y = pair_a.y;
    result.z = pair_b.x;
    result.w = pair_b.y;
    return result;
}
|
| 218 |
+
|
| 219 |
+
//QUALIFIERS float _curand_normal_icdf(unsigned int x)
|
| 220 |
+
//{
|
| 221 |
+
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
|
| 222 |
+
// float s = CURAND_SQRT2;
|
| 223 |
+
// // Mirror to avoid loss of precision
|
| 224 |
+
// if(x > 0x80000000UL) {
|
| 225 |
+
// x = 0xffffffffUL - x;
|
| 226 |
+
// s = -s;
|
| 227 |
+
// }
|
| 228 |
+
// float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
|
| 229 |
+
// // p is in (0, 0.5], 2p is in (0, 1]
|
| 230 |
+
// return s * erfcinvf(2.0f * p);
|
| 231 |
+
//#else
|
| 232 |
+
// x++; //suppress warnings
|
| 233 |
+
// return 0.0f;
|
| 234 |
+
//#endif
|
| 235 |
+
//}
|
| 236 |
+
//
|
| 237 |
+
//QUALIFIERS float _curand_normal_icdf(unsigned long long x)
|
| 238 |
+
//{
|
| 239 |
+
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
|
| 240 |
+
// unsigned int t = (unsigned int)(x >> 32);
|
| 241 |
+
// float s = CURAND_SQRT2;
|
| 242 |
+
// // Mirror to avoid loss of precision
|
| 243 |
+
// if(t > 0x80000000UL) {
|
| 244 |
+
// t = 0xffffffffUL - t;
|
| 245 |
+
// s = -s;
|
| 246 |
+
// }
|
| 247 |
+
// float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
|
| 248 |
+
// // p is in (0, 0.5], 2p is in (0, 1]
|
| 249 |
+
// return s * erfcinvf(2.0f * p);
|
| 250 |
+
//#else
|
| 251 |
+
// x++;
|
| 252 |
+
// return 0.0f;
|
| 253 |
+
//#endif
|
| 254 |
+
//}
|
| 255 |
+
//
|
| 256 |
+
//QUALIFIERS double _curand_normal_icdf_double(unsigned int x)
|
| 257 |
+
//{
|
| 258 |
+
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
|
| 259 |
+
// double s = CURAND_SQRT2_DOUBLE;
|
| 260 |
+
// // Mirror to avoid loss of precision
|
| 261 |
+
// if(x > 0x80000000UL) {
|
| 262 |
+
// x = 0xffffffffUL - x;
|
| 263 |
+
// s = -s;
|
| 264 |
+
// }
|
| 265 |
+
// double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
|
| 266 |
+
// // p is in (0, 0.5], 2p is in (0, 1]
|
| 267 |
+
// return s * erfcinv(2.0 * p);
|
| 268 |
+
//#else
|
| 269 |
+
// x++;
|
| 270 |
+
// return 0.0;
|
| 271 |
+
//#endif
|
| 272 |
+
//}
|
| 273 |
+
//
|
| 274 |
+
//QUALIFIERS double _curand_normal_icdf_double(unsigned long long x)
|
| 275 |
+
//{
|
| 276 |
+
//#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
|
| 277 |
+
// double s = CURAND_SQRT2_DOUBLE;
|
| 278 |
+
// x >>= 11;
|
| 279 |
+
// // Mirror to avoid loss of precision
|
| 280 |
+
// if(x > 0x10000000000000UL) {
|
| 281 |
+
// x = 0x1fffffffffffffUL - x;
|
| 282 |
+
// s = -s;
|
| 283 |
+
// }
|
| 284 |
+
// double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
|
| 285 |
+
// // p is in (0, 0.5], 2p is in (0, 1]
|
| 286 |
+
// return s * erfcinv(2.0 * p);
|
| 287 |
+
//#else
|
| 288 |
+
// x++;
|
| 289 |
+
// return 0.0;
|
| 290 |
+
//#endif
|
| 291 |
+
//}
|
| 292 |
+
//
|
| 293 |
+
|
| 294 |
+
/**
 * \brief Return a normally distributed float from an XORWOW generator.
 *
 * Return a single normally distributed float with mean \p 0.0f and
 * standard deviation \p 1.0f from the XORWOW generator in \p state,
 * increment position of generator by one.
 *
 * The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then returns them one at a time.
 * See ::curand_normal2() for a more efficient version that returns
 * both results at once.
 *
 * \param state - Pointer to state to update
 *
 * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
 */
QUALIFIERS float curand_normal(curandStateXORWOW_t *state)
{
    // If the previous call cached a second Box-Muller result, hand it out.
    if (state->boxmuller_flag == EXTRA_FLAG_NORMAL) {
        state->boxmuller_flag = 0;
        return state->boxmuller_extra;
    }
    // Otherwise generate a fresh pair; return one and cache the other.
    unsigned int first = curand(state);
    unsigned int second = curand(state);
    float2 pair = _curand_box_muller(first, second);
    state->boxmuller_extra = pair.y;
    state->boxmuller_flag = EXTRA_FLAG_NORMAL;
    return pair.x;
}
|
| 324 |
+
|
| 325 |
+
/**
 * \brief Return a normally distributed float from a Philox4_32_10 generator.
 *
 * Return a single normally distributed float with mean \p 0.0f and
 * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
 * increment position of generator by one.
 *
 * The implementation uses a Box-Muller transform to generate two
 * normally distributed results, then returns them one at a time.
 * See ::curand_normal2() for a more efficient version that returns
 * both results at once.
 *
 * \param state - Pointer to state to update
 *
 * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
 */

QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state)
{
    // A cached second Box-Muller result is consumed before generating more.
    if (state->boxmuller_flag == EXTRA_FLAG_NORMAL) {
        state->boxmuller_flag = 0;
        return state->boxmuller_extra;
    }
    unsigned int first = curand(state);
    unsigned int second = curand(state);
    float2 pair = _curand_box_muller(first, second);
    state->boxmuller_extra = pair.y;
    state->boxmuller_flag = EXTRA_FLAG_NORMAL;
    return pair.x;
}
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
/**
|
| 360 |
+
* \brief Return a normally distributed float from an MRG32k3a generator.
|
| 361 |
+
*
|
| 362 |
+
* Return a single normally distributed float with mean \p 0.0f and
|
| 363 |
+
* standard deviation \p 1.0f from the MRG32k3a generator in \p state,
|
| 364 |
+
* increment position of generator by one.
|
| 365 |
+
*
|
| 366 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 367 |
+
* normally distributed results, then returns them one at a time.
|
| 368 |
+
* See ::curand_normal2() for a more efficient version that returns
|
| 369 |
+
* both results at once.
|
| 370 |
+
*
|
| 371 |
+
* \param state - Pointer to state to update
|
| 372 |
+
*
|
| 373 |
+
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
|
| 374 |
+
*/
|
| 375 |
+
QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state)
|
| 376 |
+
{
|
| 377 |
+
if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
|
| 378 |
+
float2 v = curand_box_muller_mrg(state);
|
| 379 |
+
state->boxmuller_extra = v.y;
|
| 380 |
+
state->boxmuller_flag = EXTRA_FLAG_NORMAL;
|
| 381 |
+
return v.x;
|
| 382 |
+
}
|
| 383 |
+
state->boxmuller_flag = 0;
|
| 384 |
+
return state->boxmuller_extra;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
/**
|
| 388 |
+
* \brief Return two normally distributed floats from an XORWOW generator.
|
| 389 |
+
*
|
| 390 |
+
* Return two normally distributed floats with mean \p 0.0f and
|
| 391 |
+
* standard deviation \p 1.0f from the XORWOW generator in \p state,
|
| 392 |
+
* increment position of generator by two.
|
| 393 |
+
*
|
| 394 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 395 |
+
* normally distributed results.
|
| 396 |
+
*
|
| 397 |
+
* \param state - Pointer to state to update
|
| 398 |
+
*
|
| 399 |
+
* \return Normally distributed float2 where each element is from a
|
| 400 |
+
* distribution with mean \p 0.0f and standard deviation \p 1.0f
|
| 401 |
+
*/
|
| 402 |
+
QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state)
|
| 403 |
+
{
|
| 404 |
+
return curand_box_muller(state);
|
| 405 |
+
}
|
| 406 |
+
/**
|
| 407 |
+
* \brief Return two normally distributed floats from an Philox4_32_10 generator.
|
| 408 |
+
*
|
| 409 |
+
* Return two normally distributed floats with mean \p 0.0f and
|
| 410 |
+
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
|
| 411 |
+
* increment position of generator by two.
|
| 412 |
+
*
|
| 413 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 414 |
+
* normally distributed results.
|
| 415 |
+
*
|
| 416 |
+
* \param state - Pointer to state to update
|
| 417 |
+
*
|
| 418 |
+
* \return Normally distributed float2 where each element is from a
|
| 419 |
+
* distribution with mean \p 0.0f and standard deviation \p 1.0f
|
| 420 |
+
*/
|
| 421 |
+
QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state)
|
| 422 |
+
{
|
| 423 |
+
return curand_box_muller(state);
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
/**
|
| 427 |
+
* \brief Return four normally distributed floats from an Philox4_32_10 generator.
|
| 428 |
+
*
|
| 429 |
+
* Return four normally distributed floats with mean \p 0.0f and
|
| 430 |
+
* standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
|
| 431 |
+
* increment position of generator by four.
|
| 432 |
+
*
|
| 433 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 434 |
+
* normally distributed results.
|
| 435 |
+
*
|
| 436 |
+
* \param state - Pointer to state to update
|
| 437 |
+
*
|
| 438 |
+
* \return Normally distributed float2 where each element is from a
|
| 439 |
+
* distribution with mean \p 0.0f and standard deviation \p 1.0f
|
| 440 |
+
*/
|
| 441 |
+
QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state)
|
| 442 |
+
{
|
| 443 |
+
return curand_box_muller4(state);
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
/**
|
| 449 |
+
* \brief Return two normally distributed floats from an MRG32k3a generator.
|
| 450 |
+
*
|
| 451 |
+
* Return two normally distributed floats with mean \p 0.0f and
|
| 452 |
+
* standard deviation \p 1.0f from the MRG32k3a generator in \p state,
|
| 453 |
+
* increment position of generator by two.
|
| 454 |
+
*
|
| 455 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 456 |
+
* normally distributed results.
|
| 457 |
+
*
|
| 458 |
+
* \param state - Pointer to state to update
|
| 459 |
+
*
|
| 460 |
+
* \return Normally distributed float2 where each element is from a
|
| 461 |
+
* distribution with mean \p 0.0f and standard deviation \p 1.0f
|
| 462 |
+
*/
|
| 463 |
+
QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state)
|
| 464 |
+
{
|
| 465 |
+
return curand_box_muller_mrg(state);
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
/**
|
| 469 |
+
* \brief Return a normally distributed float from a MTGP32 generator.
|
| 470 |
+
*
|
| 471 |
+
* Return a single normally distributed float with mean \p 0.0f and
|
| 472 |
+
* standard deviation \p 1.0f from the MTGP32 generator in \p state,
|
| 473 |
+
* increment position of generator.
|
| 474 |
+
*
|
| 475 |
+
* The implementation uses the inverse cumulative distribution function
|
| 476 |
+
* to generate normally distributed results.
|
| 477 |
+
*
|
| 478 |
+
* \param state - Pointer to state to update
|
| 479 |
+
*
|
| 480 |
+
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
|
| 481 |
+
*/
|
| 482 |
+
QUALIFIERS float curand_normal(curandStateMtgp32_t *state)
|
| 483 |
+
{
|
| 484 |
+
return _curand_normal_icdf(curand(state));
|
| 485 |
+
}
|
| 486 |
+
/**
|
| 487 |
+
* \brief Return a normally distributed float from a Sobol32 generator.
|
| 488 |
+
*
|
| 489 |
+
* Return a single normally distributed float with mean \p 0.0f and
|
| 490 |
+
* standard deviation \p 1.0f from the Sobol32 generator in \p state,
|
| 491 |
+
* increment position of generator by one.
|
| 492 |
+
*
|
| 493 |
+
* The implementation uses the inverse cumulative distribution function
|
| 494 |
+
* to generate normally distributed results.
|
| 495 |
+
*
|
| 496 |
+
* \param state - Pointer to state to update
|
| 497 |
+
*
|
| 498 |
+
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
|
| 499 |
+
*/
|
| 500 |
+
QUALIFIERS float curand_normal(curandStateSobol32_t *state)
|
| 501 |
+
{
|
| 502 |
+
return _curand_normal_icdf(curand(state));
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
/**
|
| 506 |
+
* \brief Return a normally distributed float from a scrambled Sobol32 generator.
|
| 507 |
+
*
|
| 508 |
+
* Return a single normally distributed float with mean \p 0.0f and
|
| 509 |
+
* standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state,
|
| 510 |
+
* increment position of generator by one.
|
| 511 |
+
*
|
| 512 |
+
* The implementation uses the inverse cumulative distribution function
|
| 513 |
+
* to generate normally distributed results.
|
| 514 |
+
*
|
| 515 |
+
* \param state - Pointer to state to update
|
| 516 |
+
*
|
| 517 |
+
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
|
| 518 |
+
*/
|
| 519 |
+
QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state)
|
| 520 |
+
{
|
| 521 |
+
return _curand_normal_icdf(curand(state));
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
/**
|
| 525 |
+
* \brief Return a normally distributed float from a Sobol64 generator.
|
| 526 |
+
*
|
| 527 |
+
* Return a single normally distributed float with mean \p 0.0f and
|
| 528 |
+
* standard deviation \p 1.0f from the Sobol64 generator in \p state,
|
| 529 |
+
* increment position of generator by one.
|
| 530 |
+
*
|
| 531 |
+
* The implementation uses the inverse cumulative distribution function
|
| 532 |
+
* to generate normally distributed results.
|
| 533 |
+
*
|
| 534 |
+
* \param state - Pointer to state to update
|
| 535 |
+
*
|
| 536 |
+
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
|
| 537 |
+
*/
|
| 538 |
+
QUALIFIERS float curand_normal(curandStateSobol64_t *state)
|
| 539 |
+
{
|
| 540 |
+
return _curand_normal_icdf(curand(state));
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
/**
|
| 544 |
+
* \brief Return a normally distributed float from a scrambled Sobol64 generator.
|
| 545 |
+
*
|
| 546 |
+
* Return a single normally distributed float with mean \p 0.0f and
|
| 547 |
+
* standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state,
|
| 548 |
+
* increment position of generator by one.
|
| 549 |
+
*
|
| 550 |
+
* The implementation uses the inverse cumulative distribution function
|
| 551 |
+
* to generate normally distributed results.
|
| 552 |
+
*
|
| 553 |
+
* \param state - Pointer to state to update
|
| 554 |
+
*
|
| 555 |
+
* \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
|
| 556 |
+
*/
|
| 557 |
+
QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state)
|
| 558 |
+
{
|
| 559 |
+
return _curand_normal_icdf(curand(state));
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
+
/**
|
| 563 |
+
* \brief Return a normally distributed double from an XORWOW generator.
|
| 564 |
+
*
|
| 565 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 566 |
+
* standard deviation \p 1.0 from the XORWOW generator in \p state,
|
| 567 |
+
* increment position of generator.
|
| 568 |
+
*
|
| 569 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 570 |
+
* normally distributed results, then returns them one at a time.
|
| 571 |
+
* See ::curand_normal2_double() for a more efficient version that returns
|
| 572 |
+
* both results at once.
|
| 573 |
+
*
|
| 574 |
+
* \param state - Pointer to state to update
|
| 575 |
+
*
|
| 576 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 577 |
+
*/
|
| 578 |
+
QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state)
|
| 579 |
+
{
|
| 580 |
+
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
|
| 581 |
+
unsigned int x0, x1, y0, y1;
|
| 582 |
+
x0 = curand(state);
|
| 583 |
+
x1 = curand(state);
|
| 584 |
+
y0 = curand(state);
|
| 585 |
+
y1 = curand(state);
|
| 586 |
+
double2 v = _curand_box_muller_double(x0, x1, y0, y1);
|
| 587 |
+
state->boxmuller_extra_double = v.y;
|
| 588 |
+
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
|
| 589 |
+
return v.x;
|
| 590 |
+
}
|
| 591 |
+
state->boxmuller_flag_double = 0;
|
| 592 |
+
return state->boxmuller_extra_double;
|
| 593 |
+
}
|
| 594 |
+
|
| 595 |
+
/**
|
| 596 |
+
* \brief Return a normally distributed double from an Philox4_32_10 generator.
|
| 597 |
+
*
|
| 598 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 599 |
+
* standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
|
| 600 |
+
* increment position of generator.
|
| 601 |
+
*
|
| 602 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 603 |
+
* normally distributed results, then returns them one at a time.
|
| 604 |
+
* See ::curand_normal2_double() for a more efficient version that returns
|
| 605 |
+
* both results at once.
|
| 606 |
+
*
|
| 607 |
+
* \param state - Pointer to state to update
|
| 608 |
+
*
|
| 609 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 610 |
+
*/
|
| 611 |
+
|
| 612 |
+
QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state)
|
| 613 |
+
{
|
| 614 |
+
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
|
| 615 |
+
uint4 _x;
|
| 616 |
+
_x = curand4(state);
|
| 617 |
+
double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
|
| 618 |
+
state->boxmuller_extra_double = v.y;
|
| 619 |
+
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
|
| 620 |
+
return v.x;
|
| 621 |
+
}
|
| 622 |
+
state->boxmuller_flag_double = 0;
|
| 623 |
+
return state->boxmuller_extra_double;
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
/**
|
| 628 |
+
* \brief Return a normally distributed double from an MRG32k3a generator.
|
| 629 |
+
*
|
| 630 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 631 |
+
* standard deviation \p 1.0 from the XORWOW generator in \p state,
|
| 632 |
+
* increment position of generator.
|
| 633 |
+
*
|
| 634 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 635 |
+
* normally distributed results, then returns them one at a time.
|
| 636 |
+
* See ::curand_normal2_double() for a more efficient version that returns
|
| 637 |
+
* both results at once.
|
| 638 |
+
*
|
| 639 |
+
* \param state - Pointer to state to update
|
| 640 |
+
*
|
| 641 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 642 |
+
*/
|
| 643 |
+
QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state)
|
| 644 |
+
{
|
| 645 |
+
if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
|
| 646 |
+
double2 v = curand_box_muller_mrg_double(state);
|
| 647 |
+
state->boxmuller_extra_double = v.y;
|
| 648 |
+
state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
|
| 649 |
+
return v.x;
|
| 650 |
+
}
|
| 651 |
+
state->boxmuller_flag_double = 0;
|
| 652 |
+
return state->boxmuller_extra_double;
|
| 653 |
+
}
|
| 654 |
+
|
| 655 |
+
/**
|
| 656 |
+
* \brief Return two normally distributed doubles from an XORWOW generator.
|
| 657 |
+
*
|
| 658 |
+
* Return two normally distributed doubles with mean \p 0.0 and
|
| 659 |
+
* standard deviation \p 1.0 from the XORWOW generator in \p state,
|
| 660 |
+
* increment position of generator by 2.
|
| 661 |
+
*
|
| 662 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 663 |
+
* normally distributed results.
|
| 664 |
+
*
|
| 665 |
+
* \param state - Pointer to state to update
|
| 666 |
+
*
|
| 667 |
+
* \return Normally distributed double2 where each element is from a
|
| 668 |
+
* distribution with mean \p 0.0 and standard deviation \p 1.0
|
| 669 |
+
*/
|
| 670 |
+
QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state)
|
| 671 |
+
{
|
| 672 |
+
return curand_box_muller_double(state);
|
| 673 |
+
}
|
| 674 |
+
|
| 675 |
+
/**
|
| 676 |
+
* \brief Return two normally distributed doubles from an Philox4_32_10 generator.
|
| 677 |
+
*
|
| 678 |
+
* Return two normally distributed doubles with mean \p 0.0 and
|
| 679 |
+
* standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
|
| 680 |
+
* increment position of generator by 2.
|
| 681 |
+
*
|
| 682 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 683 |
+
* normally distributed results.
|
| 684 |
+
*
|
| 685 |
+
* \param state - Pointer to state to update
|
| 686 |
+
*
|
| 687 |
+
* \return Normally distributed double2 where each element is from a
|
| 688 |
+
* distribution with mean \p 0.0 and standard deviation \p 1.0
|
| 689 |
+
*/
|
| 690 |
+
QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state)
|
| 691 |
+
{
|
| 692 |
+
uint4 _x;
|
| 693 |
+
double2 result;
|
| 694 |
+
|
| 695 |
+
_x = curand4(state);
|
| 696 |
+
double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
|
| 697 |
+
result.x = v1.x;
|
| 698 |
+
result.y = v1.y;
|
| 699 |
+
|
| 700 |
+
return result;
|
| 701 |
+
}
|
| 702 |
+
|
| 703 |
+
// not a part of API
|
| 704 |
+
QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state)
|
| 705 |
+
{
|
| 706 |
+
uint4 _x;
|
| 707 |
+
uint4 _y;
|
| 708 |
+
double4 result;
|
| 709 |
+
|
| 710 |
+
_x = curand4(state);
|
| 711 |
+
_y = curand4(state);
|
| 712 |
+
double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
|
| 713 |
+
double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
|
| 714 |
+
result.x = v1.x;
|
| 715 |
+
result.y = v1.y;
|
| 716 |
+
result.z = v2.x;
|
| 717 |
+
result.w = v2.y;
|
| 718 |
+
|
| 719 |
+
return result;
|
| 720 |
+
}
|
| 721 |
+
|
| 722 |
+
|
| 723 |
+
/**
|
| 724 |
+
* \brief Return two normally distributed doubles from an MRG32k3a generator.
|
| 725 |
+
*
|
| 726 |
+
* Return two normally distributed doubles with mean \p 0.0 and
|
| 727 |
+
* standard deviation \p 1.0 from the MRG32k3a generator in \p state,
|
| 728 |
+
* increment position of generator.
|
| 729 |
+
*
|
| 730 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 731 |
+
* normally distributed results.
|
| 732 |
+
*
|
| 733 |
+
* \param state - Pointer to state to update
|
| 734 |
+
*
|
| 735 |
+
* \return Normally distributed double2 where each element is from a
|
| 736 |
+
* distribution with mean \p 0.0 and standard deviation \p 1.0
|
| 737 |
+
*/
|
| 738 |
+
QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state)
|
| 739 |
+
{
|
| 740 |
+
return curand_box_muller_mrg_double(state);
|
| 741 |
+
}
|
| 742 |
+
|
| 743 |
+
/**
|
| 744 |
+
* \brief Return a normally distributed double from an MTGP32 generator.
|
| 745 |
+
*
|
| 746 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 747 |
+
* standard deviation \p 1.0 from the MTGP32 generator in \p state,
|
| 748 |
+
* increment position of generator.
|
| 749 |
+
*
|
| 750 |
+
* The implementation uses the inverse cumulative distribution function
|
| 751 |
+
* to generate normally distributed results.
|
| 752 |
+
*
|
| 753 |
+
* \param state - Pointer to state to update
|
| 754 |
+
*
|
| 755 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 756 |
+
*/
|
| 757 |
+
QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state)
|
| 758 |
+
{
|
| 759 |
+
return _curand_normal_icdf_double(curand(state));
|
| 760 |
+
}
|
| 761 |
+
|
| 762 |
+
/**
|
| 763 |
+
* \brief Return a normally distributed double from an Sobol32 generator.
|
| 764 |
+
*
|
| 765 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 766 |
+
* standard deviation \p 1.0 from the Sobol32 generator in \p state,
|
| 767 |
+
* increment position of generator by one.
|
| 768 |
+
*
|
| 769 |
+
* The implementation uses the inverse cumulative distribution function
|
| 770 |
+
* to generate normally distributed results.
|
| 771 |
+
*
|
| 772 |
+
* \param state - Pointer to state to update
|
| 773 |
+
*
|
| 774 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 775 |
+
*/
|
| 776 |
+
QUALIFIERS double curand_normal_double(curandStateSobol32_t *state)
|
| 777 |
+
{
|
| 778 |
+
return _curand_normal_icdf_double(curand(state));
|
| 779 |
+
}
|
| 780 |
+
|
| 781 |
+
/**
|
| 782 |
+
* \brief Return a normally distributed double from a scrambled Sobol32 generator.
|
| 783 |
+
*
|
| 784 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 785 |
+
* standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state,
|
| 786 |
+
* increment position of generator by one.
|
| 787 |
+
*
|
| 788 |
+
* The implementation uses the inverse cumulative distribution function
|
| 789 |
+
* to generate normally distributed results.
|
| 790 |
+
*
|
| 791 |
+
* \param state - Pointer to state to update
|
| 792 |
+
*
|
| 793 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 794 |
+
*/
|
| 795 |
+
QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state)
|
| 796 |
+
{
|
| 797 |
+
return _curand_normal_icdf_double(curand(state));
|
| 798 |
+
}
|
| 799 |
+
|
| 800 |
+
/**
|
| 801 |
+
* \brief Return a normally distributed double from a Sobol64 generator.
|
| 802 |
+
*
|
| 803 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 804 |
+
* standard deviation \p 1.0 from the Sobol64 generator in \p state,
|
| 805 |
+
* increment position of generator by one.
|
| 806 |
+
*
|
| 807 |
+
* The implementation uses the inverse cumulative distribution function
|
| 808 |
+
* to generate normally distributed results.
|
| 809 |
+
*
|
| 810 |
+
* \param state - Pointer to state to update
|
| 811 |
+
*
|
| 812 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 813 |
+
*/
|
| 814 |
+
QUALIFIERS double curand_normal_double(curandStateSobol64_t *state)
|
| 815 |
+
{
|
| 816 |
+
return _curand_normal_icdf_double(curand(state));
|
| 817 |
+
}
|
| 818 |
+
|
| 819 |
+
/**
|
| 820 |
+
* \brief Return a normally distributed double from a scrambled Sobol64 generator.
|
| 821 |
+
*
|
| 822 |
+
* Return a single normally distributed double with mean \p 0.0 and
|
| 823 |
+
* standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state,
|
| 824 |
+
* increment position of generator by one.
|
| 825 |
+
*
|
| 826 |
+
* The implementation uses the inverse cumulative distribution function
|
| 827 |
+
* to generate normally distributed results.
|
| 828 |
+
*
|
| 829 |
+
* \param state - Pointer to state to update
|
| 830 |
+
*
|
| 831 |
+
* \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
|
| 832 |
+
*/
|
| 833 |
+
QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state)
|
| 834 |
+
{
|
| 835 |
+
return _curand_normal_icdf_double(curand(state));
|
| 836 |
+
}
|
| 837 |
+
#endif // !defined(CURAND_NORMAL_H_)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (218 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (212 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* This file was procedurally generated! Do not modify this file by hand. */
|
| 2 |
+
|
| 3 |
+
/*
|
| 4 |
+
* Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
|
| 5 |
+
*
|
| 6 |
+
* NOTICE TO USER:
|
| 7 |
+
*
|
| 8 |
+
* This source code is subject to NVIDIA ownership rights under U.S. and
|
| 9 |
+
* international Copyright laws.
|
| 10 |
+
*
|
| 11 |
+
* This software and the information contained herein is PROPRIETARY and
|
| 12 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
|
| 13 |
+
* of a form of NVIDIA software license agreement.
|
| 14 |
+
*
|
| 15 |
+
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
| 16 |
+
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
| 17 |
+
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
| 18 |
+
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
| 19 |
+
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 20 |
+
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
| 21 |
+
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
| 22 |
+
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
| 23 |
+
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
| 24 |
+
* OR PERFORMANCE OF THIS SOURCE CODE.
|
| 25 |
+
*
|
| 26 |
+
* U.S. Government End Users. This source code is a "commercial item" as
|
| 27 |
+
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
| 28 |
+
* "commercial computer software" and "commercial computer software
|
| 29 |
+
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
| 30 |
+
* and is provided to the U.S. Government only as a commercial end item.
|
| 31 |
+
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
| 32 |
+
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
| 33 |
+
* source code with only those rights set forth herein.
|
| 34 |
+
*
|
| 35 |
+
* Any use of this source code in individual and commercial software must
|
| 36 |
+
* include, in the user documentation and internal comments to the code,
|
| 37 |
+
* the above Disclaimer and U.S. Government End Users Notice.
|
| 38 |
+
*/
|
| 39 |
+
|
| 40 |
+
#ifndef NVTX_IMPL_GUARD
|
| 41 |
+
#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
|
| 42 |
+
#endif
|
| 43 |
+
|
| 44 |
+
/* ---- Include required platform headers ---- */
|
| 45 |
+
|
| 46 |
+
#if defined(_WIN32)
|
| 47 |
+
|
| 48 |
+
#include <Windows.h>
|
| 49 |
+
|
| 50 |
+
#else
|
| 51 |
+
#include <unistd.h>
|
| 52 |
+
|
| 53 |
+
#if defined(__ANDROID__)
|
| 54 |
+
#include <android/api-level.h>
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
#if defined(__linux__) || defined(__CYGWIN__)
|
| 58 |
+
#include <sched.h>
|
| 59 |
+
#endif
|
| 60 |
+
|
| 61 |
+
#include <limits.h>
|
| 62 |
+
#include <dlfcn.h>
|
| 63 |
+
#include <fcntl.h>
|
| 64 |
+
#include <stdlib.h>
|
| 65 |
+
#include <stdio.h>
|
| 66 |
+
#include <sys/types.h>
|
| 67 |
+
#include <unistd.h>
|
| 68 |
+
#include <errno.h>
|
| 69 |
+
|
| 70 |
+
#include <string.h>
|
| 71 |
+
#include <sys/types.h>
|
| 72 |
+
#include <pthread.h>
|
| 73 |
+
#include <stdlib.h>
|
| 74 |
+
#include <wchar.h>
|
| 75 |
+
|
| 76 |
+
#endif
|
| 77 |
+
|
| 78 |
+
/* ---- Define macros used in this file ---- */
|
| 79 |
+
|
| 80 |
+
#define NVTX_INIT_STATE_FRESH 0
|
| 81 |
+
#define NVTX_INIT_STATE_STARTED 1
|
| 82 |
+
#define NVTX_INIT_STATE_COMPLETE 2
|
| 83 |
+
|
| 84 |
+
#ifdef NVTX_DEBUG_PRINT
|
| 85 |
+
#ifdef __ANDROID__
|
| 86 |
+
#include <android/log.h>
|
| 87 |
+
#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
|
| 88 |
+
#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
|
| 89 |
+
#else
|
| 90 |
+
#include <stdio.h>
|
| 91 |
+
#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
|
| 92 |
+
#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
|
| 93 |
+
#endif
|
| 94 |
+
#else /* !defined(NVTX_DEBUG_PRINT) */
|
| 95 |
+
#define NVTX_ERR(...)
|
| 96 |
+
#define NVTX_INFO(...)
|
| 97 |
+
#endif
|
| 98 |
+
|
| 99 |
+
#ifdef __cplusplus
|
| 100 |
+
extern "C" {
|
| 101 |
+
#endif /* __cplusplus */
|
| 102 |
+
|
| 103 |
+
#ifdef __GNUC__
|
| 104 |
+
#pragma GCC visibility push(hidden)
|
| 105 |
+
#endif
|
| 106 |
+
|
| 107 |
+
/* ---- Forward declare all functions referenced in globals ---- */
|
| 108 |
+
|
| 109 |
+
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
|
| 110 |
+
NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
|
| 111 |
+
NvtxCallbackModule module,
|
| 112 |
+
NvtxFunctionTable* out_table,
|
| 113 |
+
unsigned int* out_size);
|
| 114 |
+
NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
|
| 115 |
+
uint32_t version);
|
| 116 |
+
NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
|
| 117 |
+
uint32_t exportTableId);
|
| 118 |
+
|
| 119 |
+
#include "nvtxInitDecls.h"
|
| 120 |
+
|
| 121 |
+
/* ---- Define all globals ---- */
|
| 122 |
+
|
| 123 |
+
typedef struct nvtxGlobals_t
|
| 124 |
+
{
|
| 125 |
+
volatile unsigned int initState;
|
| 126 |
+
NvtxExportTableCallbacks etblCallbacks;
|
| 127 |
+
NvtxExportTableVersionInfo etblVersionInfo;
|
| 128 |
+
|
| 129 |
+
/* Implementation function pointers */
|
| 130 |
+
nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
|
| 131 |
+
nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
|
| 132 |
+
nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
|
| 133 |
+
nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
|
| 134 |
+
nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
|
| 135 |
+
nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
|
| 136 |
+
nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
|
| 137 |
+
nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
|
| 138 |
+
nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
|
| 139 |
+
nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
|
| 140 |
+
nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
|
| 141 |
+
nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
|
| 142 |
+
nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
|
| 143 |
+
nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
|
| 144 |
+
nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
|
| 145 |
+
|
| 146 |
+
nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
|
| 147 |
+
nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
|
| 148 |
+
nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
|
| 149 |
+
nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
|
| 150 |
+
nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
|
| 151 |
+
nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
|
| 152 |
+
nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
|
| 153 |
+
nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
|
| 154 |
+
|
| 155 |
+
nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
|
| 156 |
+
nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
|
| 157 |
+
nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
|
| 158 |
+
nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
|
| 159 |
+
nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
|
| 160 |
+
nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
|
| 161 |
+
nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
|
| 162 |
+
nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
|
| 163 |
+
nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
|
| 164 |
+
nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
|
| 165 |
+
nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
|
| 166 |
+
nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
|
| 167 |
+
nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
|
| 168 |
+
nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
|
| 169 |
+
|
| 170 |
+
nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
|
| 171 |
+
nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
|
| 172 |
+
nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
|
| 173 |
+
nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
|
| 174 |
+
nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
|
| 175 |
+
nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
|
| 176 |
+
|
| 177 |
+
nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
|
| 178 |
+
nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
|
| 179 |
+
nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
|
| 180 |
+
nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
|
| 181 |
+
nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
|
| 182 |
+
nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
|
| 183 |
+
nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
|
| 184 |
+
nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
|
| 185 |
+
nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
|
| 186 |
+
nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
|
| 187 |
+
nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
|
| 188 |
+
nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
|
| 189 |
+
nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
|
| 190 |
+
nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
|
| 191 |
+
nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
|
| 192 |
+
|
| 193 |
+
nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
|
| 194 |
+
nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
|
| 195 |
+
nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
|
| 196 |
+
nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
|
| 197 |
+
nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
|
| 198 |
+
nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
|
| 199 |
+
|
| 200 |
+
/* Tables of function pointers -- Extra null added to the end to ensure
|
| 201 |
+
* a crash instead of silent corruption if a tool reads off the end. */
|
| 202 |
+
NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
|
| 203 |
+
NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
|
| 204 |
+
NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
|
| 205 |
+
NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
|
| 206 |
+
NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
|
| 207 |
+
NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
|
| 208 |
+
} nvtxGlobals_t;
|
| 209 |
+
|
| 210 |
+
NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
|
| 211 |
+
{
|
| 212 |
+
NVTX_INIT_STATE_FRESH,
|
| 213 |
+
|
| 214 |
+
{
|
| 215 |
+
sizeof(NvtxExportTableCallbacks),
|
| 216 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
sizeof(NvtxExportTableVersionInfo),
|
| 220 |
+
NVTX_VERSION,
|
| 221 |
+
0,
|
| 222 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
|
| 223 |
+
},
|
| 224 |
+
|
| 225 |
+
/* Implementation function pointers */
|
| 226 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
|
| 227 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
|
| 228 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
|
| 229 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
|
| 230 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
|
| 231 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
|
| 232 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
|
| 233 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
|
| 234 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
|
| 235 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
|
| 236 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
|
| 237 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
|
| 238 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
|
| 239 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
|
| 240 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
|
| 241 |
+
|
| 242 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
|
| 243 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
|
| 244 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
|
| 245 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
|
| 246 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
|
| 247 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
|
| 248 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
|
| 249 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
|
| 250 |
+
|
| 251 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
|
| 252 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
|
| 253 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
|
| 254 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
|
| 255 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
|
| 256 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
|
| 257 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
|
| 258 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
|
| 259 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
|
| 260 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
|
| 261 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
|
| 262 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
|
| 263 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
|
| 264 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
|
| 265 |
+
|
| 266 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
|
| 267 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
|
| 268 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
|
| 269 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
|
| 270 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
|
| 271 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
|
| 272 |
+
|
| 273 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
|
| 274 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
|
| 275 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
|
| 276 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
|
| 277 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
|
| 278 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
|
| 279 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
|
| 280 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
|
| 281 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
|
| 282 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
|
| 283 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
|
| 284 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
|
| 285 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
|
| 286 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
|
| 287 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
|
| 288 |
+
|
| 289 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
|
| 290 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
|
| 291 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
|
| 292 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
|
| 293 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
|
| 294 |
+
NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
|
| 295 |
+
|
| 296 |
+
/* Tables of function pointers */
|
| 297 |
+
{
|
| 298 |
+
0,
|
| 299 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
|
| 300 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
|
| 301 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
|
| 302 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
|
| 303 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
|
| 304 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
|
| 305 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
|
| 306 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
|
| 307 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
|
| 308 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
|
| 309 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
|
| 310 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
|
| 311 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
|
| 312 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
|
| 313 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
|
| 314 |
+
0
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
0,
|
| 318 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
|
| 319 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
|
| 320 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
|
| 321 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
|
| 322 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
|
| 323 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
|
| 324 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
|
| 325 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
|
| 326 |
+
0
|
| 327 |
+
},
|
| 328 |
+
{
|
| 329 |
+
0,
|
| 330 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
|
| 331 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
|
| 332 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
|
| 333 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
|
| 334 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
|
| 335 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
|
| 336 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
|
| 337 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
|
| 338 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
|
| 339 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
|
| 340 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
|
| 341 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
|
| 342 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
|
| 343 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
|
| 344 |
+
0
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
0,
|
| 348 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
|
| 349 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
|
| 350 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
|
| 351 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
|
| 352 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
|
| 353 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
|
| 354 |
+
0
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
0,
|
| 358 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
|
| 359 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
|
| 360 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
|
| 361 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
|
| 362 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
|
| 363 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
|
| 364 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
|
| 365 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
|
| 366 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
|
| 367 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
|
| 368 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
|
| 369 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
|
| 370 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
|
| 371 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
|
| 372 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
|
| 373 |
+
0
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
0,
|
| 377 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
|
| 378 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
|
| 379 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
|
| 380 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
|
| 381 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
|
| 382 |
+
(NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
|
| 383 |
+
0
|
| 384 |
+
}
|
| 385 |
+
};
|
| 386 |
+
|
| 387 |
+
/* ---- Define static inline implementations of core API functions ---- */
|
| 388 |
+
|
| 389 |
+
#include "nvtxImplCore.h"
|
| 390 |
+
|
| 391 |
+
/* ---- Define implementations of export table functions ---- */
|
| 392 |
+
|
| 393 |
+
NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
|
| 394 |
+
NvtxCallbackModule module,
|
| 395 |
+
NvtxFunctionTable* out_table,
|
| 396 |
+
unsigned int* out_size)
|
| 397 |
+
{
|
| 398 |
+
unsigned int bytes = 0;
|
| 399 |
+
NvtxFunctionTable table = (NvtxFunctionTable)0;
|
| 400 |
+
|
| 401 |
+
switch (module)
|
| 402 |
+
{
|
| 403 |
+
case NVTX_CB_MODULE_CORE:
|
| 404 |
+
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
|
| 405 |
+
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
|
| 406 |
+
break;
|
| 407 |
+
case NVTX_CB_MODULE_CUDA:
|
| 408 |
+
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
|
| 409 |
+
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
|
| 410 |
+
break;
|
| 411 |
+
case NVTX_CB_MODULE_OPENCL:
|
| 412 |
+
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
|
| 413 |
+
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
|
| 414 |
+
break;
|
| 415 |
+
case NVTX_CB_MODULE_CUDART:
|
| 416 |
+
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
|
| 417 |
+
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
|
| 418 |
+
break;
|
| 419 |
+
case NVTX_CB_MODULE_CORE2:
|
| 420 |
+
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
|
| 421 |
+
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
|
| 422 |
+
break;
|
| 423 |
+
case NVTX_CB_MODULE_SYNC:
|
| 424 |
+
table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
|
| 425 |
+
bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
|
| 426 |
+
break;
|
| 427 |
+
default: return 0;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
if (out_size)
|
| 431 |
+
*out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
|
| 432 |
+
|
| 433 |
+
if (out_table)
|
| 434 |
+
*out_table = table;
|
| 435 |
+
|
| 436 |
+
return 1;
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
|
| 440 |
+
{
|
| 441 |
+
switch (exportTableId)
|
| 442 |
+
{
|
| 443 |
+
case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
|
| 444 |
+
case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
|
| 445 |
+
default: return 0;
|
| 446 |
+
}
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
|
| 450 |
+
{
|
| 451 |
+
/* Reserved for custom implementations to resolve problems with tools */
|
| 452 |
+
(void)version;
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
/* ---- Define implementations of init versions of all API functions ---- */
|
| 456 |
+
|
| 457 |
+
#include "nvtxInitDefs.h"
|
| 458 |
+
|
| 459 |
+
/* ---- Define implementations of initialization functions ---- */
|
| 460 |
+
|
| 461 |
+
#include "nvtxInit.h"
|
| 462 |
+
|
| 463 |
+
#ifdef __GNUC__
|
| 464 |
+
#pragma GCC visibility pop
|
| 465 |
+
#endif
|
| 466 |
+
|
| 467 |
+
#ifdef __cplusplus
|
| 468 |
+
} /* extern "C" */
|
| 469 |
+
#endif /* __cplusplus */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (216 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/lib/libnvToolsExt.so.1
ADDED
|
Binary file (40.1 kB). View file
|
|
|