Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__init__.py +51 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_util.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_windows.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_unix.py +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_util.py +47 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_windows.py +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc +3 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h +100 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h +1083 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h +112 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h +690 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h +825 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h +71 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h +659 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h +642 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp +1546 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h +373 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h +148 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h +81 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h +60 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.h +123 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h +281 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.h +175 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h +93 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h +697 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h +127 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h +194 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h +164 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h +214 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h +220 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/WHEEL +5 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/top_level.txt +1 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__init__.py +4 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport2.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyxbuild.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/_pyximport3.py +478 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.py +30 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_classes.py +55 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_deploy.py +105 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_linalg_utils.py +164 -0
.gitattributes
CHANGED
|
@@ -62,3 +62,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__py
|
|
| 62 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 63 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 64 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 62 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 63 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
| 64 |
tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__init__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A platform independent file lock that supports the with-statement.
|
| 3 |
+
|
| 4 |
+
.. autodata:: filelock.__version__
|
| 5 |
+
:no-value:
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import sys
|
| 11 |
+
import warnings
|
| 12 |
+
from typing import TYPE_CHECKING
|
| 13 |
+
|
| 14 |
+
from ._api import AcquireReturnProxy, BaseFileLock
|
| 15 |
+
from ._error import Timeout
|
| 16 |
+
from ._soft import SoftFileLock
|
| 17 |
+
from ._unix import UnixFileLock, has_fcntl
|
| 18 |
+
from ._windows import WindowsFileLock
|
| 19 |
+
from .version import version
|
| 20 |
+
|
| 21 |
+
#: version of the project as a string
|
| 22 |
+
__version__: str = version
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
if sys.platform == "win32": # pragma: win32 cover
|
| 26 |
+
_FileLock: type[BaseFileLock] = WindowsFileLock
|
| 27 |
+
else: # pragma: win32 no cover # noqa: PLR5501
|
| 28 |
+
if has_fcntl:
|
| 29 |
+
_FileLock: type[BaseFileLock] = UnixFileLock
|
| 30 |
+
else:
|
| 31 |
+
_FileLock = SoftFileLock
|
| 32 |
+
if warnings is not None:
|
| 33 |
+
warnings.warn("only soft file lock is available", stacklevel=2)
|
| 34 |
+
|
| 35 |
+
if TYPE_CHECKING:
|
| 36 |
+
FileLock = SoftFileLock
|
| 37 |
+
else:
|
| 38 |
+
#: Alias for the lock, which should be used for the current platform.
|
| 39 |
+
FileLock = _FileLock
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
__all__ = [
|
| 43 |
+
"__version__",
|
| 44 |
+
"FileLock",
|
| 45 |
+
"SoftFileLock",
|
| 46 |
+
"Timeout",
|
| 47 |
+
"UnixFileLock",
|
| 48 |
+
"WindowsFileLock",
|
| 49 |
+
"BaseFileLock",
|
| 50 |
+
"AcquireReturnProxy",
|
| 51 |
+
]
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_util.cpython-311.pyc
ADDED
|
Binary file (2.2 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_windows.cpython-311.pyc
ADDED
|
Binary file (3.68 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_unix.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from contextlib import suppress
|
| 6 |
+
from errno import ENOSYS
|
| 7 |
+
from typing import cast
|
| 8 |
+
|
| 9 |
+
from ._api import BaseFileLock
|
| 10 |
+
from ._util import ensure_directory_exists
|
| 11 |
+
|
| 12 |
+
#: a flag to indicate if the fcntl API is available
|
| 13 |
+
has_fcntl = False
|
| 14 |
+
if sys.platform == "win32": # pragma: win32 cover
|
| 15 |
+
|
| 16 |
+
class UnixFileLock(BaseFileLock):
|
| 17 |
+
"""Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
|
| 18 |
+
|
| 19 |
+
def _acquire(self) -> None:
|
| 20 |
+
raise NotImplementedError
|
| 21 |
+
|
| 22 |
+
def _release(self) -> None:
|
| 23 |
+
raise NotImplementedError
|
| 24 |
+
|
| 25 |
+
else: # pragma: win32 no cover
|
| 26 |
+
try:
|
| 27 |
+
import fcntl
|
| 28 |
+
except ImportError:
|
| 29 |
+
pass
|
| 30 |
+
else:
|
| 31 |
+
has_fcntl = True
|
| 32 |
+
|
| 33 |
+
class UnixFileLock(BaseFileLock):
|
| 34 |
+
"""Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
|
| 35 |
+
|
| 36 |
+
def _acquire(self) -> None:
|
| 37 |
+
ensure_directory_exists(self.lock_file)
|
| 38 |
+
open_flags = os.O_RDWR | os.O_CREAT | os.O_TRUNC
|
| 39 |
+
fd = os.open(self.lock_file, open_flags, self._context.mode)
|
| 40 |
+
with suppress(PermissionError): # This locked is not owned by this UID
|
| 41 |
+
os.fchmod(fd, self._context.mode)
|
| 42 |
+
try:
|
| 43 |
+
fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
| 44 |
+
except OSError as exception:
|
| 45 |
+
os.close(fd)
|
| 46 |
+
if exception.errno == ENOSYS: # NotImplemented error
|
| 47 |
+
msg = "FileSystem does not appear to support flock; user SoftFileLock instead"
|
| 48 |
+
raise NotImplementedError(msg) from exception
|
| 49 |
+
else:
|
| 50 |
+
self._context.lock_file_fd = fd
|
| 51 |
+
|
| 52 |
+
def _release(self) -> None:
|
| 53 |
+
# Do not remove the lockfile:
|
| 54 |
+
# https://github.com/tox-dev/py-filelock/issues/31
|
| 55 |
+
# https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
|
| 56 |
+
fd = cast(int, self._context.lock_file_fd)
|
| 57 |
+
self._context.lock_file_fd = None
|
| 58 |
+
fcntl.flock(fd, fcntl.LOCK_UN)
|
| 59 |
+
os.close(fd)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
__all__ = [
|
| 63 |
+
"has_fcntl",
|
| 64 |
+
"UnixFileLock",
|
| 65 |
+
]
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_util.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import stat
|
| 5 |
+
import sys
|
| 6 |
+
from errno import EACCES, EISDIR
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def raise_on_not_writable_file(filename: str) -> None:
|
| 11 |
+
"""
|
| 12 |
+
Raise an exception if attempting to open the file for writing would fail.
|
| 13 |
+
This is done so files that will never be writable can be separated from
|
| 14 |
+
files that are writable but currently locked
|
| 15 |
+
:param filename: file to check
|
| 16 |
+
:raises OSError: as if the file was opened for writing.
|
| 17 |
+
"""
|
| 18 |
+
try: # use stat to do exists + can write to check without race condition
|
| 19 |
+
file_stat = os.stat(filename) # noqa: PTH116
|
| 20 |
+
except OSError:
|
| 21 |
+
return # swallow does not exist or other errors
|
| 22 |
+
|
| 23 |
+
if file_stat.st_mtime != 0: # if os.stat returns but modification is zero that's an invalid os.stat - ignore it
|
| 24 |
+
if not (file_stat.st_mode & stat.S_IWUSR):
|
| 25 |
+
raise PermissionError(EACCES, "Permission denied", filename)
|
| 26 |
+
|
| 27 |
+
if stat.S_ISDIR(file_stat.st_mode):
|
| 28 |
+
if sys.platform == "win32": # pragma: win32 cover
|
| 29 |
+
# On Windows, this is PermissionError
|
| 30 |
+
raise PermissionError(EACCES, "Permission denied", filename)
|
| 31 |
+
else: # pragma: win32 no cover # noqa: RET506
|
| 32 |
+
# On linux / macOS, this is IsADirectoryError
|
| 33 |
+
raise IsADirectoryError(EISDIR, "Is a directory", filename)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def ensure_directory_exists(filename: Path | str) -> None:
|
| 37 |
+
"""
|
| 38 |
+
Ensure the directory containing the file exists (create it if necessary)
|
| 39 |
+
:param filename: file.
|
| 40 |
+
"""
|
| 41 |
+
Path(filename).parent.mkdir(parents=True, exist_ok=True)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
__all__ = [
|
| 45 |
+
"raise_on_not_writable_file",
|
| 46 |
+
"ensure_directory_exists",
|
| 47 |
+
]
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_windows.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from contextlib import suppress
|
| 6 |
+
from errno import EACCES
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import cast
|
| 9 |
+
|
| 10 |
+
from ._api import BaseFileLock
|
| 11 |
+
from ._util import ensure_directory_exists, raise_on_not_writable_file
|
| 12 |
+
|
| 13 |
+
if sys.platform == "win32": # pragma: win32 cover
|
| 14 |
+
import msvcrt
|
| 15 |
+
|
| 16 |
+
class WindowsFileLock(BaseFileLock):
|
| 17 |
+
"""Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
|
| 18 |
+
|
| 19 |
+
def _acquire(self) -> None:
|
| 20 |
+
raise_on_not_writable_file(self.lock_file)
|
| 21 |
+
ensure_directory_exists(self.lock_file)
|
| 22 |
+
flags = (
|
| 23 |
+
os.O_RDWR # open for read and write
|
| 24 |
+
| os.O_CREAT # create file if not exists
|
| 25 |
+
| os.O_TRUNC # truncate file if not empty
|
| 26 |
+
)
|
| 27 |
+
try:
|
| 28 |
+
fd = os.open(self.lock_file, flags, self._context.mode)
|
| 29 |
+
except OSError as exception:
|
| 30 |
+
if exception.errno != EACCES: # has no access to this lock
|
| 31 |
+
raise
|
| 32 |
+
else:
|
| 33 |
+
try:
|
| 34 |
+
msvcrt.locking(fd, msvcrt.LK_NBLCK, 1)
|
| 35 |
+
except OSError as exception:
|
| 36 |
+
os.close(fd) # close file first
|
| 37 |
+
if exception.errno != EACCES: # file is already locked
|
| 38 |
+
raise
|
| 39 |
+
else:
|
| 40 |
+
self._context.lock_file_fd = fd
|
| 41 |
+
|
| 42 |
+
def _release(self) -> None:
|
| 43 |
+
fd = cast(int, self._context.lock_file_fd)
|
| 44 |
+
self._context.lock_file_fd = None
|
| 45 |
+
msvcrt.locking(fd, msvcrt.LK_UNLCK, 1)
|
| 46 |
+
os.close(fd)
|
| 47 |
+
|
| 48 |
+
with suppress(OSError): # Probably another instance of the application hat acquired the file lock.
|
| 49 |
+
Path(self.lock_file).unlink()
|
| 50 |
+
|
| 51 |
+
else: # pragma: win32 no cover
|
| 52 |
+
|
| 53 |
+
class WindowsFileLock(BaseFileLock):
|
| 54 |
+
"""Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
|
| 55 |
+
|
| 56 |
+
def _acquire(self) -> None:
|
| 57 |
+
raise NotImplementedError
|
| 58 |
+
|
| 59 |
+
def _release(self) -> None:
|
| 60 |
+
raise NotImplementedError
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
__all__ = [
|
| 64 |
+
"WindowsFileLock",
|
| 65 |
+
]
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fac5cd5bfbd06bb4a9b6ca2c30c684bea761aa5b6dbe0c019ed92f1f4a7d8143
|
| 3 |
+
size 142559
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (218 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#include <cuda_stdint.h>
|
| 51 |
+
#include "Openmp/omp-tools.h"
|
| 52 |
+
|
| 53 |
+
#if !defined(_CUPTI_OPENMP_H_)
|
| 54 |
+
#define _CUPTI_OPENMP_H_
|
| 55 |
+
|
| 56 |
+
#ifndef CUPTIAPI
|
| 57 |
+
#ifdef _WIN32
|
| 58 |
+
#define CUPTIAPI __stdcall
|
| 59 |
+
#else
|
| 60 |
+
#define CUPTIAPI
|
| 61 |
+
#endif
|
| 62 |
+
#endif
|
| 63 |
+
|
| 64 |
+
#if defined(__LP64__)
|
| 65 |
+
#define CUPTILP64 1
|
| 66 |
+
#elif defined(_WIN64)
|
| 67 |
+
#define CUPTILP64 1
|
| 68 |
+
#else
|
| 69 |
+
#undef CUPTILP64
|
| 70 |
+
#endif
|
| 71 |
+
|
| 72 |
+
#if defined(__cplusplus)
|
| 73 |
+
extern "C" {
|
| 74 |
+
#endif
|
| 75 |
+
|
| 76 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 77 |
+
#pragma GCC visibility push(default)
|
| 78 |
+
#endif
|
| 79 |
+
|
| 80 |
+
/**
|
| 81 |
+
* \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0)
|
| 82 |
+
*
|
| 83 |
+
*/
|
| 84 |
+
int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
|
| 85 |
+
|
| 86 |
+
/**
|
| 87 |
+
* \brief Initialize OPENMP support
|
| 88 |
+
*
|
| 89 |
+
*/
|
| 90 |
+
int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
|
| 91 |
+
|
| 92 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 93 |
+
#pragma GCC visibility pop
|
| 94 |
+
#endif
|
| 95 |
+
|
| 96 |
+
#if defined(__cplusplus)
|
| 97 |
+
}
|
| 98 |
+
#endif
|
| 99 |
+
|
| 100 |
+
#endif /*_CUPTI_OPENMP_H_*/
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h
ADDED
|
@@ -0,0 +1,1083 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* include/50/omp-tools.h.var
|
| 3 |
+
*/
|
| 4 |
+
|
| 5 |
+
//===----------------------------------------------------------------------===//
|
| 6 |
+
//
|
| 7 |
+
// The LLVM Compiler Infrastructure
|
| 8 |
+
//
|
| 9 |
+
// This file is dual licensed under the MIT and the University of Illinois Open
|
| 10 |
+
// Source Licenses. See LICENSE.txt for details.
|
| 11 |
+
//
|
| 12 |
+
//===----------------------------------------------------------------------===//
|
| 13 |
+
|
| 14 |
+
#ifndef __OMPT__
|
| 15 |
+
#define __OMPT__
|
| 16 |
+
|
| 17 |
+
/*****************************************************************************
|
| 18 |
+
* system include files
|
| 19 |
+
*****************************************************************************/
|
| 20 |
+
|
| 21 |
+
#include <stdint.h>
|
| 22 |
+
#include <stddef.h>
|
| 23 |
+
|
| 24 |
+
/*****************************************************************************
|
| 25 |
+
* iteration macros
|
| 26 |
+
*****************************************************************************/
|
| 27 |
+
|
| 28 |
+
#define FOREACH_OMPT_INQUIRY_FN(macro) \
|
| 29 |
+
macro (ompt_enumerate_states) \
|
| 30 |
+
macro (ompt_enumerate_mutex_impls) \
|
| 31 |
+
\
|
| 32 |
+
macro (ompt_set_callback) \
|
| 33 |
+
macro (ompt_get_callback) \
|
| 34 |
+
\
|
| 35 |
+
macro (ompt_get_state) \
|
| 36 |
+
\
|
| 37 |
+
macro (ompt_get_parallel_info) \
|
| 38 |
+
macro (ompt_get_task_info) \
|
| 39 |
+
macro (ompt_get_task_memory) \
|
| 40 |
+
macro (ompt_get_thread_data) \
|
| 41 |
+
macro (ompt_get_unique_id) \
|
| 42 |
+
macro (ompt_finalize_tool) \
|
| 43 |
+
\
|
| 44 |
+
macro(ompt_get_num_procs) \
|
| 45 |
+
macro(ompt_get_num_places) \
|
| 46 |
+
macro(ompt_get_place_proc_ids) \
|
| 47 |
+
macro(ompt_get_place_num) \
|
| 48 |
+
macro(ompt_get_partition_place_nums) \
|
| 49 |
+
macro(ompt_get_proc_id) \
|
| 50 |
+
\
|
| 51 |
+
macro(ompt_get_target_info) \
|
| 52 |
+
macro(ompt_get_num_devices)
|
| 53 |
+
|
| 54 |
+
#define FOREACH_OMPT_STATE(macro) \
|
| 55 |
+
\
|
| 56 |
+
/* first available state */ \
|
| 57 |
+
macro (ompt_state_undefined, 0x102) /* undefined thread state */ \
|
| 58 |
+
\
|
| 59 |
+
/* work states (0..15) */ \
|
| 60 |
+
macro (ompt_state_work_serial, 0x000) /* working outside parallel */ \
|
| 61 |
+
macro (ompt_state_work_parallel, 0x001) /* working within parallel */ \
|
| 62 |
+
macro (ompt_state_work_reduction, 0x002) /* performing a reduction */ \
|
| 63 |
+
\
|
| 64 |
+
/* barrier wait states (16..31) */ \
|
| 65 |
+
macro (ompt_state_wait_barrier, 0x010) /* waiting at a barrier */ \
|
| 66 |
+
macro (ompt_state_wait_barrier_implicit_parallel, 0x011) \
|
| 67 |
+
/* implicit barrier at the end of parallel region */\
|
| 68 |
+
macro (ompt_state_wait_barrier_implicit_workshare, 0x012) \
|
| 69 |
+
/* implicit barrier at the end of worksharing */ \
|
| 70 |
+
macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \
|
| 71 |
+
macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \
|
| 72 |
+
\
|
| 73 |
+
/* task wait states (32..63) */ \
|
| 74 |
+
macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \
|
| 75 |
+
macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \
|
| 76 |
+
\
|
| 77 |
+
/* mutex wait states (64..127) */ \
|
| 78 |
+
macro (ompt_state_wait_mutex, 0x040) \
|
| 79 |
+
macro (ompt_state_wait_lock, 0x041) /* waiting for lock */ \
|
| 80 |
+
macro (ompt_state_wait_critical, 0x042) /* waiting for critical */ \
|
| 81 |
+
macro (ompt_state_wait_atomic, 0x043) /* waiting for atomic */ \
|
| 82 |
+
macro (ompt_state_wait_ordered, 0x044) /* waiting for ordered */ \
|
| 83 |
+
\
|
| 84 |
+
/* target wait states (128..255) */ \
|
| 85 |
+
macro (ompt_state_wait_target, 0x080) /* waiting for target region */ \
|
| 86 |
+
macro (ompt_state_wait_target_map, 0x081) /* waiting for target data mapping operation */ \
|
| 87 |
+
macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */ \
|
| 88 |
+
\
|
| 89 |
+
/* misc (256..511) */ \
|
| 90 |
+
macro (ompt_state_idle, 0x100) /* waiting for work */ \
|
| 91 |
+
macro (ompt_state_overhead, 0x101) /* overhead excluding wait states */ \
|
| 92 |
+
\
|
| 93 |
+
/* implementation-specific states (512..) */
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
#define FOREACH_KMP_MUTEX_IMPL(macro) \
|
| 97 |
+
macro (kmp_mutex_impl_none, 0) /* unknown implementation */ \
|
| 98 |
+
macro (kmp_mutex_impl_spin, 1) /* based on spin */ \
|
| 99 |
+
macro (kmp_mutex_impl_queuing, 2) /* based on some fair policy */ \
|
| 100 |
+
macro (kmp_mutex_impl_speculative, 3) /* based on HW-supported speculation */
|
| 101 |
+
|
| 102 |
+
#define FOREACH_OMPT_EVENT(macro) \
|
| 103 |
+
\
|
| 104 |
+
/*--- Mandatory Events ---*/ \
|
| 105 |
+
macro (ompt_callback_thread_begin, ompt_callback_thread_begin_t, 1) /* thread begin */ \
|
| 106 |
+
macro (ompt_callback_thread_end, ompt_callback_thread_end_t, 2) /* thread end */ \
|
| 107 |
+
\
|
| 108 |
+
macro (ompt_callback_parallel_begin, ompt_callback_parallel_begin_t, 3) /* parallel begin */ \
|
| 109 |
+
macro (ompt_callback_parallel_end, ompt_callback_parallel_end_t, 4) /* parallel end */ \
|
| 110 |
+
\
|
| 111 |
+
macro (ompt_callback_task_create, ompt_callback_task_create_t, 5) /* task begin */ \
|
| 112 |
+
macro (ompt_callback_task_schedule, ompt_callback_task_schedule_t, 6) /* task schedule */ \
|
| 113 |
+
macro (ompt_callback_implicit_task, ompt_callback_implicit_task_t, 7) /* implicit task */ \
|
| 114 |
+
\
|
| 115 |
+
macro (ompt_callback_target, ompt_callback_target_t, 8) /* target */ \
|
| 116 |
+
macro (ompt_callback_target_data_op, ompt_callback_target_data_op_t, 9) /* target data op */ \
|
| 117 |
+
macro (ompt_callback_target_submit, ompt_callback_target_submit_t, 10) /* target submit */ \
|
| 118 |
+
\
|
| 119 |
+
macro (ompt_callback_control_tool, ompt_callback_control_tool_t, 11) /* control tool */ \
|
| 120 |
+
\
|
| 121 |
+
macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize */ \
|
| 122 |
+
macro (ompt_callback_device_finalize, ompt_callback_device_finalize_t, 13) /* device finalize */ \
|
| 123 |
+
\
|
| 124 |
+
macro (ompt_callback_device_load, ompt_callback_device_load_t, 14) /* device load */ \
|
| 125 |
+
macro (ompt_callback_device_unload, ompt_callback_device_unload_t, 15) /* device unload */ \
|
| 126 |
+
\
|
| 127 |
+
/* Optional Events */ \
|
| 128 |
+
macro (ompt_callback_sync_region_wait, ompt_callback_sync_region_t, 16) /* sync region wait begin or end */ \
|
| 129 |
+
\
|
| 130 |
+
macro (ompt_callback_mutex_released, ompt_callback_mutex_t, 17) /* mutex released */ \
|
| 131 |
+
\
|
| 132 |
+
macro (ompt_callback_dependences, ompt_callback_dependences_t, 18) /* report task dependences */ \
|
| 133 |
+
macro (ompt_callback_task_dependence, ompt_callback_task_dependence_t, 19) /* report task dependence */ \
|
| 134 |
+
\
|
| 135 |
+
macro (ompt_callback_work, ompt_callback_work_t, 20) /* task at work begin or end */ \
|
| 136 |
+
\
|
| 137 |
+
macro (ompt_callback_master, ompt_callback_master_t, 21) /* task at master begin or end */ \
|
| 138 |
+
\
|
| 139 |
+
macro (ompt_callback_target_map, ompt_callback_target_map_t, 22) /* target map */ \
|
| 140 |
+
\
|
| 141 |
+
macro (ompt_callback_sync_region, ompt_callback_sync_region_t, 23) /* sync region begin or end */ \
|
| 142 |
+
\
|
| 143 |
+
macro (ompt_callback_lock_init, ompt_callback_mutex_acquire_t, 24) /* lock init */ \
|
| 144 |
+
macro (ompt_callback_lock_destroy, ompt_callback_mutex_t, 25) /* lock destroy */ \
|
| 145 |
+
\
|
| 146 |
+
macro (ompt_callback_mutex_acquire, ompt_callback_mutex_acquire_t, 26) /* mutex acquire */ \
|
| 147 |
+
macro (ompt_callback_mutex_acquired, ompt_callback_mutex_t, 27) /* mutex acquired */ \
|
| 148 |
+
\
|
| 149 |
+
macro (ompt_callback_nest_lock, ompt_callback_nest_lock_t, 28) /* nest lock */ \
|
| 150 |
+
\
|
| 151 |
+
macro (ompt_callback_flush, ompt_callback_flush_t, 29) /* after executing flush */ \
|
| 152 |
+
\
|
| 153 |
+
macro (ompt_callback_cancel, ompt_callback_cancel_t, 30) /* cancel innermost binding region */ \
|
| 154 |
+
\
|
| 155 |
+
macro (ompt_callback_reduction, ompt_callback_sync_region_t, 31) /* reduction */ \
|
| 156 |
+
\
|
| 157 |
+
macro (ompt_callback_dispatch, ompt_callback_dispatch_t, 32) /* dispatch of work */
|
| 158 |
+
|
| 159 |
+
/*****************************************************************************
|
| 160 |
+
* implementation specific types
|
| 161 |
+
*****************************************************************************/
|
| 162 |
+
|
| 163 |
+
typedef enum kmp_mutex_impl_t {
|
| 164 |
+
#define kmp_mutex_impl_macro(impl, code) impl = code,
|
| 165 |
+
FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
|
| 166 |
+
#undef kmp_mutex_impl_macro
|
| 167 |
+
} kmp_mutex_impl_t;
|
| 168 |
+
|
| 169 |
+
/*****************************************************************************
|
| 170 |
+
* definitions generated from spec
|
| 171 |
+
*****************************************************************************/
|
| 172 |
+
|
| 173 |
+
typedef enum ompt_callbacks_t {
|
| 174 |
+
ompt_callback_thread_begin = 1,
|
| 175 |
+
ompt_callback_thread_end = 2,
|
| 176 |
+
ompt_callback_parallel_begin = 3,
|
| 177 |
+
ompt_callback_parallel_end = 4,
|
| 178 |
+
ompt_callback_task_create = 5,
|
| 179 |
+
ompt_callback_task_schedule = 6,
|
| 180 |
+
ompt_callback_implicit_task = 7,
|
| 181 |
+
ompt_callback_target = 8,
|
| 182 |
+
ompt_callback_target_data_op = 9,
|
| 183 |
+
ompt_callback_target_submit = 10,
|
| 184 |
+
ompt_callback_control_tool = 11,
|
| 185 |
+
ompt_callback_device_initialize = 12,
|
| 186 |
+
ompt_callback_device_finalize = 13,
|
| 187 |
+
ompt_callback_device_load = 14,
|
| 188 |
+
ompt_callback_device_unload = 15,
|
| 189 |
+
ompt_callback_sync_region_wait = 16,
|
| 190 |
+
ompt_callback_mutex_released = 17,
|
| 191 |
+
ompt_callback_dependences = 18,
|
| 192 |
+
ompt_callback_task_dependence = 19,
|
| 193 |
+
ompt_callback_work = 20,
|
| 194 |
+
ompt_callback_master = 21,
|
| 195 |
+
ompt_callback_target_map = 22,
|
| 196 |
+
ompt_callback_sync_region = 23,
|
| 197 |
+
ompt_callback_lock_init = 24,
|
| 198 |
+
ompt_callback_lock_destroy = 25,
|
| 199 |
+
ompt_callback_mutex_acquire = 26,
|
| 200 |
+
ompt_callback_mutex_acquired = 27,
|
| 201 |
+
ompt_callback_nest_lock = 28,
|
| 202 |
+
ompt_callback_flush = 29,
|
| 203 |
+
ompt_callback_cancel = 30,
|
| 204 |
+
ompt_callback_reduction = 31,
|
| 205 |
+
ompt_callback_dispatch = 32
|
| 206 |
+
} ompt_callbacks_t;
|
| 207 |
+
|
| 208 |
+
typedef enum ompt_record_t {
|
| 209 |
+
ompt_record_ompt = 1,
|
| 210 |
+
ompt_record_native = 2,
|
| 211 |
+
ompt_record_invalid = 3
|
| 212 |
+
} ompt_record_t;
|
| 213 |
+
|
| 214 |
+
typedef enum ompt_record_native_t {
|
| 215 |
+
ompt_record_native_info = 1,
|
| 216 |
+
ompt_record_native_event = 2
|
| 217 |
+
} ompt_record_native_t;
|
| 218 |
+
|
| 219 |
+
typedef enum ompt_set_result_t {
|
| 220 |
+
ompt_set_error = 0,
|
| 221 |
+
ompt_set_never = 1,
|
| 222 |
+
ompt_set_impossible = 2,
|
| 223 |
+
ompt_set_sometimes = 3,
|
| 224 |
+
ompt_set_sometimes_paired = 4,
|
| 225 |
+
ompt_set_always = 5
|
| 226 |
+
} ompt_set_result_t;
|
| 227 |
+
|
| 228 |
+
typedef uint64_t ompt_id_t;
|
| 229 |
+
|
| 230 |
+
typedef uint64_t ompt_device_time_t;
|
| 231 |
+
|
| 232 |
+
typedef uint64_t ompt_buffer_cursor_t;
|
| 233 |
+
|
| 234 |
+
typedef enum ompt_thread_t {
|
| 235 |
+
ompt_thread_initial = 1,
|
| 236 |
+
ompt_thread_worker = 2,
|
| 237 |
+
ompt_thread_other = 3,
|
| 238 |
+
ompt_thread_unknown = 4
|
| 239 |
+
} ompt_thread_t;
|
| 240 |
+
|
| 241 |
+
typedef enum ompt_scope_endpoint_t {
|
| 242 |
+
ompt_scope_begin = 1,
|
| 243 |
+
ompt_scope_end = 2
|
| 244 |
+
} ompt_scope_endpoint_t;
|
| 245 |
+
|
| 246 |
+
typedef enum ompt_dispatch_t {
|
| 247 |
+
ompt_dispatch_iteration = 1,
|
| 248 |
+
ompt_dispatch_section = 2
|
| 249 |
+
} ompt_dispatch_t;
|
| 250 |
+
|
| 251 |
+
typedef enum ompt_sync_region_t {
|
| 252 |
+
ompt_sync_region_barrier = 1,
|
| 253 |
+
ompt_sync_region_barrier_implicit = 2,
|
| 254 |
+
ompt_sync_region_barrier_explicit = 3,
|
| 255 |
+
ompt_sync_region_barrier_implementation = 4,
|
| 256 |
+
ompt_sync_region_taskwait = 5,
|
| 257 |
+
ompt_sync_region_taskgroup = 6,
|
| 258 |
+
ompt_sync_region_reduction = 7
|
| 259 |
+
} ompt_sync_region_t;
|
| 260 |
+
|
| 261 |
+
typedef enum ompt_target_data_op_t {
|
| 262 |
+
ompt_target_data_alloc = 1,
|
| 263 |
+
ompt_target_data_transfer_to_device = 2,
|
| 264 |
+
ompt_target_data_transfer_from_device = 3,
|
| 265 |
+
ompt_target_data_delete = 4,
|
| 266 |
+
ompt_target_data_associate = 5,
|
| 267 |
+
ompt_target_data_disassociate = 6
|
| 268 |
+
} ompt_target_data_op_t;
|
| 269 |
+
|
| 270 |
+
typedef enum ompt_work_t {
|
| 271 |
+
ompt_work_loop = 1,
|
| 272 |
+
ompt_work_sections = 2,
|
| 273 |
+
ompt_work_single_executor = 3,
|
| 274 |
+
ompt_work_single_other = 4,
|
| 275 |
+
ompt_work_workshare = 5,
|
| 276 |
+
ompt_work_distribute = 6,
|
| 277 |
+
ompt_work_taskloop = 7
|
| 278 |
+
} ompt_work_t;
|
| 279 |
+
|
| 280 |
+
typedef enum ompt_mutex_t {
|
| 281 |
+
ompt_mutex_lock = 1,
|
| 282 |
+
ompt_mutex_test_lock = 2,
|
| 283 |
+
ompt_mutex_nest_lock = 3,
|
| 284 |
+
ompt_mutex_test_nest_lock = 4,
|
| 285 |
+
ompt_mutex_critical = 5,
|
| 286 |
+
ompt_mutex_atomic = 6,
|
| 287 |
+
ompt_mutex_ordered = 7
|
| 288 |
+
} ompt_mutex_t;
|
| 289 |
+
|
| 290 |
+
typedef enum ompt_native_mon_flag_t {
|
| 291 |
+
ompt_native_data_motion_explicit = 0x01,
|
| 292 |
+
ompt_native_data_motion_implicit = 0x02,
|
| 293 |
+
ompt_native_kernel_invocation = 0x04,
|
| 294 |
+
ompt_native_kernel_execution = 0x08,
|
| 295 |
+
ompt_native_driver = 0x10,
|
| 296 |
+
ompt_native_runtime = 0x20,
|
| 297 |
+
ompt_native_overhead = 0x40,
|
| 298 |
+
ompt_native_idleness = 0x80
|
| 299 |
+
} ompt_native_mon_flag_t;
|
| 300 |
+
|
| 301 |
+
typedef enum ompt_task_flag_t {
|
| 302 |
+
ompt_task_initial = 0x00000001,
|
| 303 |
+
ompt_task_implicit = 0x00000002,
|
| 304 |
+
ompt_task_explicit = 0x00000004,
|
| 305 |
+
ompt_task_target = 0x00000008,
|
| 306 |
+
ompt_task_undeferred = 0x08000000,
|
| 307 |
+
ompt_task_untied = 0x10000000,
|
| 308 |
+
ompt_task_final = 0x20000000,
|
| 309 |
+
ompt_task_mergeable = 0x40000000,
|
| 310 |
+
ompt_task_merged = 0x80000000
|
| 311 |
+
} ompt_task_flag_t;
|
| 312 |
+
|
| 313 |
+
typedef enum ompt_task_status_t {
|
| 314 |
+
ompt_task_complete = 1,
|
| 315 |
+
ompt_task_yield = 2,
|
| 316 |
+
ompt_task_cancel = 3,
|
| 317 |
+
ompt_task_detach = 4,
|
| 318 |
+
ompt_task_early_fulfill = 5,
|
| 319 |
+
ompt_task_late_fulfill = 6,
|
| 320 |
+
ompt_task_switch = 7
|
| 321 |
+
} ompt_task_status_t;
|
| 322 |
+
|
| 323 |
+
typedef enum ompt_target_t {
|
| 324 |
+
ompt_target = 1,
|
| 325 |
+
ompt_target_enter_data = 2,
|
| 326 |
+
ompt_target_exit_data = 3,
|
| 327 |
+
ompt_target_update = 4
|
| 328 |
+
} ompt_target_t;
|
| 329 |
+
|
| 330 |
+
typedef enum ompt_parallel_flag_t {
|
| 331 |
+
ompt_parallel_invoker_program = 0x00000001,
|
| 332 |
+
ompt_parallel_invoker_runtime = 0x00000002,
|
| 333 |
+
ompt_parallel_league = 0x40000000,
|
| 334 |
+
ompt_parallel_team = 0x80000000
|
| 335 |
+
} ompt_parallel_flag_t;
|
| 336 |
+
|
| 337 |
+
typedef enum ompt_target_map_flag_t {
|
| 338 |
+
ompt_target_map_flag_to = 0x01,
|
| 339 |
+
ompt_target_map_flag_from = 0x02,
|
| 340 |
+
ompt_target_map_flag_alloc = 0x04,
|
| 341 |
+
ompt_target_map_flag_release = 0x08,
|
| 342 |
+
ompt_target_map_flag_delete = 0x10,
|
| 343 |
+
ompt_target_map_flag_implicit = 0x20
|
| 344 |
+
} ompt_target_map_flag_t;
|
| 345 |
+
|
| 346 |
+
typedef enum ompt_dependence_type_t {
|
| 347 |
+
ompt_dependence_type_in = 1,
|
| 348 |
+
ompt_dependence_type_out = 2,
|
| 349 |
+
ompt_dependence_type_inout = 3,
|
| 350 |
+
ompt_dependence_type_mutexinoutset = 4,
|
| 351 |
+
ompt_dependence_type_source = 5,
|
| 352 |
+
ompt_dependence_type_sink = 6
|
| 353 |
+
} ompt_dependence_type_t;
|
| 354 |
+
|
| 355 |
+
typedef enum ompt_cancel_flag_t {
|
| 356 |
+
ompt_cancel_parallel = 0x01,
|
| 357 |
+
ompt_cancel_sections = 0x02,
|
| 358 |
+
ompt_cancel_loop = 0x04,
|
| 359 |
+
ompt_cancel_taskgroup = 0x08,
|
| 360 |
+
ompt_cancel_activated = 0x10,
|
| 361 |
+
ompt_cancel_detected = 0x20,
|
| 362 |
+
ompt_cancel_discarded_task = 0x40
|
| 363 |
+
} ompt_cancel_flag_t;
|
| 364 |
+
|
| 365 |
+
typedef uint64_t ompt_hwid_t;
|
| 366 |
+
|
| 367 |
+
typedef uint64_t ompt_wait_id_t;
|
| 368 |
+
|
| 369 |
+
typedef enum ompt_frame_flag_t {
|
| 370 |
+
ompt_frame_runtime = 0x00,
|
| 371 |
+
ompt_frame_application = 0x01,
|
| 372 |
+
ompt_frame_cfa = 0x10,
|
| 373 |
+
ompt_frame_framepointer = 0x20,
|
| 374 |
+
ompt_frame_stackaddress = 0x30
|
| 375 |
+
} ompt_frame_flag_t;
|
| 376 |
+
|
| 377 |
+
typedef enum ompt_state_t {
|
| 378 |
+
ompt_state_work_serial = 0x000,
|
| 379 |
+
ompt_state_work_parallel = 0x001,
|
| 380 |
+
ompt_state_work_reduction = 0x002,
|
| 381 |
+
|
| 382 |
+
ompt_state_wait_barrier = 0x010,
|
| 383 |
+
ompt_state_wait_barrier_implicit_parallel = 0x011,
|
| 384 |
+
ompt_state_wait_barrier_implicit_workshare = 0x012,
|
| 385 |
+
ompt_state_wait_barrier_implicit = 0x013,
|
| 386 |
+
ompt_state_wait_barrier_explicit = 0x014,
|
| 387 |
+
|
| 388 |
+
ompt_state_wait_taskwait = 0x020,
|
| 389 |
+
ompt_state_wait_taskgroup = 0x021,
|
| 390 |
+
|
| 391 |
+
ompt_state_wait_mutex = 0x040,
|
| 392 |
+
ompt_state_wait_lock = 0x041,
|
| 393 |
+
ompt_state_wait_critical = 0x042,
|
| 394 |
+
ompt_state_wait_atomic = 0x043,
|
| 395 |
+
ompt_state_wait_ordered = 0x044,
|
| 396 |
+
|
| 397 |
+
ompt_state_wait_target = 0x080,
|
| 398 |
+
ompt_state_wait_target_map = 0x081,
|
| 399 |
+
ompt_state_wait_target_update = 0x082,
|
| 400 |
+
|
| 401 |
+
ompt_state_idle = 0x100,
|
| 402 |
+
ompt_state_overhead = 0x101,
|
| 403 |
+
ompt_state_undefined = 0x102
|
| 404 |
+
} ompt_state_t;
|
| 405 |
+
|
| 406 |
+
typedef uint64_t (*ompt_get_unique_id_t) (void);
|
| 407 |
+
|
| 408 |
+
typedef uint64_t ompd_size_t;
|
| 409 |
+
|
| 410 |
+
typedef uint64_t ompd_wait_id_t;
|
| 411 |
+
|
| 412 |
+
typedef uint64_t ompd_addr_t;
|
| 413 |
+
typedef int64_t ompd_word_t;
|
| 414 |
+
typedef uint64_t ompd_seg_t;
|
| 415 |
+
|
| 416 |
+
typedef uint64_t ompd_device_t;
|
| 417 |
+
|
| 418 |
+
typedef uint64_t ompd_thread_id_t;
|
| 419 |
+
|
| 420 |
+
typedef enum ompd_scope_t {
|
| 421 |
+
ompd_scope_global = 1,
|
| 422 |
+
ompd_scope_address_space = 2,
|
| 423 |
+
ompd_scope_thread = 3,
|
| 424 |
+
ompd_scope_parallel = 4,
|
| 425 |
+
ompd_scope_implicit_task = 5,
|
| 426 |
+
ompd_scope_task = 6
|
| 427 |
+
} ompd_scope_t;
|
| 428 |
+
|
| 429 |
+
typedef uint64_t ompd_icv_id_t;
|
| 430 |
+
|
| 431 |
+
typedef enum ompd_rc_t {
|
| 432 |
+
ompd_rc_ok = 0,
|
| 433 |
+
ompd_rc_unavailable = 1,
|
| 434 |
+
ompd_rc_stale_handle = 2,
|
| 435 |
+
ompd_rc_bad_input = 3,
|
| 436 |
+
ompd_rc_error = 4,
|
| 437 |
+
ompd_rc_unsupported = 5,
|
| 438 |
+
ompd_rc_needs_state_tracking = 6,
|
| 439 |
+
ompd_rc_incompatible = 7,
|
| 440 |
+
ompd_rc_device_read_error = 8,
|
| 441 |
+
ompd_rc_device_write_error = 9,
|
| 442 |
+
ompd_rc_nomem = 10,
|
| 443 |
+
} ompd_rc_t;
|
| 444 |
+
|
| 445 |
+
typedef void (*ompt_interface_fn_t) (void);
|
| 446 |
+
|
| 447 |
+
typedef ompt_interface_fn_t (*ompt_function_lookup_t) (
|
| 448 |
+
const char *interface_function_name
|
| 449 |
+
);
|
| 450 |
+
|
| 451 |
+
typedef union ompt_data_t {
|
| 452 |
+
uint64_t value;
|
| 453 |
+
void *ptr;
|
| 454 |
+
} ompt_data_t;
|
| 455 |
+
|
| 456 |
+
typedef struct ompt_frame_t {
|
| 457 |
+
ompt_data_t exit_frame;
|
| 458 |
+
ompt_data_t enter_frame;
|
| 459 |
+
int exit_frame_flags;
|
| 460 |
+
int enter_frame_flags;
|
| 461 |
+
} ompt_frame_t;
|
| 462 |
+
|
| 463 |
+
typedef void (*ompt_callback_t) (void);
|
| 464 |
+
|
| 465 |
+
typedef void ompt_device_t;
|
| 466 |
+
|
| 467 |
+
typedef void ompt_buffer_t;
|
| 468 |
+
|
| 469 |
+
typedef void (*ompt_callback_buffer_request_t) (
|
| 470 |
+
int device_num,
|
| 471 |
+
ompt_buffer_t **buffer,
|
| 472 |
+
size_t *bytes
|
| 473 |
+
);
|
| 474 |
+
|
| 475 |
+
typedef void (*ompt_callback_buffer_complete_t) (
|
| 476 |
+
int device_num,
|
| 477 |
+
ompt_buffer_t *buffer,
|
| 478 |
+
size_t bytes,
|
| 479 |
+
ompt_buffer_cursor_t begin,
|
| 480 |
+
int buffer_owned
|
| 481 |
+
);
|
| 482 |
+
|
| 483 |
+
typedef void (*ompt_finalize_t) (
|
| 484 |
+
ompt_data_t *tool_data
|
| 485 |
+
);
|
| 486 |
+
|
| 487 |
+
typedef int (*ompt_initialize_t) (
|
| 488 |
+
ompt_function_lookup_t lookup,
|
| 489 |
+
int initial_device_num,
|
| 490 |
+
ompt_data_t *tool_data
|
| 491 |
+
);
|
| 492 |
+
|
| 493 |
+
typedef struct ompt_start_tool_result_t {
|
| 494 |
+
ompt_initialize_t initialize;
|
| 495 |
+
ompt_finalize_t finalize;
|
| 496 |
+
ompt_data_t tool_data;
|
| 497 |
+
} ompt_start_tool_result_t;
|
| 498 |
+
|
| 499 |
+
typedef struct ompt_record_abstract_t {
|
| 500 |
+
ompt_record_native_t rclass;
|
| 501 |
+
const char *type;
|
| 502 |
+
ompt_device_time_t start_time;
|
| 503 |
+
ompt_device_time_t end_time;
|
| 504 |
+
ompt_hwid_t hwid;
|
| 505 |
+
} ompt_record_abstract_t;
|
| 506 |
+
|
| 507 |
+
typedef struct ompt_dependence_t {
|
| 508 |
+
ompt_data_t variable;
|
| 509 |
+
ompt_dependence_type_t dependence_type;
|
| 510 |
+
} ompt_dependence_t;
|
| 511 |
+
|
| 512 |
+
typedef int (*ompt_enumerate_states_t) (
|
| 513 |
+
int current_state,
|
| 514 |
+
int *next_state,
|
| 515 |
+
const char **next_state_name
|
| 516 |
+
);
|
| 517 |
+
|
| 518 |
+
typedef int (*ompt_enumerate_mutex_impls_t) (
|
| 519 |
+
int current_impl,
|
| 520 |
+
int *next_impl,
|
| 521 |
+
const char **next_impl_name
|
| 522 |
+
);
|
| 523 |
+
|
| 524 |
+
typedef ompt_set_result_t (*ompt_set_callback_t) (
|
| 525 |
+
ompt_callbacks_t event,
|
| 526 |
+
ompt_callback_t callback
|
| 527 |
+
);
|
| 528 |
+
|
| 529 |
+
typedef int (*ompt_get_callback_t) (
|
| 530 |
+
ompt_callbacks_t event,
|
| 531 |
+
ompt_callback_t *callback
|
| 532 |
+
);
|
| 533 |
+
|
| 534 |
+
typedef ompt_data_t *(*ompt_get_thread_data_t) (void);
|
| 535 |
+
|
| 536 |
+
typedef int (*ompt_get_num_procs_t) (void);
|
| 537 |
+
|
| 538 |
+
typedef int (*ompt_get_num_places_t) (void);
|
| 539 |
+
|
| 540 |
+
typedef int (*ompt_get_place_proc_ids_t) (
|
| 541 |
+
int place_num,
|
| 542 |
+
int ids_size,
|
| 543 |
+
int *ids
|
| 544 |
+
);
|
| 545 |
+
|
| 546 |
+
typedef int (*ompt_get_place_num_t) (void);
|
| 547 |
+
|
| 548 |
+
typedef int (*ompt_get_partition_place_nums_t) (
|
| 549 |
+
int place_nums_size,
|
| 550 |
+
int *place_nums
|
| 551 |
+
);
|
| 552 |
+
|
| 553 |
+
typedef int (*ompt_get_proc_id_t) (void);
|
| 554 |
+
|
| 555 |
+
typedef int (*ompt_get_state_t) (
|
| 556 |
+
ompt_wait_id_t *wait_id
|
| 557 |
+
);
|
| 558 |
+
|
| 559 |
+
typedef int (*ompt_get_parallel_info_t) (
|
| 560 |
+
int ancestor_level,
|
| 561 |
+
ompt_data_t **parallel_data,
|
| 562 |
+
int *team_size
|
| 563 |
+
);
|
| 564 |
+
|
| 565 |
+
typedef int (*ompt_get_task_info_t) (
|
| 566 |
+
int ancestor_level,
|
| 567 |
+
int *flags,
|
| 568 |
+
ompt_data_t **task_data,
|
| 569 |
+
ompt_frame_t **task_frame,
|
| 570 |
+
ompt_data_t **parallel_data,
|
| 571 |
+
int *thread_num
|
| 572 |
+
);
|
| 573 |
+
|
| 574 |
+
typedef int (*ompt_get_task_memory_t)(
|
| 575 |
+
void **addr,
|
| 576 |
+
size_t *size,
|
| 577 |
+
int block
|
| 578 |
+
);
|
| 579 |
+
|
| 580 |
+
typedef int (*ompt_get_target_info_t) (
|
| 581 |
+
uint64_t *device_num,
|
| 582 |
+
ompt_id_t *target_id,
|
| 583 |
+
ompt_id_t *host_op_id
|
| 584 |
+
);
|
| 585 |
+
|
| 586 |
+
typedef int (*ompt_get_num_devices_t) (void);
|
| 587 |
+
|
| 588 |
+
typedef void (*ompt_finalize_tool_t) (void);
|
| 589 |
+
|
| 590 |
+
typedef int (*ompt_get_device_num_procs_t) (
|
| 591 |
+
ompt_device_t *device
|
| 592 |
+
);
|
| 593 |
+
|
| 594 |
+
typedef ompt_device_time_t (*ompt_get_device_time_t) (
|
| 595 |
+
ompt_device_t *device
|
| 596 |
+
);
|
| 597 |
+
|
| 598 |
+
typedef double (*ompt_translate_time_t) (
|
| 599 |
+
ompt_device_t *device,
|
| 600 |
+
ompt_device_time_t time
|
| 601 |
+
);
|
| 602 |
+
|
| 603 |
+
typedef ompt_set_result_t (*ompt_set_trace_ompt_t) (
|
| 604 |
+
ompt_device_t *device,
|
| 605 |
+
unsigned int enable,
|
| 606 |
+
unsigned int etype
|
| 607 |
+
);
|
| 608 |
+
|
| 609 |
+
typedef ompt_set_result_t (*ompt_set_trace_native_t) (
|
| 610 |
+
ompt_device_t *device,
|
| 611 |
+
int enable,
|
| 612 |
+
int flags
|
| 613 |
+
);
|
| 614 |
+
|
| 615 |
+
typedef int (*ompt_start_trace_t) (
|
| 616 |
+
ompt_device_t *device,
|
| 617 |
+
ompt_callback_buffer_request_t request,
|
| 618 |
+
ompt_callback_buffer_complete_t complete
|
| 619 |
+
);
|
| 620 |
+
|
| 621 |
+
typedef int (*ompt_pause_trace_t) (
|
| 622 |
+
ompt_device_t *device,
|
| 623 |
+
int begin_pause
|
| 624 |
+
);
|
| 625 |
+
|
| 626 |
+
typedef int (*ompt_flush_trace_t) (
|
| 627 |
+
ompt_device_t *device
|
| 628 |
+
);
|
| 629 |
+
|
| 630 |
+
typedef int (*ompt_stop_trace_t) (
|
| 631 |
+
ompt_device_t *device
|
| 632 |
+
);
|
| 633 |
+
|
| 634 |
+
typedef int (*ompt_advance_buffer_cursor_t) (
|
| 635 |
+
ompt_device_t *device,
|
| 636 |
+
ompt_buffer_t *buffer,
|
| 637 |
+
size_t size,
|
| 638 |
+
ompt_buffer_cursor_t current,
|
| 639 |
+
ompt_buffer_cursor_t *next
|
| 640 |
+
);
|
| 641 |
+
|
| 642 |
+
typedef ompt_record_t (*ompt_get_record_type_t) (
|
| 643 |
+
ompt_buffer_t *buffer,
|
| 644 |
+
ompt_buffer_cursor_t current
|
| 645 |
+
);
|
| 646 |
+
|
| 647 |
+
typedef void *(*ompt_get_record_native_t) (
|
| 648 |
+
ompt_buffer_t *buffer,
|
| 649 |
+
ompt_buffer_cursor_t current,
|
| 650 |
+
ompt_id_t *host_op_id
|
| 651 |
+
);
|
| 652 |
+
|
| 653 |
+
typedef ompt_record_abstract_t *
|
| 654 |
+
(*ompt_get_record_abstract_t) (
|
| 655 |
+
void *native_record
|
| 656 |
+
);
|
| 657 |
+
|
| 658 |
+
typedef void (*ompt_callback_thread_begin_t) (
|
| 659 |
+
ompt_thread_t thread_type,
|
| 660 |
+
ompt_data_t *thread_data
|
| 661 |
+
);
|
| 662 |
+
|
| 663 |
+
typedef struct ompt_record_thread_begin_t {
|
| 664 |
+
ompt_thread_t thread_type;
|
| 665 |
+
} ompt_record_thread_begin_t;
|
| 666 |
+
|
| 667 |
+
typedef void (*ompt_callback_thread_end_t) (
|
| 668 |
+
ompt_data_t *thread_data
|
| 669 |
+
);
|
| 670 |
+
|
| 671 |
+
typedef void (*ompt_callback_parallel_begin_t) (
|
| 672 |
+
ompt_data_t *encountering_task_data,
|
| 673 |
+
const ompt_frame_t *encountering_task_frame,
|
| 674 |
+
ompt_data_t *parallel_data,
|
| 675 |
+
unsigned int requested_parallelism,
|
| 676 |
+
int flags,
|
| 677 |
+
const void *codeptr_ra
|
| 678 |
+
);
|
| 679 |
+
|
| 680 |
+
typedef struct ompt_record_parallel_begin_t {
|
| 681 |
+
ompt_id_t encountering_task_id;
|
| 682 |
+
ompt_id_t parallel_id;
|
| 683 |
+
unsigned int requested_parallelism;
|
| 684 |
+
int flags;
|
| 685 |
+
const void *codeptr_ra;
|
| 686 |
+
} ompt_record_parallel_begin_t;
|
| 687 |
+
|
| 688 |
+
typedef void (*ompt_callback_parallel_end_t) (
|
| 689 |
+
ompt_data_t *parallel_data,
|
| 690 |
+
ompt_data_t *encountering_task_data,
|
| 691 |
+
int flags,
|
| 692 |
+
const void *codeptr_ra
|
| 693 |
+
);
|
| 694 |
+
|
| 695 |
+
typedef struct ompt_record_parallel_end_t {
|
| 696 |
+
ompt_id_t parallel_id;
|
| 697 |
+
ompt_id_t encountering_task_id;
|
| 698 |
+
int flags;
|
| 699 |
+
const void *codeptr_ra;
|
| 700 |
+
} ompt_record_parallel_end_t;
|
| 701 |
+
|
| 702 |
+
typedef void (*ompt_callback_work_t) (
|
| 703 |
+
ompt_work_t wstype,
|
| 704 |
+
ompt_scope_endpoint_t endpoint,
|
| 705 |
+
ompt_data_t *parallel_data,
|
| 706 |
+
ompt_data_t *task_data,
|
| 707 |
+
uint64_t count,
|
| 708 |
+
const void *codeptr_ra
|
| 709 |
+
);
|
| 710 |
+
|
| 711 |
+
typedef struct ompt_record_work_t {
|
| 712 |
+
ompt_work_t wstype;
|
| 713 |
+
ompt_scope_endpoint_t endpoint;
|
| 714 |
+
ompt_id_t parallel_id;
|
| 715 |
+
ompt_id_t task_id;
|
| 716 |
+
uint64_t count;
|
| 717 |
+
const void *codeptr_ra;
|
| 718 |
+
} ompt_record_work_t;
|
| 719 |
+
|
| 720 |
+
typedef void (*ompt_callback_dispatch_t) (
|
| 721 |
+
ompt_data_t *parallel_data,
|
| 722 |
+
ompt_data_t *task_data,
|
| 723 |
+
ompt_dispatch_t kind,
|
| 724 |
+
ompt_data_t instance
|
| 725 |
+
);
|
| 726 |
+
|
| 727 |
+
typedef struct ompt_record_dispatch_t {
|
| 728 |
+
ompt_id_t parallel_id;
|
| 729 |
+
ompt_id_t task_id;
|
| 730 |
+
ompt_dispatch_t kind;
|
| 731 |
+
ompt_data_t instance;
|
| 732 |
+
} ompt_record_dispatch_t;
|
| 733 |
+
|
| 734 |
+
typedef void (*ompt_callback_task_create_t) (
|
| 735 |
+
ompt_data_t *encountering_task_data,
|
| 736 |
+
const ompt_frame_t *encountering_task_frame,
|
| 737 |
+
ompt_data_t *new_task_data,
|
| 738 |
+
int flags,
|
| 739 |
+
int has_dependences,
|
| 740 |
+
const void *codeptr_ra
|
| 741 |
+
);
|
| 742 |
+
|
| 743 |
+
typedef struct ompt_record_task_create_t {
|
| 744 |
+
ompt_id_t encountering_task_id;
|
| 745 |
+
ompt_id_t new_task_id;
|
| 746 |
+
int flags;
|
| 747 |
+
int has_dependences;
|
| 748 |
+
const void *codeptr_ra;
|
| 749 |
+
} ompt_record_task_create_t;
|
| 750 |
+
|
| 751 |
+
typedef void (*ompt_callback_dependences_t) (
|
| 752 |
+
ompt_data_t *task_data,
|
| 753 |
+
const ompt_dependence_t *deps,
|
| 754 |
+
int ndeps
|
| 755 |
+
);
|
| 756 |
+
|
| 757 |
+
typedef struct ompt_record_dependences_t {
|
| 758 |
+
ompt_id_t task_id;
|
| 759 |
+
ompt_dependence_t dep;
|
| 760 |
+
int ndeps;
|
| 761 |
+
} ompt_record_dependences_t;
|
| 762 |
+
|
| 763 |
+
typedef void (*ompt_callback_task_dependence_t) (
|
| 764 |
+
ompt_data_t *src_task_data,
|
| 765 |
+
ompt_data_t *sink_task_data
|
| 766 |
+
);
|
| 767 |
+
|
| 768 |
+
typedef struct ompt_record_task_dependence_t {
|
| 769 |
+
ompt_id_t src_task_id;
|
| 770 |
+
ompt_id_t sink_task_id;
|
| 771 |
+
} ompt_record_task_dependence_t;
|
| 772 |
+
|
| 773 |
+
typedef void (*ompt_callback_task_schedule_t) (
|
| 774 |
+
ompt_data_t *prior_task_data,
|
| 775 |
+
ompt_task_status_t prior_task_status,
|
| 776 |
+
ompt_data_t *next_task_data
|
| 777 |
+
);
|
| 778 |
+
|
| 779 |
+
typedef struct ompt_record_task_schedule_t {
|
| 780 |
+
ompt_id_t prior_task_id;
|
| 781 |
+
ompt_task_status_t prior_task_status;
|
| 782 |
+
ompt_id_t next_task_id;
|
| 783 |
+
} ompt_record_task_schedule_t;
|
| 784 |
+
|
| 785 |
+
typedef void (*ompt_callback_implicit_task_t) (
|
| 786 |
+
ompt_scope_endpoint_t endpoint,
|
| 787 |
+
ompt_data_t *parallel_data,
|
| 788 |
+
ompt_data_t *task_data,
|
| 789 |
+
unsigned int actual_parallelism,
|
| 790 |
+
unsigned int index,
|
| 791 |
+
int flags
|
| 792 |
+
);
|
| 793 |
+
|
| 794 |
+
typedef struct ompt_record_implicit_task_t {
|
| 795 |
+
ompt_scope_endpoint_t endpoint;
|
| 796 |
+
ompt_id_t parallel_id;
|
| 797 |
+
ompt_id_t task_id;
|
| 798 |
+
unsigned int actual_parallelism;
|
| 799 |
+
unsigned int index;
|
| 800 |
+
int flags;
|
| 801 |
+
} ompt_record_implicit_task_t;
|
| 802 |
+
|
| 803 |
+
typedef void (*ompt_callback_master_t) (
|
| 804 |
+
ompt_scope_endpoint_t endpoint,
|
| 805 |
+
ompt_data_t *parallel_data,
|
| 806 |
+
ompt_data_t *task_data,
|
| 807 |
+
const void *codeptr_ra
|
| 808 |
+
);
|
| 809 |
+
|
| 810 |
+
typedef struct ompt_record_master_t {
|
| 811 |
+
ompt_scope_endpoint_t endpoint;
|
| 812 |
+
ompt_id_t parallel_id;
|
| 813 |
+
ompt_id_t task_id;
|
| 814 |
+
const void *codeptr_ra;
|
| 815 |
+
} ompt_record_master_t;
|
| 816 |
+
|
| 817 |
+
typedef void (*ompt_callback_sync_region_t) (
|
| 818 |
+
ompt_sync_region_t kind,
|
| 819 |
+
ompt_scope_endpoint_t endpoint,
|
| 820 |
+
ompt_data_t *parallel_data,
|
| 821 |
+
ompt_data_t *task_data,
|
| 822 |
+
const void *codeptr_ra
|
| 823 |
+
);
|
| 824 |
+
|
| 825 |
+
typedef struct ompt_record_sync_region_t {
|
| 826 |
+
ompt_sync_region_t kind;
|
| 827 |
+
ompt_scope_endpoint_t endpoint;
|
| 828 |
+
ompt_id_t parallel_id;
|
| 829 |
+
ompt_id_t task_id;
|
| 830 |
+
const void *codeptr_ra;
|
| 831 |
+
} ompt_record_sync_region_t;
|
| 832 |
+
|
| 833 |
+
typedef void (*ompt_callback_mutex_acquire_t) (
|
| 834 |
+
ompt_mutex_t kind,
|
| 835 |
+
unsigned int hint,
|
| 836 |
+
unsigned int impl,
|
| 837 |
+
ompt_wait_id_t wait_id,
|
| 838 |
+
const void *codeptr_ra
|
| 839 |
+
);
|
| 840 |
+
|
| 841 |
+
typedef struct ompt_record_mutex_acquire_t {
|
| 842 |
+
ompt_mutex_t kind;
|
| 843 |
+
unsigned int hint;
|
| 844 |
+
unsigned int impl;
|
| 845 |
+
ompt_wait_id_t wait_id;
|
| 846 |
+
const void *codeptr_ra;
|
| 847 |
+
} ompt_record_mutex_acquire_t;
|
| 848 |
+
|
| 849 |
+
typedef void (*ompt_callback_mutex_t) (
|
| 850 |
+
ompt_mutex_t kind,
|
| 851 |
+
ompt_wait_id_t wait_id,
|
| 852 |
+
const void *codeptr_ra
|
| 853 |
+
);
|
| 854 |
+
|
| 855 |
+
typedef struct ompt_record_mutex_t {
|
| 856 |
+
ompt_mutex_t kind;
|
| 857 |
+
ompt_wait_id_t wait_id;
|
| 858 |
+
const void *codeptr_ra;
|
| 859 |
+
} ompt_record_mutex_t;
|
| 860 |
+
|
| 861 |
+
typedef void (*ompt_callback_nest_lock_t) (
|
| 862 |
+
ompt_scope_endpoint_t endpoint,
|
| 863 |
+
ompt_wait_id_t wait_id,
|
| 864 |
+
const void *codeptr_ra
|
| 865 |
+
);
|
| 866 |
+
|
| 867 |
+
typedef struct ompt_record_nest_lock_t {
|
| 868 |
+
ompt_scope_endpoint_t endpoint;
|
| 869 |
+
ompt_wait_id_t wait_id;
|
| 870 |
+
const void *codeptr_ra;
|
| 871 |
+
} ompt_record_nest_lock_t;
|
| 872 |
+
|
| 873 |
+
typedef void (*ompt_callback_flush_t) (
|
| 874 |
+
ompt_data_t *thread_data,
|
| 875 |
+
const void *codeptr_ra
|
| 876 |
+
);
|
| 877 |
+
|
| 878 |
+
typedef struct ompt_record_flush_t {
|
| 879 |
+
const void *codeptr_ra;
|
| 880 |
+
} ompt_record_flush_t;
|
| 881 |
+
|
| 882 |
+
typedef void (*ompt_callback_cancel_t) (
|
| 883 |
+
ompt_data_t *task_data,
|
| 884 |
+
int flags,
|
| 885 |
+
const void *codeptr_ra
|
| 886 |
+
);
|
| 887 |
+
|
| 888 |
+
typedef struct ompt_record_cancel_t {
|
| 889 |
+
ompt_id_t task_id;
|
| 890 |
+
int flags;
|
| 891 |
+
const void *codeptr_ra;
|
| 892 |
+
} ompt_record_cancel_t;
|
| 893 |
+
|
| 894 |
+
typedef void (*ompt_callback_device_initialize_t) (
|
| 895 |
+
int device_num,
|
| 896 |
+
const char *type,
|
| 897 |
+
ompt_device_t *device,
|
| 898 |
+
ompt_function_lookup_t lookup,
|
| 899 |
+
const char *documentation
|
| 900 |
+
);
|
| 901 |
+
|
| 902 |
+
typedef void (*ompt_callback_device_finalize_t) (
|
| 903 |
+
int device_num
|
| 904 |
+
);
|
| 905 |
+
|
| 906 |
+
typedef void (*ompt_callback_device_load_t) (
|
| 907 |
+
int device_num,
|
| 908 |
+
const char *filename,
|
| 909 |
+
int64_t offset_in_file,
|
| 910 |
+
void *vma_in_file,
|
| 911 |
+
size_t bytes,
|
| 912 |
+
void *host_addr,
|
| 913 |
+
void *device_addr,
|
| 914 |
+
uint64_t module_id
|
| 915 |
+
);
|
| 916 |
+
|
| 917 |
+
typedef void (*ompt_callback_device_unload_t) (
|
| 918 |
+
int device_num,
|
| 919 |
+
uint64_t module_id
|
| 920 |
+
);
|
| 921 |
+
|
| 922 |
+
typedef void (*ompt_callback_target_data_op_t) (
|
| 923 |
+
ompt_id_t target_id,
|
| 924 |
+
ompt_id_t host_op_id,
|
| 925 |
+
ompt_target_data_op_t optype,
|
| 926 |
+
void *src_addr,
|
| 927 |
+
int src_device_num,
|
| 928 |
+
void *dest_addr,
|
| 929 |
+
int dest_device_num,
|
| 930 |
+
size_t bytes,
|
| 931 |
+
const void *codeptr_ra
|
| 932 |
+
);
|
| 933 |
+
|
| 934 |
+
typedef struct ompt_record_target_data_op_t {
|
| 935 |
+
ompt_id_t host_op_id;
|
| 936 |
+
ompt_target_data_op_t optype;
|
| 937 |
+
void *src_addr;
|
| 938 |
+
int src_device_num;
|
| 939 |
+
void *dest_addr;
|
| 940 |
+
int dest_device_num;
|
| 941 |
+
size_t bytes;
|
| 942 |
+
ompt_device_time_t end_time;
|
| 943 |
+
const void *codeptr_ra;
|
| 944 |
+
} ompt_record_target_data_op_t;
|
| 945 |
+
|
| 946 |
+
typedef void (*ompt_callback_target_t) (
|
| 947 |
+
ompt_target_t kind,
|
| 948 |
+
ompt_scope_endpoint_t endpoint,
|
| 949 |
+
int device_num,
|
| 950 |
+
ompt_data_t *task_data,
|
| 951 |
+
ompt_id_t target_id,
|
| 952 |
+
const void *codeptr_ra
|
| 953 |
+
);
|
| 954 |
+
|
| 955 |
+
typedef struct ompt_record_target_t {
|
| 956 |
+
ompt_target_t kind;
|
| 957 |
+
ompt_scope_endpoint_t endpoint;
|
| 958 |
+
int device_num;
|
| 959 |
+
ompt_id_t task_id;
|
| 960 |
+
ompt_id_t target_id;
|
| 961 |
+
const void *codeptr_ra;
|
| 962 |
+
} ompt_record_target_t;
|
| 963 |
+
|
| 964 |
+
typedef void (*ompt_callback_target_map_t) (
|
| 965 |
+
ompt_id_t target_id,
|
| 966 |
+
unsigned int nitems,
|
| 967 |
+
void **host_addr,
|
| 968 |
+
void **device_addr,
|
| 969 |
+
size_t *bytes,
|
| 970 |
+
unsigned int *mapping_flags,
|
| 971 |
+
const void *codeptr_ra
|
| 972 |
+
);
|
| 973 |
+
|
| 974 |
+
typedef struct ompt_record_target_map_t {
|
| 975 |
+
ompt_id_t target_id;
|
| 976 |
+
unsigned int nitems;
|
| 977 |
+
void **host_addr;
|
| 978 |
+
void **device_addr;
|
| 979 |
+
size_t *bytes;
|
| 980 |
+
unsigned int *mapping_flags;
|
| 981 |
+
const void *codeptr_ra;
|
| 982 |
+
} ompt_record_target_map_t;
|
| 983 |
+
|
| 984 |
+
typedef void (*ompt_callback_target_submit_t) (
|
| 985 |
+
ompt_id_t target_id,
|
| 986 |
+
ompt_id_t host_op_id,
|
| 987 |
+
unsigned int requested_num_teams
|
| 988 |
+
);
|
| 989 |
+
|
| 990 |
+
typedef struct ompt_record_target_kernel_t {
|
| 991 |
+
ompt_id_t host_op_id;
|
| 992 |
+
unsigned int requested_num_teams;
|
| 993 |
+
unsigned int granted_num_teams;
|
| 994 |
+
ompt_device_time_t end_time;
|
| 995 |
+
} ompt_record_target_kernel_t;
|
| 996 |
+
|
| 997 |
+
typedef int (*ompt_callback_control_tool_t) (
|
| 998 |
+
uint64_t command,
|
| 999 |
+
uint64_t modifier,
|
| 1000 |
+
void *arg,
|
| 1001 |
+
const void *codeptr_ra
|
| 1002 |
+
);
|
| 1003 |
+
|
| 1004 |
+
typedef struct ompt_record_control_tool_t {
|
| 1005 |
+
uint64_t command;
|
| 1006 |
+
uint64_t modifier;
|
| 1007 |
+
const void *codeptr_ra;
|
| 1008 |
+
} ompt_record_control_tool_t;
|
| 1009 |
+
|
| 1010 |
+
typedef struct ompd_address_t {
|
| 1011 |
+
ompd_seg_t segment;
|
| 1012 |
+
ompd_addr_t address;
|
| 1013 |
+
} ompd_address_t;
|
| 1014 |
+
|
| 1015 |
+
typedef struct ompd_frame_info_t {
|
| 1016 |
+
ompd_address_t frame_address;
|
| 1017 |
+
ompd_word_t frame_flag;
|
| 1018 |
+
} ompd_frame_info_t;
|
| 1019 |
+
|
| 1020 |
+
typedef struct _ompd_aspace_handle ompd_address_space_handle_t;
|
| 1021 |
+
typedef struct _ompd_thread_handle ompd_thread_handle_t;
|
| 1022 |
+
typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
|
| 1023 |
+
typedef struct _ompd_task_handle ompd_task_handle_t;
|
| 1024 |
+
|
| 1025 |
+
typedef struct _ompd_aspace_cont ompd_address_space_context_t;
|
| 1026 |
+
typedef struct _ompd_thread_cont ompd_thread_context_t;
|
| 1027 |
+
|
| 1028 |
+
typedef struct ompd_device_type_sizes_t {
|
| 1029 |
+
uint8_t sizeof_char;
|
| 1030 |
+
uint8_t sizeof_short;
|
| 1031 |
+
uint8_t sizeof_int;
|
| 1032 |
+
uint8_t sizeof_long;
|
| 1033 |
+
uint8_t sizeof_long_long;
|
| 1034 |
+
uint8_t sizeof_pointer;
|
| 1035 |
+
} ompd_device_type_sizes_t;
|
| 1036 |
+
|
| 1037 |
+
typedef struct ompt_record_ompt_t {
|
| 1038 |
+
ompt_callbacks_t type;
|
| 1039 |
+
ompt_device_time_t time;
|
| 1040 |
+
ompt_id_t thread_id;
|
| 1041 |
+
ompt_id_t target_id;
|
| 1042 |
+
union {
|
| 1043 |
+
ompt_record_thread_begin_t thread_begin;
|
| 1044 |
+
ompt_record_parallel_begin_t parallel_begin;
|
| 1045 |
+
ompt_record_parallel_end_t parallel_end;
|
| 1046 |
+
ompt_record_work_t work;
|
| 1047 |
+
ompt_record_dispatch_t dispatch;
|
| 1048 |
+
ompt_record_task_create_t task_create;
|
| 1049 |
+
ompt_record_dependences_t dependences;
|
| 1050 |
+
ompt_record_task_dependence_t task_dependence;
|
| 1051 |
+
ompt_record_task_schedule_t task_schedule;
|
| 1052 |
+
ompt_record_implicit_task_t implicit_task;
|
| 1053 |
+
ompt_record_master_t master;
|
| 1054 |
+
ompt_record_sync_region_t sync_region;
|
| 1055 |
+
ompt_record_mutex_acquire_t mutex_acquire;
|
| 1056 |
+
ompt_record_mutex_t mutex;
|
| 1057 |
+
ompt_record_nest_lock_t nest_lock;
|
| 1058 |
+
ompt_record_flush_t flush;
|
| 1059 |
+
ompt_record_cancel_t cancel;
|
| 1060 |
+
ompt_record_target_t target;
|
| 1061 |
+
ompt_record_target_data_op_t target_data_op;
|
| 1062 |
+
ompt_record_target_map_t target_map;
|
| 1063 |
+
ompt_record_target_kernel_t target_kernel;
|
| 1064 |
+
ompt_record_control_tool_t control_tool;
|
| 1065 |
+
} record;
|
| 1066 |
+
} ompt_record_ompt_t;
|
| 1067 |
+
|
| 1068 |
+
typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
|
| 1069 |
+
ompt_buffer_t *buffer,
|
| 1070 |
+
ompt_buffer_cursor_t current
|
| 1071 |
+
);
|
| 1072 |
+
|
| 1073 |
+
#define ompt_id_none 0
|
| 1074 |
+
#define ompt_data_none {0}
|
| 1075 |
+
#define ompt_time_none 0
|
| 1076 |
+
#define ompt_hwid_none 0
|
| 1077 |
+
#define ompt_addr_none ~0
|
| 1078 |
+
#define ompt_mutex_impl_none 0
|
| 1079 |
+
#define ompt_wait_id_none 0
|
| 1080 |
+
|
| 1081 |
+
#define ompd_segment_none 0
|
| 1082 |
+
|
| 1083 |
+
#endif /* __OMPT__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (226 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* Redistribution and use in source and binary forms, with or without
|
| 5 |
+
* modification, are permitted provided that the following conditions
|
| 6 |
+
* are met:
|
| 7 |
+
* * Redistributions of source code must retain the above copyright
|
| 8 |
+
* notice, this list of conditions and the following disclaimer.
|
| 9 |
+
* * Redistributions in binary form must reproduce the above copyright
|
| 10 |
+
* notice, this list of conditions and the following disclaimer in the
|
| 11 |
+
* documentation and/or other materials provided with the distribution.
|
| 12 |
+
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
| 13 |
+
* contributors may be used to endorse or promote products derived
|
| 14 |
+
* from this software without specific prior written permission.
|
| 15 |
+
*
|
| 16 |
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
| 17 |
+
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 18 |
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 19 |
+
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
| 20 |
+
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| 21 |
+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| 22 |
+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| 23 |
+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| 24 |
+
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 25 |
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 26 |
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 27 |
+
*/
|
| 28 |
+
|
| 29 |
+
#ifndef __cuda_stdint_h__
|
| 30 |
+
#define __cuda_stdint_h__
|
| 31 |
+
|
| 32 |
+
// Compiler-specific treatment for C99's stdint.h
|
| 33 |
+
//
|
| 34 |
+
// By default, this header will use the standard headers (so it
|
| 35 |
+
// is your responsibility to make sure they are available), except
|
| 36 |
+
// on MSVC before Visual Studio 2010, when they were not provided.
|
| 37 |
+
// To support old MSVC, a few of the commonly-used definitions are
|
| 38 |
+
// provided here. If more definitions are needed, add them here,
|
| 39 |
+
// or replace these definitions with a complete implementation,
|
| 40 |
+
// such as the ones available from Google, Boost, or MSVC10. You
|
| 41 |
+
// can prevent the definition of any of these types (in order to
|
| 42 |
+
// use your own) by #defining CU_STDINT_TYPES_ALREADY_DEFINED.
|
| 43 |
+
|
| 44 |
+
#if !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
|
| 45 |
+
|
| 46 |
+
// In VS including stdint.h forces the C++ runtime dep - provide an opt-out
|
| 47 |
+
// (CU_STDINT_VS_FORCE_NO_STDINT_H) for users that care (notably static
|
| 48 |
+
// cudart).
|
| 49 |
+
#if defined(_MSC_VER) && ((_MSC_VER < 1600) || defined(CU_STDINT_VS_FORCE_NO_STDINT_H))
|
| 50 |
+
|
| 51 |
+
// These definitions can be used with MSVC 8 and 9,
|
| 52 |
+
// which don't ship with stdint.h:
|
| 53 |
+
|
| 54 |
+
typedef unsigned char uint8_t;
|
| 55 |
+
|
| 56 |
+
typedef short int16_t;
|
| 57 |
+
typedef unsigned short uint16_t;
|
| 58 |
+
|
| 59 |
+
// To keep it consistent with all MSVC build. define those types
|
| 60 |
+
// in the exact same way they are defined with the MSVC headers
|
| 61 |
+
#if defined(_MSC_VER)
|
| 62 |
+
typedef signed char int8_t;
|
| 63 |
+
|
| 64 |
+
typedef int int32_t;
|
| 65 |
+
typedef unsigned int uint32_t;
|
| 66 |
+
|
| 67 |
+
typedef long long int64_t;
|
| 68 |
+
typedef unsigned long long uint64_t;
|
| 69 |
+
#else
|
| 70 |
+
typedef char int8_t;
|
| 71 |
+
|
| 72 |
+
typedef long int32_t;
|
| 73 |
+
typedef unsigned long uint32_t;
|
| 74 |
+
|
| 75 |
+
typedef __int64 int64_t;
|
| 76 |
+
typedef unsigned __int64 uint64_t;
|
| 77 |
+
#endif
|
| 78 |
+
|
| 79 |
+
#elif defined(__DJGPP__)
|
| 80 |
+
|
| 81 |
+
// These definitions can be used when compiling
|
| 82 |
+
// C code with DJGPP, which only provides stdint.h
|
| 83 |
+
// when compiling C++ code with TR1 enabled.
|
| 84 |
+
|
| 85 |
+
typedef char int8_t;
|
| 86 |
+
typedef unsigned char uint8_t;
|
| 87 |
+
|
| 88 |
+
typedef short int16_t;
|
| 89 |
+
typedef unsigned short uint16_t;
|
| 90 |
+
|
| 91 |
+
typedef long int32_t;
|
| 92 |
+
typedef unsigned long uint32_t;
|
| 93 |
+
|
| 94 |
+
typedef long long int64_t;
|
| 95 |
+
typedef unsigned long long uint64_t;
|
| 96 |
+
|
| 97 |
+
#else
|
| 98 |
+
|
| 99 |
+
// Use standard headers, as specified by C99 and C++ TR1.
|
| 100 |
+
// Known to be provided by:
|
| 101 |
+
// - gcc/glibc, supported by all versions of glibc
|
| 102 |
+
// - djgpp, supported since 2001
|
| 103 |
+
// - MSVC, supported by Visual Studio 2010 and later
|
| 104 |
+
|
| 105 |
+
#include <stdint.h>
|
| 106 |
+
|
| 107 |
+
#endif
|
| 108 |
+
|
| 109 |
+
#endif // !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
#endif // file guard
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h
ADDED
|
@@ -0,0 +1,690 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
// *************************************************************************
|
| 3 |
+
// Definitions of indices for API functions, unique across entire API
|
| 4 |
+
// *************************************************************************
|
| 5 |
+
|
| 6 |
+
// This file is generated. Any changes you make will be lost during the next clean build.
|
| 7 |
+
// CUDA public interface, for type definitions and cu* function prototypes
|
| 8 |
+
|
| 9 |
+
typedef enum CUpti_driver_api_trace_cbid_enum {
|
| 10 |
+
CUPTI_DRIVER_TRACE_CBID_INVALID = 0,
|
| 11 |
+
CUPTI_DRIVER_TRACE_CBID_cuInit = 1,
|
| 12 |
+
CUPTI_DRIVER_TRACE_CBID_cuDriverGetVersion = 2,
|
| 13 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGet = 3,
|
| 14 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetCount = 4,
|
| 15 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetName = 5,
|
| 16 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceComputeCapability = 6,
|
| 17 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem = 7,
|
| 18 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetProperties = 8,
|
| 19 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetAttribute = 9,
|
| 20 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxCreate = 10,
|
| 21 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy = 11,
|
| 22 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxAttach = 12,
|
| 23 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxDetach = 13,
|
| 24 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent = 14,
|
| 25 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent = 15,
|
| 26 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevice = 16,
|
| 27 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxSynchronize = 17,
|
| 28 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleLoad = 18,
|
| 29 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleLoadData = 19,
|
| 30 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleLoadDataEx = 20,
|
| 31 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleLoadFatBinary = 21,
|
| 32 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleUnload = 22,
|
| 33 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunction = 23,
|
| 34 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal = 24,
|
| 35 |
+
CUPTI_DRIVER_TRACE_CBID_cu64ModuleGetGlobal = 25,
|
| 36 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleGetTexRef = 26,
|
| 37 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo = 27,
|
| 38 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemGetInfo = 28,
|
| 39 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAlloc = 29,
|
| 40 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemAlloc = 30,
|
| 41 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch = 31,
|
| 42 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemAllocPitch = 32,
|
| 43 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemFree = 33,
|
| 44 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemFree = 34,
|
| 45 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange = 35,
|
| 46 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemGetAddressRange = 36,
|
| 47 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost = 37,
|
| 48 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemFreeHost = 38,
|
| 49 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc = 39,
|
| 50 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer = 40,
|
| 51 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemHostGetDevicePointer = 41,
|
| 52 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostGetFlags = 42,
|
| 53 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD = 43,
|
| 54 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoD = 44,
|
| 55 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH = 45,
|
| 56 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoH = 46,
|
| 57 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD = 47,
|
| 58 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoD = 48,
|
| 59 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA = 49,
|
| 60 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoA = 50,
|
| 61 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD = 51,
|
| 62 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyAtoD = 52,
|
| 63 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA = 53,
|
| 64 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH = 54,
|
| 65 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA = 55,
|
| 66 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D = 56,
|
| 67 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned = 57,
|
| 68 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D = 58,
|
| 69 |
+
CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3D = 59,
|
| 70 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync = 60,
|
| 71 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoDAsync = 61,
|
| 72 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync = 62,
|
| 73 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoHAsync = 63,
|
| 74 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync = 64,
|
| 75 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoDAsync = 65,
|
| 76 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync = 66,
|
| 77 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync = 67,
|
| 78 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync = 68,
|
| 79 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync = 69,
|
| 80 |
+
CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3DAsync = 70,
|
| 81 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD8 = 71,
|
| 82 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8 = 72,
|
| 83 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD16 = 73,
|
| 84 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16 = 74,
|
| 85 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD32 = 75,
|
| 86 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32 = 76,
|
| 87 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8 = 77,
|
| 88 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8 = 78,
|
| 89 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16 = 79,
|
| 90 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16 = 80,
|
| 91 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32 = 81,
|
| 92 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32 = 82,
|
| 93 |
+
CUPTI_DRIVER_TRACE_CBID_cuFuncSetBlockShape = 83,
|
| 94 |
+
CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedSize = 84,
|
| 95 |
+
CUPTI_DRIVER_TRACE_CBID_cuFuncGetAttribute = 85,
|
| 96 |
+
CUPTI_DRIVER_TRACE_CBID_cuFuncSetCacheConfig = 86,
|
| 97 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayCreate = 87,
|
| 98 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor = 88,
|
| 99 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayDestroy = 89,
|
| 100 |
+
CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate = 90,
|
| 101 |
+
CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor = 91,
|
| 102 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefCreate = 92,
|
| 103 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefDestroy = 93,
|
| 104 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetArray = 94,
|
| 105 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress = 95,
|
| 106 |
+
CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress = 96,
|
| 107 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D = 97,
|
| 108 |
+
CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress2D = 98,
|
| 109 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFormat = 99,
|
| 110 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddressMode = 100,
|
| 111 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFilterMode = 101,
|
| 112 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFlags = 102,
|
| 113 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress = 103,
|
| 114 |
+
CUPTI_DRIVER_TRACE_CBID_cu64TexRefGetAddress = 104,
|
| 115 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetArray = 105,
|
| 116 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddressMode = 106,
|
| 117 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFilterMode = 107,
|
| 118 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFormat = 108,
|
| 119 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFlags = 109,
|
| 120 |
+
CUPTI_DRIVER_TRACE_CBID_cuParamSetSize = 110,
|
| 121 |
+
CUPTI_DRIVER_TRACE_CBID_cuParamSeti = 111,
|
| 122 |
+
CUPTI_DRIVER_TRACE_CBID_cuParamSetf = 112,
|
| 123 |
+
CUPTI_DRIVER_TRACE_CBID_cuParamSetv = 113,
|
| 124 |
+
CUPTI_DRIVER_TRACE_CBID_cuParamSetTexRef = 114,
|
| 125 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunch = 115,
|
| 126 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid = 116,
|
| 127 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync = 117,
|
| 128 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventCreate = 118,
|
| 129 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventRecord = 119,
|
| 130 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventQuery = 120,
|
| 131 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize = 121,
|
| 132 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventDestroy = 122,
|
| 133 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime = 123,
|
| 134 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamCreate = 124,
|
| 135 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamQuery = 125,
|
| 136 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize = 126,
|
| 137 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy = 127,
|
| 138 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnregisterResource = 128,
|
| 139 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsSubResourceGetMappedArray = 129,
|
| 140 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer = 130,
|
| 141 |
+
CUPTI_DRIVER_TRACE_CBID_cu64GraphicsResourceGetMappedPointer = 131,
|
| 142 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags = 132,
|
| 143 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources = 133,
|
| 144 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources = 134,
|
| 145 |
+
CUPTI_DRIVER_TRACE_CBID_cuGetExportTable = 135,
|
| 146 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxSetLimit = 136,
|
| 147 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetLimit = 137,
|
| 148 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevice = 138,
|
| 149 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate = 139,
|
| 150 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D10RegisterResource = 140,
|
| 151 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10RegisterResource = 141,
|
| 152 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10UnregisterResource = 142,
|
| 153 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10MapResources = 143,
|
| 154 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10UnmapResources = 144,
|
| 155 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceSetMapFlags = 145,
|
| 156 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedArray = 146,
|
| 157 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer = 147,
|
| 158 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize = 148,
|
| 159 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch = 149,
|
| 160 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions = 150,
|
| 161 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevice = 151,
|
| 162 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate = 152,
|
| 163 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D11RegisterResource = 153,
|
| 164 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevice = 154,
|
| 165 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate = 155,
|
| 166 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D9RegisterResource = 156,
|
| 167 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDirect3DDevice = 157,
|
| 168 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterResource = 158,
|
| 169 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterResource = 159,
|
| 170 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9MapResources = 160,
|
| 171 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapResources = 161,
|
| 172 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceSetMapFlags = 162,
|
| 173 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions = 163,
|
| 174 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedArray = 164,
|
| 175 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer = 165,
|
| 176 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize = 166,
|
| 177 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch = 167,
|
| 178 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9Begin = 168,
|
| 179 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9End = 169,
|
| 180 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterVertexBuffer = 170,
|
| 181 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer = 171,
|
| 182 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapVertexBuffer = 172,
|
| 183 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterVertexBuffer = 173,
|
| 184 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate = 174,
|
| 185 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterBuffer = 175,
|
| 186 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterImage = 176,
|
| 187 |
+
CUPTI_DRIVER_TRACE_CBID_cuWGLGetDevice = 177,
|
| 188 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLInit = 178,
|
| 189 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLRegisterBufferObject = 179,
|
| 190 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject = 180,
|
| 191 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObject = 181,
|
| 192 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLUnregisterBufferObject = 182,
|
| 193 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLSetBufferObjectMapFlags = 183,
|
| 194 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync = 184,
|
| 195 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObjectAsync = 185,
|
| 196 |
+
CUPTI_DRIVER_TRACE_CBID_cuVDPAUGetDevice = 186,
|
| 197 |
+
CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate = 187,
|
| 198 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterVideoSurface = 188,
|
| 199 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterOutputSurface = 189,
|
| 200 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleGetSurfRef = 190,
|
| 201 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfRefCreate = 191,
|
| 202 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfRefDestroy = 192,
|
| 203 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetFormat = 193,
|
| 204 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetArray = 194,
|
| 205 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetFormat = 195,
|
| 206 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetArray = 196,
|
| 207 |
+
CUPTI_DRIVER_TRACE_CBID_cu64DeviceTotalMem = 197,
|
| 208 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPointer = 198,
|
| 209 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedSize = 199,
|
| 210 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPitch = 200,
|
| 211 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetSurfaceDimensions = 201,
|
| 212 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetSurfaceDimensions = 202,
|
| 213 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPointer = 203,
|
| 214 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedSize = 204,
|
| 215 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPitch = 205,
|
| 216 |
+
CUPTI_DRIVER_TRACE_CBID_cu64D3D9MapVertexBuffer = 206,
|
| 217 |
+
CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObject = 207,
|
| 218 |
+
CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObjectAsync = 208,
|
| 219 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevices = 209,
|
| 220 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreateOnDevice = 210,
|
| 221 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevices = 211,
|
| 222 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreateOnDevice = 212,
|
| 223 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevices = 213,
|
| 224 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreateOnDevice = 214,
|
| 225 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemHostAlloc = 215,
|
| 226 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async = 216,
|
| 227 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8Async = 217,
|
| 228 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async = 218,
|
| 229 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16Async = 219,
|
| 230 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async = 220,
|
| 231 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32Async = 221,
|
| 232 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async = 222,
|
| 233 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8Async = 223,
|
| 234 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async = 224,
|
| 235 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16Async = 225,
|
| 236 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async = 226,
|
| 237 |
+
CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32Async = 227,
|
| 238 |
+
CUPTI_DRIVER_TRACE_CBID_cu64ArrayCreate = 228,
|
| 239 |
+
CUPTI_DRIVER_TRACE_CBID_cu64ArrayGetDescriptor = 229,
|
| 240 |
+
CUPTI_DRIVER_TRACE_CBID_cu64Array3DCreate = 230,
|
| 241 |
+
CUPTI_DRIVER_TRACE_CBID_cu64Array3DGetDescriptor = 231,
|
| 242 |
+
CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2D = 232,
|
| 243 |
+
CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DUnaligned = 233,
|
| 244 |
+
CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DAsync = 234,
|
| 245 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v2 = 235,
|
| 246 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate_v2 = 236,
|
| 247 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate_v2 = 237,
|
| 248 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate_v2 = 238,
|
| 249 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate_v2 = 239,
|
| 250 |
+
CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate_v2 = 240,
|
| 251 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal_v2 = 241,
|
| 252 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo_v2 = 242,
|
| 253 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2 = 243,
|
| 254 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2 = 244,
|
| 255 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2 = 245,
|
| 256 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange_v2 = 246,
|
| 257 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer_v2 = 247,
|
| 258 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy_v2 = 248,
|
| 259 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2 = 249,
|
| 260 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2 = 250,
|
| 261 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2 = 251,
|
| 262 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2 = 252,
|
| 263 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2 = 253,
|
| 264 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2 = 254,
|
| 265 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress_v2 = 255,
|
| 266 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v2 = 256,
|
| 267 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress_v2 = 257,
|
| 268 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer_v2 = 258,
|
| 269 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem_v2 = 259,
|
| 270 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer_v2 = 260,
|
| 271 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize_v2 = 261,
|
| 272 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch_v2 = 262,
|
| 273 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions_v2 = 263,
|
| 274 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions_v2 = 264,
|
| 275 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer_v2 = 265,
|
| 276 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize_v2 = 266,
|
| 277 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch_v2 = 267,
|
| 278 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer_v2 = 268,
|
| 279 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2 = 269,
|
| 280 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2 = 270,
|
| 281 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc_v2 = 271,
|
| 282 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayCreate_v2 = 272,
|
| 283 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor_v2 = 273,
|
| 284 |
+
CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate_v2 = 274,
|
| 285 |
+
CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor_v2 = 275,
|
| 286 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2 = 276,
|
| 287 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2 = 277,
|
| 288 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2 = 278,
|
| 289 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2 = 279,
|
| 290 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2 = 280,
|
| 291 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2 = 281,
|
| 292 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2 = 282,
|
| 293 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2 = 283,
|
| 294 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2 = 284,
|
| 295 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2 = 285,
|
| 296 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2 = 286,
|
| 297 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2 = 287,
|
| 298 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2 = 288,
|
| 299 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2 = 289,
|
| 300 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2 = 290,
|
| 301 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2 = 291,
|
| 302 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2 = 292,
|
| 303 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2 = 293,
|
| 304 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost_v2 = 294,
|
| 305 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent = 295,
|
| 306 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetApiVersion = 296,
|
| 307 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDirect3DDevice = 297,
|
| 308 |
+
CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDirect3DDevice = 298,
|
| 309 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetCacheConfig = 299,
|
| 310 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxSetCacheConfig = 300,
|
| 311 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister = 301,
|
| 312 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostUnregister = 302,
|
| 313 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent = 303,
|
| 314 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetCurrent = 304,
|
| 315 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy = 305,
|
| 316 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync = 306,
|
| 317 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel = 307,
|
| 318 |
+
CUPTI_DRIVER_TRACE_CBID_cuProfilerStart = 308,
|
| 319 |
+
CUPTI_DRIVER_TRACE_CBID_cuProfilerStop = 309,
|
| 320 |
+
CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttribute = 310,
|
| 321 |
+
CUPTI_DRIVER_TRACE_CBID_cuProfilerInitialize = 311,
|
| 322 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceCanAccessPeer = 312,
|
| 323 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxEnablePeerAccess = 313,
|
| 324 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxDisablePeerAccess = 314,
|
| 325 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPeerRegister = 315,
|
| 326 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPeerUnregister = 316,
|
| 327 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPeerGetDevicePointer = 317,
|
| 328 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer = 318,
|
| 329 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync = 319,
|
| 330 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer = 320,
|
| 331 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync = 321,
|
| 332 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 = 322,
|
| 333 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent_v2 = 323,
|
| 334 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent_v2 = 324,
|
| 335 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2 = 325,
|
| 336 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy_v2 = 326,
|
| 337 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v3 = 327,
|
| 338 |
+
CUPTI_DRIVER_TRACE_CBID_cuIpcGetMemHandle = 328,
|
| 339 |
+
CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle = 329,
|
| 340 |
+
CUPTI_DRIVER_TRACE_CBID_cuIpcCloseMemHandle = 330,
|
| 341 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetByPCIBusId = 331,
|
| 342 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetPCIBusId = 332,
|
| 343 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices = 333,
|
| 344 |
+
CUPTI_DRIVER_TRACE_CBID_cuIpcGetEventHandle = 334,
|
| 345 |
+
CUPTI_DRIVER_TRACE_CBID_cuIpcOpenEventHandle = 335,
|
| 346 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxSetSharedMemConfig = 336,
|
| 347 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetSharedMemConfig = 337,
|
| 348 |
+
CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedMemConfig = 338,
|
| 349 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexObjectCreate = 339,
|
| 350 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexObjectDestroy = 340,
|
| 351 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceDesc = 341,
|
| 352 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetTextureDesc = 342,
|
| 353 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfObjectCreate = 343,
|
| 354 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfObjectDestroy = 344,
|
| 355 |
+
CUPTI_DRIVER_TRACE_CBID_cuSurfObjectGetResourceDesc = 345,
|
| 356 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback = 346,
|
| 357 |
+
CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayCreate = 347,
|
| 358 |
+
CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetLevel = 348,
|
| 359 |
+
CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayDestroy = 349,
|
| 360 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmappedArray = 350,
|
| 361 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapFilterMode = 351,
|
| 362 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelBias = 352,
|
| 363 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelClamp = 353,
|
| 364 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMaxAnisotropy = 354,
|
| 365 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmappedArray = 355,
|
| 366 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapFilterMode = 356,
|
| 367 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelBias = 357,
|
| 368 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelClamp = 358,
|
| 369 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMaxAnisotropy = 359,
|
| 370 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedMipmappedArray = 360,
|
| 371 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceViewDesc = 361,
|
| 372 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkCreate = 362,
|
| 373 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkAddData = 363,
|
| 374 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile = 364,
|
| 375 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkComplete = 365,
|
| 376 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkDestroy = 366,
|
| 377 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamCreateWithPriority = 367,
|
| 378 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority = 368,
|
| 379 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags = 369,
|
| 380 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetStreamPriorityRange = 370,
|
| 381 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocManaged = 371,
|
| 382 |
+
CUPTI_DRIVER_TRACE_CBID_cuGetErrorString = 372,
|
| 383 |
+
CUPTI_DRIVER_TRACE_CBID_cuGetErrorName = 373,
|
| 384 |
+
CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessor = 374,
|
| 385 |
+
CUPTI_DRIVER_TRACE_CBID_cuCompilePtx = 375,
|
| 386 |
+
CUPTI_DRIVER_TRACE_CBID_cuBinaryFree = 376,
|
| 387 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync = 377,
|
| 388 |
+
CUPTI_DRIVER_TRACE_CBID_cuPointerSetAttribute = 378,
|
| 389 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister_v2 = 379,
|
| 390 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags_v2 = 380,
|
| 391 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkCreate_v2 = 381,
|
| 392 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkAddData_v2 = 382,
|
| 393 |
+
CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile_v2 = 383,
|
| 394 |
+
CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSize = 384,
|
| 395 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices_v2 = 385,
|
| 396 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRetain = 386,
|
| 397 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease = 387,
|
| 398 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags = 388,
|
| 399 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset = 389,
|
| 400 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsEGLRegisterImage = 390,
|
| 401 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetFlags = 391,
|
| 402 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxGetState = 392,
|
| 403 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnect = 393,
|
| 404 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerDisconnect = 394,
|
| 405 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerAcquireFrame = 395,
|
| 406 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerReleaseFrame = 396,
|
| 407 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2_ptds = 397,
|
| 408 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2_ptds = 398,
|
| 409 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2_ptds = 399,
|
| 410 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2_ptds = 400,
|
| 411 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2_ptds = 401,
|
| 412 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2_ptds = 402,
|
| 413 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2_ptds = 403,
|
| 414 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2_ptds = 404,
|
| 415 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2_ptds = 405,
|
| 416 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2_ptds = 406,
|
| 417 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2_ptds = 407,
|
| 418 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy_ptds = 408,
|
| 419 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer_ptds = 409,
|
| 420 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer_ptds = 410,
|
| 421 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2_ptds = 411,
|
| 422 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2_ptds = 412,
|
| 423 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2_ptds = 413,
|
| 424 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2_ptds = 414,
|
| 425 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2_ptds = 415,
|
| 426 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2_ptds = 416,
|
| 427 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2_ptds = 417,
|
| 428 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync_ptsz = 418,
|
| 429 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2_ptsz = 419,
|
| 430 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2_ptsz = 420,
|
| 431 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2_ptsz = 421,
|
| 432 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2_ptsz = 422,
|
| 433 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2_ptsz = 423,
|
| 434 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2_ptsz = 424,
|
| 435 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2_ptsz = 425,
|
| 436 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync_ptsz = 426,
|
| 437 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync_ptsz = 427,
|
| 438 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async_ptsz = 428,
|
| 439 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async_ptsz = 429,
|
| 440 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async_ptsz = 430,
|
| 441 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async_ptsz = 431,
|
| 442 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async_ptsz = 432,
|
| 443 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async_ptsz = 433,
|
| 444 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority_ptsz = 434,
|
| 445 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags_ptsz = 435,
|
| 446 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent_ptsz = 436,
|
| 447 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback_ptsz = 437,
|
| 448 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync_ptsz = 438,
|
| 449 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamQuery_ptsz = 439,
|
| 450 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize_ptsz = 440,
|
| 451 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventRecord_ptsz = 441,
|
| 452 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz = 442,
|
| 453 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources_ptsz = 443,
|
| 454 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources_ptsz = 444,
|
| 455 |
+
CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2_ptsz = 445,
|
| 456 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerConnect = 446,
|
| 457 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerDisconnect = 447,
|
| 458 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerPresentFrame = 448,
|
| 459 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedEglFrame = 449,
|
| 460 |
+
CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttributes = 450,
|
| 461 |
+
CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = 451,
|
| 462 |
+
CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSizeWithFlags = 452,
|
| 463 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerReturnFrame = 453,
|
| 464 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetP2PAttribute = 454,
|
| 465 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefSetBorderColor = 455,
|
| 466 |
+
CUPTI_DRIVER_TRACE_CBID_cuTexRefGetBorderColor = 456,
|
| 467 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAdvise = 457,
|
| 468 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32 = 458,
|
| 469 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_ptsz = 459,
|
| 470 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32 = 460,
|
| 471 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_ptsz = 461,
|
| 472 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp = 462,
|
| 473 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_ptsz = 463,
|
| 474 |
+
CUPTI_DRIVER_TRACE_CBID_cuNVNbufferGetPointer = 464,
|
| 475 |
+
CUPTI_DRIVER_TRACE_CBID_cuNVNtextureGetArray = 465,
|
| 476 |
+
CUPTI_DRIVER_TRACE_CBID_cuNNSetAllocator = 466,
|
| 477 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync = 467,
|
| 478 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_ptsz = 468,
|
| 479 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromNVNSync = 469,
|
| 480 |
+
CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnectWithFlags = 470,
|
| 481 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttribute = 471,
|
| 482 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttributes = 472,
|
| 483 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64 = 473,
|
| 484 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_ptsz = 474,
|
| 485 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64 = 475,
|
| 486 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_ptsz = 476,
|
| 487 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel = 477,
|
| 488 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz = 478,
|
| 489 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromEGLSync = 479,
|
| 490 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice = 480,
|
| 491 |
+
CUPTI_DRIVER_TRACE_CBID_cuFuncSetAttribute = 481,
|
| 492 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid = 482,
|
| 493 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx = 483,
|
| 494 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_ptsz = 484,
|
| 495 |
+
CUPTI_DRIVER_TRACE_CBID_cuImportExternalMemory = 485,
|
| 496 |
+
CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedBuffer = 486,
|
| 497 |
+
CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedMipmappedArray = 487,
|
| 498 |
+
CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalMemory = 488,
|
| 499 |
+
CUPTI_DRIVER_TRACE_CBID_cuImportExternalSemaphore = 489,
|
| 500 |
+
CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync = 490,
|
| 501 |
+
CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync_ptsz = 491,
|
| 502 |
+
CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync = 492,
|
| 503 |
+
CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync_ptsz = 493,
|
| 504 |
+
CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalSemaphore = 494,
|
| 505 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture = 495,
|
| 506 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_ptsz = 496,
|
| 507 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture = 497,
|
| 508 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture_ptsz = 498,
|
| 509 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing = 499,
|
| 510 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing_ptsz = 500,
|
| 511 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphCreate = 501,
|
| 512 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode = 502,
|
| 513 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams = 503,
|
| 514 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemcpyNode = 504,
|
| 515 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeGetParams = 505,
|
| 516 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemsetNode = 506,
|
| 517 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeGetParams = 507,
|
| 518 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeSetParams = 508,
|
| 519 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetType = 509,
|
| 520 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphGetRootNodes = 510,
|
| 521 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies = 511,
|
| 522 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes = 512,
|
| 523 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate = 513,
|
| 524 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch = 514,
|
| 525 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz = 515,
|
| 526 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecDestroy = 516,
|
| 527 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphDestroy = 517,
|
| 528 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies = 518,
|
| 529 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies = 519,
|
| 530 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeSetParams = 520,
|
| 531 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams = 521,
|
| 532 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphDestroyNode = 522,
|
| 533 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphClone = 523,
|
| 534 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphNodeFindInClone = 524,
|
| 535 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddChildGraphNode = 525,
|
| 536 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddEmptyNode = 526,
|
| 537 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc = 527,
|
| 538 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc_ptsz = 528,
|
| 539 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphChildGraphNodeGetGraph = 529,
|
| 540 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddHostNode = 530,
|
| 541 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeGetParams = 531,
|
| 542 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetLuid = 532,
|
| 543 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeSetParams = 533,
|
| 544 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphGetNodes = 534,
|
| 545 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges = 535,
|
| 546 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo = 536,
|
| 547 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_ptsz = 537,
|
| 548 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams = 538,
|
| 549 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2 = 539,
|
| 550 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2_ptsz = 540,
|
| 551 |
+
CUPTI_DRIVER_TRACE_CBID_cuThreadExchangeStreamCaptureMode = 541,
|
| 552 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetNvSciSyncAttributes = 542,
|
| 553 |
+
CUPTI_DRIVER_TRACE_CBID_cuOccupancyAvailableDynamicSMemPerBlock = 543,
|
| 554 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease_v2 = 544,
|
| 555 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset_v2 = 545,
|
| 556 |
+
CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags_v2 = 546,
|
| 557 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAddressReserve = 547,
|
| 558 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAddressFree = 548,
|
| 559 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemCreate = 549,
|
| 560 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemRelease = 550,
|
| 561 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemMap = 551,
|
| 562 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemUnmap = 552,
|
| 563 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemSetAccess = 553,
|
| 564 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemExportToShareableHandle = 554,
|
| 565 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemImportFromShareableHandle = 555,
|
| 566 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationGranularity = 556,
|
| 567 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationPropertiesFromHandle = 557,
|
| 568 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetAccess = 558,
|
| 569 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags = 559,
|
| 570 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags_ptsz = 560,
|
| 571 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate = 561,
|
| 572 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemcpyNodeSetParams = 562,
|
| 573 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemsetNodeSetParams = 563,
|
| 574 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecHostNodeSetParams = 564,
|
| 575 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemRetainAllocationHandle = 565,
|
| 576 |
+
CUPTI_DRIVER_TRACE_CBID_cuFuncGetModule = 566,
|
| 577 |
+
CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle_v2 = 567,
|
| 578 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxResetPersistingL2Cache = 568,
|
| 579 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeCopyAttributes = 569,
|
| 580 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetAttribute = 570,
|
| 581 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetAttribute = 571,
|
| 582 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes = 572,
|
| 583 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes_ptsz = 573,
|
| 584 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute = 574,
|
| 585 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute_ptsz = 575,
|
| 586 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute = 576,
|
| 587 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute_ptsz = 577,
|
| 588 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate_v2 = 578,
|
| 589 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetTexture1DLinearMaxWidth = 579,
|
| 590 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphUpload = 580,
|
| 591 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphUpload_ptsz = 581,
|
| 592 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayGetSparseProperties = 582,
|
| 593 |
+
CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetSparseProperties = 583,
|
| 594 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync = 584,
|
| 595 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync_ptsz = 585,
|
| 596 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecChildGraphNodeSetParams = 586,
|
| 597 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags = 587,
|
| 598 |
+
CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags_ptsz = 588,
|
| 599 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventRecordNode = 589,
|
| 600 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventWaitNode = 590,
|
| 601 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeGetEvent = 591,
|
| 602 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeGetEvent = 592,
|
| 603 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeSetEvent = 593,
|
| 604 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeSetEvent = 594,
|
| 605 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventRecordNodeSetEvent = 595,
|
| 606 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventWaitNodeSetEvent = 596,
|
| 607 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayGetPlane = 597,
|
| 608 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync = 598,
|
| 609 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync_ptsz = 599,
|
| 610 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync = 600,
|
| 611 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync_ptsz = 601,
|
| 612 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolTrimTo = 602,
|
| 613 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAttribute = 603,
|
| 614 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAttribute = 604,
|
| 615 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAccess = 605,
|
| 616 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDefaultMemPool = 606,
|
| 617 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolCreate = 607,
|
| 618 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolDestroy = 608,
|
| 619 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceSetMemPool = 609,
|
| 620 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetMemPool = 610,
|
| 621 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync = 611,
|
| 622 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync_ptsz = 612,
|
| 623 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportToShareableHandle = 613,
|
| 624 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportFromShareableHandle = 614,
|
| 625 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportPointer = 615,
|
| 626 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportPointer = 616,
|
| 627 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAccess = 617,
|
| 628 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresSignalNode = 618,
|
| 629 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeGetParams = 619,
|
| 630 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeSetParams = 620,
|
| 631 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresWaitNode = 621,
|
| 632 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeGetParams = 622,
|
| 633 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeSetParams = 623,
|
| 634 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresSignalNodeSetParams = 624,
|
| 635 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresWaitNodeSetParams = 625,
|
| 636 |
+
CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress = 626,
|
| 637 |
+
CUPTI_DRIVER_TRACE_CBID_cuFlushGPUDirectRDMAWrites = 627,
|
| 638 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphDebugDotPrint = 628,
|
| 639 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2 = 629,
|
| 640 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2_ptsz = 630,
|
| 641 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies = 631,
|
| 642 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_ptsz = 632,
|
| 643 |
+
CUPTI_DRIVER_TRACE_CBID_cuUserObjectCreate = 633,
|
| 644 |
+
CUPTI_DRIVER_TRACE_CBID_cuUserObjectRetain = 634,
|
| 645 |
+
CUPTI_DRIVER_TRACE_CBID_cuUserObjectRelease = 635,
|
| 646 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphRetainUserObject = 636,
|
| 647 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphReleaseUserObject = 637,
|
| 648 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemAllocNode = 638,
|
| 649 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemFreeNode = 639,
|
| 650 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGraphMemTrim = 640,
|
| 651 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetGraphMemAttribute = 641,
|
| 652 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceSetGraphMemAttribute = 642,
|
| 653 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithFlags = 643,
|
| 654 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetExecAffinitySupport = 644,
|
| 655 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v3 = 645,
|
| 656 |
+
CUPTI_DRIVER_TRACE_CBID_cuCtxGetExecAffinity = 646,
|
| 657 |
+
CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid_v2 = 647,
|
| 658 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphMemAllocNodeGetParams = 648,
|
| 659 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphMemFreeNodeGetParams = 649,
|
| 660 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetEnabled = 650,
|
| 661 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetEnabled = 651,
|
| 662 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx = 652,
|
| 663 |
+
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz = 653,
|
| 664 |
+
CUPTI_DRIVER_TRACE_CBID_cuArrayGetMemoryRequirements = 654,
|
| 665 |
+
CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetMemoryRequirements = 655,
|
| 666 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams = 656,
|
| 667 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams_ptsz = 657,
|
| 668 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecGetFlags = 658,
|
| 669 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2 = 659,
|
| 670 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2_ptsz = 660,
|
| 671 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2 = 661,
|
| 672 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2_ptsz = 662,
|
| 673 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2 = 663,
|
| 674 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2_ptsz = 664,
|
| 675 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2 = 665,
|
| 676 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2_ptsz = 666,
|
| 677 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2 = 667,
|
| 678 |
+
CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2_ptsz = 668,
|
| 679 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphAddBatchMemOpNode = 669,
|
| 680 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeGetParams = 670,
|
| 681 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeSetParams = 671,
|
| 682 |
+
CUPTI_DRIVER_TRACE_CBID_cuGraphExecBatchMemOpNodeSetParams = 672,
|
| 683 |
+
CUPTI_DRIVER_TRACE_CBID_cuModuleGetLoadingMode = 673,
|
| 684 |
+
CUPTI_DRIVER_TRACE_CBID_cuMemGetHandleForAddressRange = 674,
|
| 685 |
+
CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialClusterSize = 675,
|
| 686 |
+
CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveClusters = 676,
|
| 687 |
+
CUPTI_DRIVER_TRACE_CBID_SIZE = 677,
|
| 688 |
+
CUPTI_DRIVER_TRACE_CBID_FORCE_INT = 0x7fffffff
|
| 689 |
+
} CUpti_driver_api_trace_cbid;
|
| 690 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h
ADDED
|
@@ -0,0 +1,825 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2011-2020 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(_CUPTI_METRIC_H_)
|
| 51 |
+
#define _CUPTI_METRIC_H_
|
| 52 |
+
|
| 53 |
+
#include <cuda.h>
|
| 54 |
+
#include <string.h>
|
| 55 |
+
#include <cuda_stdint.h>
|
| 56 |
+
#include <cupti_result.h>
|
| 57 |
+
|
| 58 |
+
#ifndef CUPTIAPI
|
| 59 |
+
#ifdef _WIN32
|
| 60 |
+
#define CUPTIAPI __stdcall
|
| 61 |
+
#else
|
| 62 |
+
#define CUPTIAPI
|
| 63 |
+
#endif
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
#if defined(__cplusplus)
|
| 67 |
+
extern "C" {
|
| 68 |
+
#endif
|
| 69 |
+
|
| 70 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 71 |
+
#pragma GCC visibility push(default)
|
| 72 |
+
#endif
|
| 73 |
+
|
| 74 |
+
/**
|
| 75 |
+
* \defgroup CUPTI_METRIC_API CUPTI Metric API
|
| 76 |
+
* Functions, types, and enums that implement the CUPTI Metric API.
|
| 77 |
+
*
|
| 78 |
+
* \note CUPTI metric API from the header cupti_metrics.h are not supported on devices
|
| 79 |
+
* with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
|
| 80 |
+
* These API will be deprecated in a future CUDA release. These are replaced by
|
| 81 |
+
* Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
|
| 82 |
+
* in the headers nvperf_host.h and nvperf_target.h which are supported on
|
| 83 |
+
* devices with compute capability 7.0 and higher (i.e. Volta and later GPU
|
| 84 |
+
* architectures).
|
| 85 |
+
*
|
| 86 |
+
* @{
|
| 87 |
+
*/
|
| 88 |
+
|
| 89 |
+
/**
|
| 90 |
+
* \brief ID for a metric.
|
| 91 |
+
*
|
| 92 |
+
* A metric provides a measure of some aspect of the device.
|
| 93 |
+
*/
|
| 94 |
+
typedef uint32_t CUpti_MetricID;
|
| 95 |
+
|
| 96 |
+
/**
|
| 97 |
+
* \brief A metric category.
|
| 98 |
+
*
|
| 99 |
+
* Each metric is assigned to a category that represents the general
|
| 100 |
+
* type of the metric. A metric's category is accessed using \ref
|
| 101 |
+
* cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_CATEGORY
|
| 102 |
+
* attribute.
|
| 103 |
+
*/
|
| 104 |
+
typedef enum {
|
| 105 |
+
/**
|
| 106 |
+
* A memory related metric.
|
| 107 |
+
*/
|
| 108 |
+
CUPTI_METRIC_CATEGORY_MEMORY = 0,
|
| 109 |
+
/**
|
| 110 |
+
* An instruction related metric.
|
| 111 |
+
*/
|
| 112 |
+
CUPTI_METRIC_CATEGORY_INSTRUCTION = 1,
|
| 113 |
+
/**
|
| 114 |
+
* A multiprocessor related metric.
|
| 115 |
+
*/
|
| 116 |
+
CUPTI_METRIC_CATEGORY_MULTIPROCESSOR = 2,
|
| 117 |
+
/**
|
| 118 |
+
* A cache related metric.
|
| 119 |
+
*/
|
| 120 |
+
CUPTI_METRIC_CATEGORY_CACHE = 3,
|
| 121 |
+
/**
|
| 122 |
+
* A texture related metric.
|
| 123 |
+
*/
|
| 124 |
+
CUPTI_METRIC_CATEGORY_TEXTURE = 4,
|
| 125 |
+
/**
|
| 126 |
+
*A Nvlink related metric.
|
| 127 |
+
*/
|
| 128 |
+
CUPTI_METRIC_CATEGORY_NVLINK = 5,
|
| 129 |
+
/**
|
| 130 |
+
*A PCIe related metric.
|
| 131 |
+
*/
|
| 132 |
+
CUPTI_METRIC_CATEGORY_PCIE = 6,
|
| 133 |
+
CUPTI_METRIC_CATEGORY_FORCE_INT = 0x7fffffff,
|
| 134 |
+
} CUpti_MetricCategory;
|
| 135 |
+
|
| 136 |
+
/**
|
| 137 |
+
* \brief A metric evaluation mode.
|
| 138 |
+
*
|
| 139 |
+
* A metric can be evaluated per hardware instance to know the load balancing
|
| 140 |
+
* across instances of a domain or the metric can be evaluated in aggregate mode
|
| 141 |
+
* when the events involved in metric evaluation are from different event
|
| 142 |
+
* domains. It might be possible to evaluate some metrics in both
|
| 143 |
+
* modes for convenience. A metric's evaluation mode is accessed using \ref
|
| 144 |
+
* CUpti_MetricEvaluationMode and the CUPTI_METRIC_ATTR_EVALUATION_MODE
|
| 145 |
+
* attribute.
|
| 146 |
+
*/
|
| 147 |
+
typedef enum {
|
| 148 |
+
/**
|
| 149 |
+
* If this bit is set, the metric can be profiled for each instance of the
|
| 150 |
+
* domain. The event values passed to \ref cuptiMetricGetValue can contain
|
| 151 |
+
* values for one instance of the domain. And \ref cuptiMetricGetValue can
|
| 152 |
+
* be called for each instance.
|
| 153 |
+
*/
|
| 154 |
+
CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE = 1,
|
| 155 |
+
/**
|
| 156 |
+
* If this bit is set, the metric can be profiled over all instances. The
|
| 157 |
+
* event values passed to \ref cuptiMetricGetValue can be aggregated values
|
| 158 |
+
* of events for all instances of the domain.
|
| 159 |
+
*/
|
| 160 |
+
CUPTI_METRIC_EVALUATION_MODE_AGGREGATE = 1 << 1,
|
| 161 |
+
CUPTI_METRIC_EVALUATION_MODE_FORCE_INT = 0x7fffffff,
|
| 162 |
+
} CUpti_MetricEvaluationMode;
|
| 163 |
+
|
| 164 |
+
/**
|
| 165 |
+
* \brief Kinds of metric values.
|
| 166 |
+
*
|
| 167 |
+
* Metric values can be one of several different kinds. Corresponding
|
| 168 |
+
* to each kind is a member of the CUpti_MetricValue union. The metric
|
| 169 |
+
* value returned by \ref cuptiMetricGetValue should be accessed using
|
| 170 |
+
* the appropriate member of that union based on its value kind.
|
| 171 |
+
*/
|
| 172 |
+
typedef enum {
|
| 173 |
+
/**
|
| 174 |
+
* The metric value is a 64-bit double.
|
| 175 |
+
*/
|
| 176 |
+
CUPTI_METRIC_VALUE_KIND_DOUBLE = 0,
|
| 177 |
+
/**
|
| 178 |
+
* The metric value is a 64-bit unsigned integer.
|
| 179 |
+
*/
|
| 180 |
+
CUPTI_METRIC_VALUE_KIND_UINT64 = 1,
|
| 181 |
+
/**
|
| 182 |
+
* The metric value is a percentage represented by a 64-bit
|
| 183 |
+
* double. For example, 57.5% is represented by the value 57.5.
|
| 184 |
+
*/
|
| 185 |
+
CUPTI_METRIC_VALUE_KIND_PERCENT = 2,
|
| 186 |
+
/**
|
| 187 |
+
* The metric value is a throughput represented by a 64-bit
|
| 188 |
+
* integer. The unit for throughput values is bytes/second.
|
| 189 |
+
*/
|
| 190 |
+
CUPTI_METRIC_VALUE_KIND_THROUGHPUT = 3,
|
| 191 |
+
/**
|
| 192 |
+
* The metric value is a 64-bit signed integer.
|
| 193 |
+
*/
|
| 194 |
+
CUPTI_METRIC_VALUE_KIND_INT64 = 4,
|
| 195 |
+
/**
|
| 196 |
+
* The metric value is a utilization level, as represented by
|
| 197 |
+
* CUpti_MetricValueUtilizationLevel.
|
| 198 |
+
*/
|
| 199 |
+
CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL = 5,
|
| 200 |
+
|
| 201 |
+
CUPTI_METRIC_VALUE_KIND_FORCE_INT = 0x7fffffff
|
| 202 |
+
} CUpti_MetricValueKind;
|
| 203 |
+
|
| 204 |
+
/**
|
| 205 |
+
* \brief Enumeration of utilization levels for metrics values of kind
|
| 206 |
+
* CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL. Utilization values can
|
| 207 |
+
* vary from IDLE (0) to MAX (10) but the enumeration only provides
|
| 208 |
+
* specific names for a few values.
|
| 209 |
+
*/
|
| 210 |
+
typedef enum {
|
| 211 |
+
CUPTI_METRIC_VALUE_UTILIZATION_IDLE = 0,
|
| 212 |
+
CUPTI_METRIC_VALUE_UTILIZATION_LOW = 2,
|
| 213 |
+
CUPTI_METRIC_VALUE_UTILIZATION_MID = 5,
|
| 214 |
+
CUPTI_METRIC_VALUE_UTILIZATION_HIGH = 8,
|
| 215 |
+
CUPTI_METRIC_VALUE_UTILIZATION_MAX = 10,
|
| 216 |
+
CUPTI_METRIC_VALUE_UTILIZATION_FORCE_INT = 0x7fffffff
|
| 217 |
+
} CUpti_MetricValueUtilizationLevel;
|
| 218 |
+
|
| 219 |
+
/**
|
| 220 |
+
* \brief Metric attributes.
|
| 221 |
+
*
|
| 222 |
+
* Metric attributes describe properties of a metric. These attributes
|
| 223 |
+
* can be read using \ref cuptiMetricGetAttribute.
|
| 224 |
+
*/
|
| 225 |
+
typedef enum {
|
| 226 |
+
/**
|
| 227 |
+
* Metric name. Value is a null terminated const c-string.
|
| 228 |
+
*/
|
| 229 |
+
CUPTI_METRIC_ATTR_NAME = 0,
|
| 230 |
+
/**
|
| 231 |
+
* Short description of metric. Value is a null terminated const c-string.
|
| 232 |
+
*/
|
| 233 |
+
CUPTI_METRIC_ATTR_SHORT_DESCRIPTION = 1,
|
| 234 |
+
/**
|
| 235 |
+
* Long description of metric. Value is a null terminated const c-string.
|
| 236 |
+
*/
|
| 237 |
+
CUPTI_METRIC_ATTR_LONG_DESCRIPTION = 2,
|
| 238 |
+
/**
|
| 239 |
+
* Category of the metric. Value is of type CUpti_MetricCategory.
|
| 240 |
+
*/
|
| 241 |
+
CUPTI_METRIC_ATTR_CATEGORY = 3,
|
| 242 |
+
/**
|
| 243 |
+
* Value type of the metric. Value is of type CUpti_MetricValueKind.
|
| 244 |
+
*/
|
| 245 |
+
CUPTI_METRIC_ATTR_VALUE_KIND = 4,
|
| 246 |
+
/**
|
| 247 |
+
* Metric evaluation mode. Value is of type CUpti_MetricEvaluationMode.
|
| 248 |
+
*/
|
| 249 |
+
CUPTI_METRIC_ATTR_EVALUATION_MODE = 5,
|
| 250 |
+
CUPTI_METRIC_ATTR_FORCE_INT = 0x7fffffff,
|
| 251 |
+
} CUpti_MetricAttribute;
|
| 252 |
+
|
| 253 |
+
/**
|
| 254 |
+
* \brief A metric value.
|
| 255 |
+
*
|
| 256 |
+
* Metric values can be one of several different kinds. Corresponding
|
| 257 |
+
* to each kind is a member of the CUpti_MetricValue union. The metric
|
| 258 |
+
* value returned by \ref cuptiMetricGetValue should be accessed using
|
| 259 |
+
* the appropriate member of that union based on its value kind.
|
| 260 |
+
*/
|
| 261 |
+
typedef union {
|
| 262 |
+
/*
|
| 263 |
+
* Value for CUPTI_METRIC_VALUE_KIND_DOUBLE.
|
| 264 |
+
*/
|
| 265 |
+
double metricValueDouble;
|
| 266 |
+
/*
|
| 267 |
+
* Value for CUPTI_METRIC_VALUE_KIND_UINT64.
|
| 268 |
+
*/
|
| 269 |
+
uint64_t metricValueUint64;
|
| 270 |
+
/*
|
| 271 |
+
* Value for CUPTI_METRIC_VALUE_KIND_INT64.
|
| 272 |
+
*/
|
| 273 |
+
int64_t metricValueInt64;
|
| 274 |
+
/*
|
| 275 |
+
* Value for CUPTI_METRIC_VALUE_KIND_PERCENT. For example, 57.5% is
|
| 276 |
+
* represented by the value 57.5.
|
| 277 |
+
*/
|
| 278 |
+
double metricValuePercent;
|
| 279 |
+
/*
|
| 280 |
+
* Value for CUPTI_METRIC_VALUE_KIND_THROUGHPUT. The unit for
|
| 281 |
+
* throughput values is bytes/second.
|
| 282 |
+
*/
|
| 283 |
+
uint64_t metricValueThroughput;
|
| 284 |
+
/*
|
| 285 |
+
* Value for CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
|
| 286 |
+
*/
|
| 287 |
+
CUpti_MetricValueUtilizationLevel metricValueUtilizationLevel;
|
| 288 |
+
} CUpti_MetricValue;
|
| 289 |
+
|
| 290 |
+
/**
|
| 291 |
+
* \brief Device class.
|
| 292 |
+
*
|
| 293 |
+
* Enumeration of device classes for metric property
|
| 294 |
+
* CUPTI_METRIC_PROPERTY_DEVICE_CLASS.
|
| 295 |
+
*/
|
| 296 |
+
typedef enum {
|
| 297 |
+
CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TESLA = 0,
|
| 298 |
+
CUPTI_METRIC_PROPERTY_DEVICE_CLASS_QUADRO = 1,
|
| 299 |
+
CUPTI_METRIC_PROPERTY_DEVICE_CLASS_GEFORCE = 2,
|
| 300 |
+
CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TEGRA = 3,
|
| 301 |
+
} CUpti_MetricPropertyDeviceClass;
|
| 302 |
+
|
| 303 |
+
/**
|
| 304 |
+
* \brief Metric device properties.
|
| 305 |
+
*
|
| 306 |
+
* Metric device properties describe device properties which are needed for a metric.
|
| 307 |
+
* Some of these properties can be collected using cuDeviceGetAttribute.
|
| 308 |
+
*/
|
| 309 |
+
typedef enum {
|
| 310 |
+
/*
|
| 311 |
+
* Number of multiprocessors on a device. This can be collected
|
| 312 |
+
* using value of \param CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT of
|
| 313 |
+
* cuDeviceGetAttribute.
|
| 314 |
+
*/
|
| 315 |
+
CUPTI_METRIC_PROPERTY_MULTIPROCESSOR_COUNT,
|
| 316 |
+
/*
|
| 317 |
+
* Maximum number of warps on a multiprocessor. This can be
|
| 318 |
+
* collected using ratio of value of \param
|
| 319 |
+
* CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR and \param
|
| 320 |
+
* CU_DEVICE_ATTRIBUTE_WARP_SIZE of cuDeviceGetAttribute.
|
| 321 |
+
*/
|
| 322 |
+
CUPTI_METRIC_PROPERTY_WARPS_PER_MULTIPROCESSOR,
|
| 323 |
+
/*
|
| 324 |
+
* GPU Time for kernel in ns. This should be profiled using CUPTI
|
| 325 |
+
* Activity API.
|
| 326 |
+
*/
|
| 327 |
+
CUPTI_METRIC_PROPERTY_KERNEL_GPU_TIME,
|
| 328 |
+
/*
|
| 329 |
+
* Clock rate for device in KHz. This should be collected using
|
| 330 |
+
* value of \param CU_DEVICE_ATTRIBUTE_CLOCK_RATE of
|
| 331 |
+
* cuDeviceGetAttribute.
|
| 332 |
+
*/
|
| 333 |
+
CUPTI_METRIC_PROPERTY_CLOCK_RATE,
|
| 334 |
+
/*
|
| 335 |
+
* Number of Frame buffer units for device. This should be collected
|
| 336 |
+
* using value of \param CUPTI_DEVICE_ATTRIBUTE_MAX_FRAME_BUFFERS of
|
| 337 |
+
* cuptiDeviceGetAttribute.
|
| 338 |
+
*/
|
| 339 |
+
CUPTI_METRIC_PROPERTY_FRAME_BUFFER_COUNT,
|
| 340 |
+
/*
|
| 341 |
+
* Global memory bandwidth in KBytes/sec. This should be collected
|
| 342 |
+
* using value of \param CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH
|
| 343 |
+
* of cuptiDeviceGetAttribute.
|
| 344 |
+
*/
|
| 345 |
+
CUPTI_METRIC_PROPERTY_GLOBAL_MEMORY_BANDWIDTH,
|
| 346 |
+
/*
|
| 347 |
+
* PCIE link rate in Mega bits/sec. This should be collected using
|
| 348 |
+
* value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_RATE of
|
| 349 |
+
* cuptiDeviceGetAttribute.
|
| 350 |
+
*/
|
| 351 |
+
CUPTI_METRIC_PROPERTY_PCIE_LINK_RATE,
|
| 352 |
+
/*
|
| 353 |
+
* PCIE link width for device. This should be collected using
|
| 354 |
+
* value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH of
|
| 355 |
+
* cuptiDeviceGetAttribute.
|
| 356 |
+
*/
|
| 357 |
+
CUPTI_METRIC_PROPERTY_PCIE_LINK_WIDTH,
|
| 358 |
+
/*
|
| 359 |
+
* PCIE generation for device. This should be collected using
|
| 360 |
+
* value of \param CUPTI_DEVICE_ATTR_PCIE_GEN of
|
| 361 |
+
* cuptiDeviceGetAttribute.
|
| 362 |
+
*/
|
| 363 |
+
CUPTI_METRIC_PROPERTY_PCIE_GEN,
|
| 364 |
+
/*
|
| 365 |
+
* The device class. This should be collected using
|
| 366 |
+
* value of \param CUPTI_DEVICE_ATTR_DEVICE_CLASS of
|
| 367 |
+
* cuptiDeviceGetAttribute.
|
| 368 |
+
*/
|
| 369 |
+
CUPTI_METRIC_PROPERTY_DEVICE_CLASS,
|
| 370 |
+
/*
|
| 371 |
+
* Peak single precision floating point operations that
|
| 372 |
+
* can be performed in one cycle by the device.
|
| 373 |
+
* This should be collected using value of
|
| 374 |
+
* \param CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE of
|
| 375 |
+
* cuptiDeviceGetAttribute.
|
| 376 |
+
*/
|
| 377 |
+
CUPTI_METRIC_PROPERTY_FLOP_SP_PER_CYCLE,
|
| 378 |
+
/*
|
| 379 |
+
* Peak double precision floating point operations that
|
| 380 |
+
* can be performed in one cycle by the device.
|
| 381 |
+
* This should be collected using value of
|
| 382 |
+
* \param CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE of
|
| 383 |
+
* cuptiDeviceGetAttribute.
|
| 384 |
+
*/
|
| 385 |
+
CUPTI_METRIC_PROPERTY_FLOP_DP_PER_CYCLE,
|
| 386 |
+
/*
|
| 387 |
+
* Number of L2 units on a device. This can be collected
|
| 388 |
+
* using value of \param CUPTI_DEVICE_ATTR_MAX_L2_UNITS of
|
| 389 |
+
* cuDeviceGetAttribute.
|
| 390 |
+
*/
|
| 391 |
+
CUPTI_METRIC_PROPERTY_L2_UNITS,
|
| 392 |
+
/*
|
| 393 |
+
* Whether ECC support is enabled on the device. This can be
|
| 394 |
+
* collected using value of \param CU_DEVICE_ATTRIBUTE_ECC_ENABLED of
|
| 395 |
+
* cuDeviceGetAttribute.
|
| 396 |
+
*/
|
| 397 |
+
CUPTI_METRIC_PROPERTY_ECC_ENABLED,
|
| 398 |
+
/*
|
| 399 |
+
* Peak half precision floating point operations that
|
| 400 |
+
* can be performed in one cycle by the device.
|
| 401 |
+
* This should be collected using value of
|
| 402 |
+
* \param CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE of
|
| 403 |
+
* cuptiDeviceGetAttribute.
|
| 404 |
+
*/
|
| 405 |
+
CUPTI_METRIC_PROPERTY_FLOP_HP_PER_CYCLE,
|
| 406 |
+
/*
|
| 407 |
+
* NVLINK Bandwitdh for device. This should be collected
|
| 408 |
+
* using value of \param CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW of
|
| 409 |
+
* cuptiDeviceGetAttribute.
|
| 410 |
+
*/
|
| 411 |
+
CUPTI_METRIC_PROPERTY_GPU_CPU_NVLINK_BANDWIDTH,
|
| 412 |
+
} CUpti_MetricPropertyID;
|
| 413 |
+
|
| 414 |
+
/**
|
| 415 |
+
* \brief Get the total number of metrics available on any device.
|
| 416 |
+
*
|
| 417 |
+
* Returns the total number of metrics available on any CUDA-capable
|
| 418 |
+
* devices.
|
| 419 |
+
*
|
| 420 |
+
* \param numMetrics Returns the number of metrics
|
| 421 |
+
*
|
| 422 |
+
* \retval CUPTI_SUCCESS
|
| 423 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
|
| 424 |
+
*/
|
| 425 |
+
CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics);
|
| 426 |
+
|
| 427 |
+
/**
|
| 428 |
+
* \brief Get all the metrics available on any device.
|
| 429 |
+
*
|
| 430 |
+
* Returns the metric IDs in \p metricArray for all CUDA-capable
|
| 431 |
+
* devices. The size of the \p metricArray buffer is given by \p
|
| 432 |
+
* *arraySizeBytes. The size of the \p metricArray buffer must be at
|
| 433 |
+
* least \p numMetrics * sizeof(CUpti_MetricID) or all metric IDs will
|
| 434 |
+
* not be returned. The value returned in \p *arraySizeBytes contains
|
| 435 |
+
* the number of bytes returned in \p metricArray.
|
| 436 |
+
*
|
| 437 |
+
* \param arraySizeBytes The size of \p metricArray in bytes, and
|
| 438 |
+
* returns the number of bytes written to \p metricArray
|
| 439 |
+
* \param metricArray Returns the IDs of the metrics
|
| 440 |
+
*
|
| 441 |
+
* \retval CUPTI_SUCCESS
|
| 442 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
|
| 443 |
+
* \p metricArray are NULL
|
| 444 |
+
*/
|
| 445 |
+
CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
|
| 446 |
+
CUpti_MetricID *metricArray);
|
| 447 |
+
|
| 448 |
+
/**
|
| 449 |
+
* \brief Get the number of metrics for a device.
|
| 450 |
+
*
|
| 451 |
+
* Returns the number of metrics available for a device.
|
| 452 |
+
*
|
| 453 |
+
* \param device The CUDA device
|
| 454 |
+
* \param numMetrics Returns the number of metrics available for the
|
| 455 |
+
* device
|
| 456 |
+
*
|
| 457 |
+
* \retval CUPTI_SUCCESS
|
| 458 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 459 |
+
* \retval CUPTI_ERROR_INVALID_DEVICE
|
| 460 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
|
| 461 |
+
*/
|
| 462 |
+
CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
|
| 463 |
+
uint32_t *numMetrics);
|
| 464 |
+
|
| 465 |
+
/**
|
| 466 |
+
* \brief Get the metrics for a device.
|
| 467 |
+
*
|
| 468 |
+
* Returns the metric IDs in \p metricArray for a device. The size of
|
| 469 |
+
* the \p metricArray buffer is given by \p *arraySizeBytes. The size
|
| 470 |
+
* of the \p metricArray buffer must be at least \p numMetrics *
|
| 471 |
+
* sizeof(CUpti_MetricID) or else all metric IDs will not be
|
| 472 |
+
* returned. The value returned in \p *arraySizeBytes contains the
|
| 473 |
+
* number of bytes returned in \p metricArray.
|
| 474 |
+
*
|
| 475 |
+
* \param device The CUDA device
|
| 476 |
+
* \param arraySizeBytes The size of \p metricArray in bytes, and
|
| 477 |
+
* returns the number of bytes written to \p metricArray
|
| 478 |
+
* \param metricArray Returns the IDs of the metrics for the device
|
| 479 |
+
*
|
| 480 |
+
* \retval CUPTI_SUCCESS
|
| 481 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 482 |
+
* \retval CUPTI_ERROR_INVALID_DEVICE
|
| 483 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
|
| 484 |
+
* \p metricArray are NULL
|
| 485 |
+
*/
|
| 486 |
+
CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
|
| 487 |
+
size_t *arraySizeBytes,
|
| 488 |
+
CUpti_MetricID *metricArray);
|
| 489 |
+
|
| 490 |
+
/**
|
| 491 |
+
* \brief Get a metric attribute.
|
| 492 |
+
*
|
| 493 |
+
* Returns a metric attribute in \p *value. The size of the \p
|
| 494 |
+
* value buffer is given by \p *valueSize. The value returned in \p
|
| 495 |
+
* *valueSize contains the number of bytes returned in \p value.
|
| 496 |
+
*
|
| 497 |
+
* If the attribute value is a c-string that is longer than \p
|
| 498 |
+
* *valueSize, then only the first \p *valueSize characters will be
|
| 499 |
+
* returned and there will be no terminating null byte.
|
| 500 |
+
*
|
| 501 |
+
* \param metric ID of the metric
|
| 502 |
+
* \param attrib The metric attribute to read
|
| 503 |
+
* \param valueSize The size of the \p value buffer in bytes, and
|
| 504 |
+
* returns the number of bytes written to \p value
|
| 505 |
+
* \param value Returns the attribute's value
|
| 506 |
+
*
|
| 507 |
+
* \retval CUPTI_SUCCESS
|
| 508 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 509 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 510 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
|
| 511 |
+
* is NULL, or if \p attrib is not a metric attribute
|
| 512 |
+
* \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
|
| 513 |
+
* attribute values, indicates that the \p value buffer is too small
|
| 514 |
+
* to hold the attribute value.
|
| 515 |
+
*/
|
| 516 |
+
CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
|
| 517 |
+
CUpti_MetricAttribute attrib,
|
| 518 |
+
size_t *valueSize,
|
| 519 |
+
void *value);
|
| 520 |
+
|
| 521 |
+
/**
|
| 522 |
+
* \brief Find an metric by name.
|
| 523 |
+
*
|
| 524 |
+
* Find a metric by name and return the metric ID in \p *metric.
|
| 525 |
+
*
|
| 526 |
+
* \param device The CUDA device
|
| 527 |
+
* \param metricName The name of metric to find
|
| 528 |
+
* \param metric Returns the ID of the found metric or undefined if
|
| 529 |
+
* unable to find the metric
|
| 530 |
+
*
|
| 531 |
+
* \retval CUPTI_SUCCESS
|
| 532 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 533 |
+
* \retval CUPTI_ERROR_INVALID_DEVICE
|
| 534 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_NAME if unable to find a metric
|
| 535 |
+
* with name \p metricName. In this case \p *metric is undefined
|
| 536 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricName or \p
|
| 537 |
+
* metric are NULL.
|
| 538 |
+
*/
|
| 539 |
+
CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
|
| 540 |
+
const char *metricName,
|
| 541 |
+
CUpti_MetricID *metric);
|
| 542 |
+
|
| 543 |
+
/**
|
| 544 |
+
* \brief Get number of events required to calculate a metric.
|
| 545 |
+
*
|
| 546 |
+
* Returns the number of events in \p numEvents that are required to
|
| 547 |
+
* calculate a metric.
|
| 548 |
+
*
|
| 549 |
+
* \param metric ID of the metric
|
| 550 |
+
* \param numEvents Returns the number of events required for the metric
|
| 551 |
+
*
|
| 552 |
+
* \retval CUPTI_SUCCESS
|
| 553 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 554 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 555 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
|
| 556 |
+
*/
|
| 557 |
+
CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
|
| 558 |
+
uint32_t *numEvents);
|
| 559 |
+
|
| 560 |
+
/**
|
| 561 |
+
* \brief Get the events required to calculating a metric.
|
| 562 |
+
*
|
| 563 |
+
* Gets the event IDs in \p eventIdArray required to calculate a \p
|
| 564 |
+
* metric. The size of the \p eventIdArray buffer is given by \p
|
| 565 |
+
* *eventIdArraySizeBytes and must be at least \p numEvents *
|
| 566 |
+
* sizeof(CUpti_EventID) or all events will not be returned. The value
|
| 567 |
+
* returned in \p *eventIdArraySizeBytes contains the number of bytes
|
| 568 |
+
* returned in \p eventIdArray.
|
| 569 |
+
*
|
| 570 |
+
* \param metric ID of the metric
|
| 571 |
+
* \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
|
| 572 |
+
* and returns the number of bytes written to \p eventIdArray
|
| 573 |
+
* \param eventIdArray Returns the IDs of the events required to
|
| 574 |
+
* calculate \p metric
|
| 575 |
+
*
|
| 576 |
+
* \retval CUPTI_SUCCESS
|
| 577 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 578 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 579 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArraySizeBytes or \p
|
| 580 |
+
* eventIdArray are NULL.
|
| 581 |
+
*/
|
| 582 |
+
CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
|
| 583 |
+
size_t *eventIdArraySizeBytes,
|
| 584 |
+
CUpti_EventID *eventIdArray);
|
| 585 |
+
|
| 586 |
+
/**
|
| 587 |
+
* \brief Get number of properties required to calculate a metric.
|
| 588 |
+
*
|
| 589 |
+
* Returns the number of properties in \p numProp that are required to
|
| 590 |
+
* calculate a metric.
|
| 591 |
+
*
|
| 592 |
+
* \param metric ID of the metric
|
| 593 |
+
* \param numProp Returns the number of properties required for the
|
| 594 |
+
* metric
|
| 595 |
+
*
|
| 596 |
+
* \retval CUPTI_SUCCESS
|
| 597 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 598 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 599 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p numProp is NULL
|
| 600 |
+
*/
|
| 601 |
+
CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
|
| 602 |
+
uint32_t *numProp);
|
| 603 |
+
|
| 604 |
+
/**
|
| 605 |
+
* \brief Get the properties required to calculating a metric.
|
| 606 |
+
*
|
| 607 |
+
* Gets the property IDs in \p propIdArray required to calculate a \p
|
| 608 |
+
* metric. The size of the \p propIdArray buffer is given by \p
|
| 609 |
+
* *propIdArraySizeBytes and must be at least \p numProp *
|
| 610 |
+
* sizeof(CUpti_DeviceAttribute) or all properties will not be
|
| 611 |
+
* returned. The value returned in \p *propIdArraySizeBytes contains
|
| 612 |
+
* the number of bytes returned in \p propIdArray.
|
| 613 |
+
*
|
| 614 |
+
* \param metric ID of the metric
|
| 615 |
+
* \param propIdArraySizeBytes The size of \p propIdArray in bytes,
|
| 616 |
+
* and returns the number of bytes written to \p propIdArray
|
| 617 |
+
* \param propIdArray Returns the IDs of the properties required to
|
| 618 |
+
* calculate \p metric
|
| 619 |
+
*
|
| 620 |
+
* \retval CUPTI_SUCCESS
|
| 621 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 622 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 623 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p propIdArraySizeBytes or \p
|
| 624 |
+
* propIdArray are NULL.
|
| 625 |
+
*/
|
| 626 |
+
CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
|
| 627 |
+
size_t *propIdArraySizeBytes,
|
| 628 |
+
CUpti_MetricPropertyID *propIdArray);
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
/**
|
| 632 |
+
* \brief For a metric get the groups of events that must be collected
|
| 633 |
+
* in the same pass.
|
| 634 |
+
*
|
| 635 |
+
* For a metric get the groups of events that must be collected in the
|
| 636 |
+
* same pass to ensure that the metric is calculated correctly. If the
|
| 637 |
+
* events are not collected as specified then the metric value may be
|
| 638 |
+
* inaccurate.
|
| 639 |
+
*
|
| 640 |
+
* The function returns NULL if a metric does not have any required
|
| 641 |
+
* event group. In this case the events needed for the metric can be
|
| 642 |
+
* grouped in any manner for collection.
|
| 643 |
+
*
|
| 644 |
+
* \param context The context for event collection
|
| 645 |
+
* \param metric The metric ID
|
| 646 |
+
* \param eventGroupSets Returns a CUpti_EventGroupSets object that
|
| 647 |
+
* indicates the events that must be collected in the same pass to
|
| 648 |
+
* ensure the metric is calculated correctly. Returns NULL if no
|
| 649 |
+
* grouping is required for metric
|
| 650 |
+
* \retval CUPTI_SUCCESS
|
| 651 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 652 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 653 |
+
*/
|
| 654 |
+
CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
|
| 655 |
+
CUpti_MetricID metric,
|
| 656 |
+
CUpti_EventGroupSets **eventGroupSets);
|
| 657 |
+
|
| 658 |
+
/**
|
| 659 |
+
* \brief For a set of metrics, get the grouping that indicates the
|
| 660 |
+
* number of passes and the event groups necessary to collect the
|
| 661 |
+
* events required for those metrics.
|
| 662 |
+
*
|
| 663 |
+
* For a set of metrics, get the grouping that indicates the number of
|
| 664 |
+
* passes and the event groups necessary to collect the events
|
| 665 |
+
* required for those metrics.
|
| 666 |
+
*
|
| 667 |
+
* \see cuptiEventGroupSetsCreate for details on event group set
|
| 668 |
+
* creation.
|
| 669 |
+
*
|
| 670 |
+
* \param context The context for event collection
|
| 671 |
+
* \param metricIdArraySizeBytes Size of the metricIdArray in bytes
|
| 672 |
+
* \param metricIdArray Array of metric IDs
|
| 673 |
+
* \param eventGroupPasses Returns a CUpti_EventGroupSets object that
|
| 674 |
+
* indicates the number of passes required to collect the events and
|
| 675 |
+
* the events to collect on each pass
|
| 676 |
+
*
|
| 677 |
+
* \retval CUPTI_SUCCESS
|
| 678 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 679 |
+
* \retval CUPTI_ERROR_INVALID_CONTEXT
|
| 680 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 681 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricIdArray or
|
| 682 |
+
* \p eventGroupPasses is NULL
|
| 683 |
+
*/
|
| 684 |
+
CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
|
| 685 |
+
size_t metricIdArraySizeBytes,
|
| 686 |
+
CUpti_MetricID *metricIdArray,
|
| 687 |
+
CUpti_EventGroupSets **eventGroupPasses);
|
| 688 |
+
|
| 689 |
+
/**
|
| 690 |
+
* \brief Calculate the value for a metric.
|
| 691 |
+
*
|
| 692 |
+
* Use the events collected for a metric to calculate the metric
|
| 693 |
+
* value. Metric value evaluation depends on the evaluation mode
|
| 694 |
+
* \ref CUpti_MetricEvaluationMode that the metric supports.
|
| 695 |
+
* If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
|
| 696 |
+
* then it assumes that the input event value is for one domain instance.
|
| 697 |
+
* If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE,
|
| 698 |
+
* it assumes that input event values are
|
| 699 |
+
* normalized to represent all domain instances on a device. For the
|
| 700 |
+
* most accurate metric collection, the events required for the metric
|
| 701 |
+
* should be collected for all profiled domain instances. For example,
|
| 702 |
+
* to collect all instances of an event, set the
|
| 703 |
+
* CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
|
| 704 |
+
* the group containing the event to 1. The normalized value for the
|
| 705 |
+
* event is then: (\p sum_event_values * \p totalInstanceCount) / \p
|
| 706 |
+
* instanceCount, where \p sum_event_values is the summation of the
|
| 707 |
+
* event values across all profiled domain instances, \p
|
| 708 |
+
* totalInstanceCount is obtained from querying
|
| 709 |
+
* CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
|
| 710 |
+
* is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
|
| 711 |
+
* CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
|
| 712 |
+
*
|
| 713 |
+
* \param device The CUDA device that the metric is being calculated for
|
| 714 |
+
* \param metric The metric ID
|
| 715 |
+
* \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
|
| 716 |
+
* \param eventIdArray The event IDs required to calculate \p metric
|
| 717 |
+
* \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
|
| 718 |
+
* \param eventValueArray The normalized event values required to
|
| 719 |
+
* calculate \p metric. The values must be order to match the order of
|
| 720 |
+
* events in \p eventIdArray
|
| 721 |
+
* \param timeDuration The duration over which the events were
|
| 722 |
+
* collected, in ns
|
| 723 |
+
* \param metricValue Returns the value for the metric
|
| 724 |
+
*
|
| 725 |
+
* \retval CUPTI_SUCCESS
|
| 726 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 727 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 728 |
+
* \retval CUPTI_ERROR_INVALID_OPERATION
|
| 729 |
+
* \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
|
| 730 |
+
* eventIdArray does not contain all the events needed for metric
|
| 731 |
+
* \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
|
| 732 |
+
* event values required for the metric is CUPTI_EVENT_OVERFLOW
|
| 733 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_VALUE if the computed metric value
|
| 734 |
+
* cannot be represented in the metric's value type. For example,
|
| 735 |
+
* if the metric value type is unsigned and the computed metric value is negative
|
| 736 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
|
| 737 |
+
* \p eventIdArray or \p eventValueArray is NULL
|
| 738 |
+
*/
|
| 739 |
+
CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
|
| 740 |
+
CUpti_MetricID metric,
|
| 741 |
+
size_t eventIdArraySizeBytes,
|
| 742 |
+
CUpti_EventID *eventIdArray,
|
| 743 |
+
size_t eventValueArraySizeBytes,
|
| 744 |
+
uint64_t *eventValueArray,
|
| 745 |
+
uint64_t timeDuration,
|
| 746 |
+
CUpti_MetricValue *metricValue);
|
| 747 |
+
|
| 748 |
+
/**
|
| 749 |
+
* \brief Calculate the value for a metric.
|
| 750 |
+
*
|
| 751 |
+
* Use the events and properties collected for a metric to calculate
|
| 752 |
+
* the metric value. Metric value evaluation depends on the evaluation
|
| 753 |
+
* mode \ref CUpti_MetricEvaluationMode that the metric supports. If
|
| 754 |
+
* a metric has evaluation mode as
|
| 755 |
+
* CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, then it assumes that the
|
| 756 |
+
* input event value is for one domain instance. If a metric has
|
| 757 |
+
* evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, it
|
| 758 |
+
* assumes that input event values are normalized to represent all
|
| 759 |
+
* domain instances on a device. For the most accurate metric
|
| 760 |
+
* collection, the events required for the metric should be collected
|
| 761 |
+
* for all profiled domain instances. For example, to collect all
|
| 762 |
+
* instances of an event, set the
|
| 763 |
+
* CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
|
| 764 |
+
* the group containing the event to 1. The normalized value for the
|
| 765 |
+
* event is then: (\p sum_event_values * \p totalInstanceCount) / \p
|
| 766 |
+
* instanceCount, where \p sum_event_values is the summation of the
|
| 767 |
+
* event values across all profiled domain instances, \p
|
| 768 |
+
* totalInstanceCount is obtained from querying
|
| 769 |
+
* CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
|
| 770 |
+
* is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
|
| 771 |
+
* CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
|
| 772 |
+
*
|
| 773 |
+
* \param metric The metric ID
|
| 774 |
+
* \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
|
| 775 |
+
* \param eventIdArray The event IDs required to calculate \p metric
|
| 776 |
+
* \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
|
| 777 |
+
* \param eventValueArray The normalized event values required to
|
| 778 |
+
* calculate \p metric. The values must be order to match the order of
|
| 779 |
+
* events in \p eventIdArray
|
| 780 |
+
* \param propIdArraySizeBytes The size of \p propIdArray in bytes
|
| 781 |
+
* \param propIdArray The metric property IDs required to calculate \p metric
|
| 782 |
+
* \param propValueArraySizeBytes The size of \p propValueArray in bytes
|
| 783 |
+
* \param propValueArray The metric property values required to
|
| 784 |
+
* calculate \p metric. The values must be order to match the order of
|
| 785 |
+
* metric properties in \p propIdArray
|
| 786 |
+
* \param metricValue Returns the value for the metric
|
| 787 |
+
*
|
| 788 |
+
* \retval CUPTI_SUCCESS
|
| 789 |
+
* \retval CUPTI_ERROR_NOT_INITIALIZED
|
| 790 |
+
* \retval CUPTI_ERROR_INVALID_METRIC_ID
|
| 791 |
+
* \retval CUPTI_ERROR_INVALID_OPERATION
|
| 792 |
+
* \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
|
| 793 |
+
* eventIdArray does not contain all the events needed for metric
|
| 794 |
+
* \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
|
| 795 |
+
* event values required for the metric is CUPTI_EVENT_OVERFLOW
|
| 796 |
+
* \retval CUPTI_ERROR_NOT_COMPATIBLE if the computed metric value
|
| 797 |
+
* cannot be represented in the metric's value type. For example,
|
| 798 |
+
* if the metric value type is unsigned and the computed metric value is negative
|
| 799 |
+
* \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
|
| 800 |
+
* \p eventIdArray or \p eventValueArray is NULL
|
| 801 |
+
*/
|
| 802 |
+
CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
|
| 803 |
+
size_t eventIdArraySizeBytes,
|
| 804 |
+
CUpti_EventID *eventIdArray,
|
| 805 |
+
size_t eventValueArraySizeBytes,
|
| 806 |
+
uint64_t *eventValueArray,
|
| 807 |
+
size_t propIdArraySizeBytes,
|
| 808 |
+
CUpti_MetricPropertyID *propIdArray,
|
| 809 |
+
size_t propValueArraySizeBytes,
|
| 810 |
+
uint64_t *propValueArray,
|
| 811 |
+
CUpti_MetricValue *metricValue);
|
| 812 |
+
|
| 813 |
+
/** @} */ /* END CUPTI_METRIC_API */
|
| 814 |
+
|
| 815 |
+
#if defined(__GNUC__) && defined(CUPTI_LIB)
|
| 816 |
+
#pragma GCC visibility pop
|
| 817 |
+
#endif
|
| 818 |
+
|
| 819 |
+
#if defined(__cplusplus)
|
| 820 |
+
}
|
| 821 |
+
#endif
|
| 822 |
+
|
| 823 |
+
#endif /*_CUPTI_METRIC_H_*/
|
| 824 |
+
|
| 825 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// This file is generated. Any changes you make will be lost during the next clean build.
|
| 2 |
+
|
| 3 |
+
// CUDA public interface, for type definitions and api function prototypes
|
| 4 |
+
#include "cuda_gl_interop.h"
|
| 5 |
+
|
| 6 |
+
// *************************************************************************
|
| 7 |
+
// Definitions of structs to hold parameters for each function
|
| 8 |
+
// *************************************************************************
|
| 9 |
+
|
| 10 |
+
// Currently used parameter trace structures
|
| 11 |
+
typedef struct cudaGLGetDevices_v4010_params_st {
|
| 12 |
+
unsigned int *pCudaDeviceCount;
|
| 13 |
+
int *pCudaDevices;
|
| 14 |
+
unsigned int cudaDeviceCount;
|
| 15 |
+
enum cudaGLDeviceList deviceList;
|
| 16 |
+
} cudaGLGetDevices_v4010_params;
|
| 17 |
+
|
| 18 |
+
typedef struct cudaGraphicsGLRegisterImage_v3020_params_st {
|
| 19 |
+
struct cudaGraphicsResource **resource;
|
| 20 |
+
GLuint image;
|
| 21 |
+
GLenum target;
|
| 22 |
+
unsigned int flags;
|
| 23 |
+
} cudaGraphicsGLRegisterImage_v3020_params;
|
| 24 |
+
|
| 25 |
+
typedef struct cudaGraphicsGLRegisterBuffer_v3020_params_st {
|
| 26 |
+
struct cudaGraphicsResource **resource;
|
| 27 |
+
GLuint buffer;
|
| 28 |
+
unsigned int flags;
|
| 29 |
+
} cudaGraphicsGLRegisterBuffer_v3020_params;
|
| 30 |
+
|
| 31 |
+
typedef struct cudaGLSetGLDevice_v3020_params_st {
|
| 32 |
+
int device;
|
| 33 |
+
} cudaGLSetGLDevice_v3020_params;
|
| 34 |
+
|
| 35 |
+
typedef struct cudaGLRegisterBufferObject_v3020_params_st {
|
| 36 |
+
GLuint bufObj;
|
| 37 |
+
} cudaGLRegisterBufferObject_v3020_params;
|
| 38 |
+
|
| 39 |
+
typedef struct cudaGLMapBufferObject_v3020_params_st {
|
| 40 |
+
void **devPtr;
|
| 41 |
+
GLuint bufObj;
|
| 42 |
+
} cudaGLMapBufferObject_v3020_params;
|
| 43 |
+
|
| 44 |
+
typedef struct cudaGLUnmapBufferObject_v3020_params_st {
|
| 45 |
+
GLuint bufObj;
|
| 46 |
+
} cudaGLUnmapBufferObject_v3020_params;
|
| 47 |
+
|
| 48 |
+
typedef struct cudaGLUnregisterBufferObject_v3020_params_st {
|
| 49 |
+
GLuint bufObj;
|
| 50 |
+
} cudaGLUnregisterBufferObject_v3020_params;
|
| 51 |
+
|
| 52 |
+
typedef struct cudaGLSetBufferObjectMapFlags_v3020_params_st {
|
| 53 |
+
GLuint bufObj;
|
| 54 |
+
unsigned int flags;
|
| 55 |
+
} cudaGLSetBufferObjectMapFlags_v3020_params;
|
| 56 |
+
|
| 57 |
+
typedef struct cudaGLMapBufferObjectAsync_v3020_params_st {
|
| 58 |
+
void **devPtr;
|
| 59 |
+
GLuint bufObj;
|
| 60 |
+
cudaStream_t stream;
|
| 61 |
+
} cudaGLMapBufferObjectAsync_v3020_params;
|
| 62 |
+
|
| 63 |
+
typedef struct cudaGLUnmapBufferObjectAsync_v3020_params_st {
|
| 64 |
+
GLuint bufObj;
|
| 65 |
+
cudaStream_t stream;
|
| 66 |
+
} cudaGLUnmapBufferObjectAsync_v3020_params;
|
| 67 |
+
|
| 68 |
+
// Parameter trace structures for removed functions
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
// End of parameter trace structures
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py
ADDED
|
File without changes
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (222 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/common_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAEGL_H
|
| 51 |
+
#define CUDAEGL_H
|
| 52 |
+
|
| 53 |
+
#include "cuda.h"
|
| 54 |
+
#include "EGL/egl.h"
|
| 55 |
+
#include "EGL/eglext.h"
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
#ifdef CUDA_FORCE_API_VERSION
|
| 59 |
+
#error "CUDA_FORCE_API_VERSION is no longer supported."
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
#ifdef __cplusplus
|
| 63 |
+
extern "C" {
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
/**
|
| 67 |
+
* \addtogroup CUDA_TYPES
|
| 68 |
+
* @{
|
| 69 |
+
*/
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Maximum number of planes per frame
|
| 73 |
+
*/
|
| 74 |
+
#define MAX_PLANES 3
|
| 75 |
+
|
| 76 |
+
/**
|
| 77 |
+
* CUDA EglFrame type - array or pointer
|
| 78 |
+
*/
|
| 79 |
+
typedef enum CUeglFrameType_enum {
|
| 80 |
+
CU_EGL_FRAME_TYPE_ARRAY = 0, /**< Frame type CUDA array */
|
| 81 |
+
CU_EGL_FRAME_TYPE_PITCH = 1, /**< Frame type pointer */
|
| 82 |
+
} CUeglFrameType;
|
| 83 |
+
|
| 84 |
+
/**
|
| 85 |
+
* Indicates that timeout for ::cuEGLStreamConsumerAcquireFrame is infinite.
|
| 86 |
+
*/
|
| 87 |
+
#define CUDA_EGL_INFINITE_TIMEOUT 0xFFFFFFFF
|
| 88 |
+
|
| 89 |
+
/**
|
| 90 |
+
* Resource location flags- sysmem or vidmem
|
| 91 |
+
*
|
| 92 |
+
* For CUDA context on iGPU, since video and system memory are equivalent -
|
| 93 |
+
* these flags will not have an effect on the execution.
|
| 94 |
+
*
|
| 95 |
+
* For CUDA context on dGPU, applications can use the flag ::CUeglResourceLocationFlags
|
| 96 |
+
* to give a hint about the desired location.
|
| 97 |
+
*
|
| 98 |
+
* ::CU_EGL_RESOURCE_LOCATION_SYSMEM - the frame data is made resident on the system memory
|
| 99 |
+
* to be accessed by CUDA.
|
| 100 |
+
*
|
| 101 |
+
* ::CU_EGL_RESOURCE_LOCATION_VIDMEM - the frame data is made resident on the dedicated
|
| 102 |
+
* video memory to be accessed by CUDA.
|
| 103 |
+
*
|
| 104 |
+
* There may be an additional latency due to new allocation and data migration,
|
| 105 |
+
* if the frame is produced on a different memory.
|
| 106 |
+
|
| 107 |
+
*/
|
| 108 |
+
typedef enum CUeglResourceLocationFlags_enum {
|
| 109 |
+
CU_EGL_RESOURCE_LOCATION_SYSMEM = 0x00, /**< Resource location sysmem */
|
| 110 |
+
CU_EGL_RESOURCE_LOCATION_VIDMEM = 0x01 /**< Resource location vidmem */
|
| 111 |
+
} CUeglResourceLocationFlags;
|
| 112 |
+
|
| 113 |
+
/**
|
| 114 |
+
* CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
|
| 115 |
+
* Three channel formats are currently not supported for ::CU_EGL_FRAME_TYPE_ARRAY
|
| 116 |
+
*/
|
| 117 |
+
typedef enum CUeglColorFormat_enum {
|
| 118 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR = 0x00, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 119 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR = 0x01, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
|
| 120 |
+
CU_EGL_COLOR_FORMAT_YUV422_PLANAR = 0x02, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 121 |
+
CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR = 0x03, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
|
| 122 |
+
CU_EGL_COLOR_FORMAT_RGB = 0x04, /**< R/G/B three channels in one surface with BGR byte ordering. Only pitch linear format supported. */
|
| 123 |
+
CU_EGL_COLOR_FORMAT_BGR = 0x05, /**< R/G/B three channels in one surface with RGB byte ordering. Only pitch linear format supported. */
|
| 124 |
+
CU_EGL_COLOR_FORMAT_ARGB = 0x06, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
|
| 125 |
+
CU_EGL_COLOR_FORMAT_RGBA = 0x07, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
|
| 126 |
+
CU_EGL_COLOR_FORMAT_L = 0x08, /**< single luminance channel in one surface. */
|
| 127 |
+
CU_EGL_COLOR_FORMAT_R = 0x09, /**< single color channel in one surface. */
|
| 128 |
+
CU_EGL_COLOR_FORMAT_YUV444_PLANAR = 0x0A, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 129 |
+
CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR = 0x0B, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
|
| 130 |
+
CU_EGL_COLOR_FORMAT_YUYV_422 = 0x0C, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 131 |
+
CU_EGL_COLOR_FORMAT_UYVY_422 = 0x0D, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 132 |
+
CU_EGL_COLOR_FORMAT_ABGR = 0x0E, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
|
| 133 |
+
CU_EGL_COLOR_FORMAT_BGRA = 0x0F, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
|
| 134 |
+
CU_EGL_COLOR_FORMAT_A = 0x10, /**< Alpha color format - one channel in one surface. */
|
| 135 |
+
CU_EGL_COLOR_FORMAT_RG = 0x11, /**< R/G color format - two channels in one surface with GR byte ordering */
|
| 136 |
+
CU_EGL_COLOR_FORMAT_AYUV = 0x12, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 137 |
+
CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR = 0x13, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 138 |
+
CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR = 0x14, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 139 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR = 0x15, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 140 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR = 0x16, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 141 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR = 0x17, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 142 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR = 0x18, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 143 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR = 0x19, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 144 |
+
CU_EGL_COLOR_FORMAT_VYUY_ER = 0x1A, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 145 |
+
CU_EGL_COLOR_FORMAT_UYVY_ER = 0x1B, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 146 |
+
CU_EGL_COLOR_FORMAT_YUYV_ER = 0x1C, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 147 |
+
CU_EGL_COLOR_FORMAT_YVYU_ER = 0x1D, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 148 |
+
CU_EGL_COLOR_FORMAT_YUV_ER = 0x1E, /**< Extended Range Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
|
| 149 |
+
CU_EGL_COLOR_FORMAT_YUVA_ER = 0x1F, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 150 |
+
CU_EGL_COLOR_FORMAT_AYUV_ER = 0x20, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 151 |
+
CU_EGL_COLOR_FORMAT_YUV444_PLANAR_ER = 0x21, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 152 |
+
CU_EGL_COLOR_FORMAT_YUV422_PLANAR_ER = 0x22, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 153 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR_ER = 0x23, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 154 |
+
CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR_ER = 0x24, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 155 |
+
CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR_ER = 0x25, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 156 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_ER = 0x26, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 157 |
+
CU_EGL_COLOR_FORMAT_YVU444_PLANAR_ER = 0x27, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 158 |
+
CU_EGL_COLOR_FORMAT_YVU422_PLANAR_ER = 0x28, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 159 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR_ER = 0x29, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 160 |
+
CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR_ER = 0x2A, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 161 |
+
CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR_ER = 0x2B, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 162 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_ER = 0x2C, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 163 |
+
CU_EGL_COLOR_FORMAT_BAYER_RGGB = 0x2D, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
|
| 164 |
+
CU_EGL_COLOR_FORMAT_BAYER_BGGR = 0x2E, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
|
| 165 |
+
CU_EGL_COLOR_FORMAT_BAYER_GRBG = 0x2F, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
|
| 166 |
+
CU_EGL_COLOR_FORMAT_BAYER_GBRG = 0x30, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
|
| 167 |
+
CU_EGL_COLOR_FORMAT_BAYER10_RGGB = 0x31, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 168 |
+
CU_EGL_COLOR_FORMAT_BAYER10_BGGR = 0x32, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 169 |
+
CU_EGL_COLOR_FORMAT_BAYER10_GRBG = 0x33, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 170 |
+
CU_EGL_COLOR_FORMAT_BAYER10_GBRG = 0x34, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 171 |
+
CU_EGL_COLOR_FORMAT_BAYER12_RGGB = 0x35, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 172 |
+
CU_EGL_COLOR_FORMAT_BAYER12_BGGR = 0x36, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 173 |
+
CU_EGL_COLOR_FORMAT_BAYER12_GRBG = 0x37, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 174 |
+
CU_EGL_COLOR_FORMAT_BAYER12_GBRG = 0x38, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 175 |
+
CU_EGL_COLOR_FORMAT_BAYER14_RGGB = 0x39, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 176 |
+
CU_EGL_COLOR_FORMAT_BAYER14_BGGR = 0x3A, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 177 |
+
CU_EGL_COLOR_FORMAT_BAYER14_GRBG = 0x3B, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 178 |
+
CU_EGL_COLOR_FORMAT_BAYER14_GBRG = 0x3C, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 179 |
+
CU_EGL_COLOR_FORMAT_BAYER20_RGGB = 0x3D, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 180 |
+
CU_EGL_COLOR_FORMAT_BAYER20_BGGR = 0x3E, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 181 |
+
CU_EGL_COLOR_FORMAT_BAYER20_GRBG = 0x3F, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 182 |
+
CU_EGL_COLOR_FORMAT_BAYER20_GBRG = 0x40, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 183 |
+
CU_EGL_COLOR_FORMAT_YVU444_PLANAR = 0x41, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 184 |
+
CU_EGL_COLOR_FORMAT_YVU422_PLANAR = 0x42, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 185 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR = 0x43, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 186 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_RGGB = 0x44, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
|
| 187 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_BGGR = 0x45, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
|
| 188 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_GRBG = 0x46, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
|
| 189 |
+
CU_EGL_COLOR_FORMAT_BAYER_ISP_GBRG = 0x47, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
|
| 190 |
+
CU_EGL_COLOR_FORMAT_BAYER_BCCR = 0x48, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
|
| 191 |
+
CU_EGL_COLOR_FORMAT_BAYER_RCCB = 0x49, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
|
| 192 |
+
CU_EGL_COLOR_FORMAT_BAYER_CRBC = 0x4A, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
|
| 193 |
+
CU_EGL_COLOR_FORMAT_BAYER_CBRC = 0x4B, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
|
| 194 |
+
CU_EGL_COLOR_FORMAT_BAYER10_CCCC = 0x4C, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 195 |
+
CU_EGL_COLOR_FORMAT_BAYER12_BCCR = 0x4D, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 196 |
+
CU_EGL_COLOR_FORMAT_BAYER12_RCCB = 0x4E, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 197 |
+
CU_EGL_COLOR_FORMAT_BAYER12_CRBC = 0x4F, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 198 |
+
CU_EGL_COLOR_FORMAT_BAYER12_CBRC = 0x50, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 199 |
+
CU_EGL_COLOR_FORMAT_BAYER12_CCCC = 0x51, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 200 |
+
CU_EGL_COLOR_FORMAT_Y = 0x52, /**< Color format for single Y plane. */
|
| 201 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020 = 0x53, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 202 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020 = 0x54, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 203 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR_2020 = 0x55, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height= 1/2 Y height. */
|
| 204 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR_2020 = 0x56, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height
|
| 205 |
+
= 1/2 Y height. */
|
| 206 |
+
CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709 = 0x57, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 207 |
+
CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709 = 0x58, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 208 |
+
CU_EGL_COLOR_FORMAT_YUV420_PLANAR_709 = 0x59, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height
|
| 209 |
+
= 1/2 Y height. */
|
| 210 |
+
CU_EGL_COLOR_FORMAT_YVU420_PLANAR_709 = 0x5A, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 211 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709 = 0x5B, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 212 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 = 0x5C, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 213 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_2020 = 0x5D, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 214 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR = 0x5E, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 215 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_709 = 0x5F, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 216 |
+
CU_EGL_COLOR_FORMAT_Y_ER = 0x60, /**< Extended Range Color format for single Y plane. */
|
| 217 |
+
CU_EGL_COLOR_FORMAT_Y_709_ER = 0x61, /**< Extended Range Color format for single Y plane. */
|
| 218 |
+
CU_EGL_COLOR_FORMAT_Y10_ER = 0x62, /**< Extended Range Color format for single Y10 plane. */
|
| 219 |
+
CU_EGL_COLOR_FORMAT_Y10_709_ER = 0x63, /**< Extended Range Color format for single Y10 plane. */
|
| 220 |
+
CU_EGL_COLOR_FORMAT_Y12_ER = 0x64, /**< Extended Range Color format for single Y12 plane. */
|
| 221 |
+
CU_EGL_COLOR_FORMAT_Y12_709_ER = 0x65, /**< Extended Range Color format for single Y12 plane. */
|
| 222 |
+
CU_EGL_COLOR_FORMAT_YUVA = 0x66, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 223 |
+
CU_EGL_COLOR_FORMAT_YUV = 0x67, /**< Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
|
| 224 |
+
CU_EGL_COLOR_FORMAT_YVYU = 0x68, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 225 |
+
CU_EGL_COLOR_FORMAT_VYUY = 0x69, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 226 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER = 0x6A, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 227 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709_ER = 0x6B, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 228 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_ER = 0x6C, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 229 |
+
CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_709_ER = 0x6D, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 230 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_ER = 0x6E, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 231 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_709_ER = 0x6F, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 232 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_ER = 0x70, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 233 |
+
CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_709_ER = 0x71, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 234 |
+
CU_EGL_COLOR_FORMAT_MAX
|
| 235 |
+
} CUeglColorFormat;
|
| 236 |
+
|
| 237 |
+
/**
|
| 238 |
+
* CUDA EGLFrame structure Descriptor - structure defining one frame of EGL.
|
| 239 |
+
*
|
| 240 |
+
* Each frame may contain one or more planes depending on whether the surface * is Multiplanar or not.
|
| 241 |
+
*/
|
| 242 |
+
typedef struct CUeglFrame_st {
|
| 243 |
+
union {
|
| 244 |
+
CUarray pArray[MAX_PLANES]; /**< Array of CUarray corresponding to each plane*/
|
| 245 |
+
void* pPitch[MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
|
| 246 |
+
} frame;
|
| 247 |
+
unsigned int width; /**< Width of first plane */
|
| 248 |
+
unsigned int height; /**< Height of first plane */
|
| 249 |
+
unsigned int depth; /**< Depth of first plane */
|
| 250 |
+
unsigned int pitch; /**< Pitch of first plane */
|
| 251 |
+
unsigned int planeCount; /**< Number of planes */
|
| 252 |
+
unsigned int numChannels; /**< Number of channels for the plane */
|
| 253 |
+
CUeglFrameType frameType; /**< Array or Pitch */
|
| 254 |
+
CUeglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
|
| 255 |
+
CUarray_format cuFormat; /**< CUDA Array Format*/
|
| 256 |
+
} CUeglFrame_v1;
|
| 257 |
+
typedef CUeglFrame_v1 CUeglFrame;
|
| 258 |
+
|
| 259 |
+
/**
|
| 260 |
+
* CUDA EGLSream Connection
|
| 261 |
+
*/
|
| 262 |
+
typedef struct CUeglStreamConnection_st* CUeglStreamConnection;
|
| 263 |
+
|
| 264 |
+
/** @} */ /* END CUDA_TYPES */
|
| 265 |
+
|
| 266 |
+
/**
|
| 267 |
+
* \file cudaEGL.h
|
| 268 |
+
* \brief Header file for the EGL interoperability functions of the
|
| 269 |
+
* low-level CUDA driver application programming interface.
|
| 270 |
+
*/
|
| 271 |
+
|
| 272 |
+
/**
|
| 273 |
+
* \defgroup CUDA_EGL EGL Interoperability
|
| 274 |
+
* \ingroup CUDA_DRIVER
|
| 275 |
+
*
|
| 276 |
+
* ___MANBRIEF___ EGL interoperability functions of the low-level CUDA
|
| 277 |
+
* driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
|
| 278 |
+
*
|
| 279 |
+
* This section describes the EGL interoperability functions of the
|
| 280 |
+
* low-level CUDA driver application programming interface.
|
| 281 |
+
*
|
| 282 |
+
* @{
|
| 283 |
+
*/
|
| 284 |
+
|
| 285 |
+
/**
|
| 286 |
+
* \brief Registers an EGL image
|
| 287 |
+
*
|
| 288 |
+
* Registers the EGLImageKHR specified by \p image for access by
|
| 289 |
+
* CUDA. A handle to the registered object is returned as \p pCudaResource.
|
| 290 |
+
* Additional Mapping/Unmapping is not required for the registered resource and
|
| 291 |
+
* ::cuGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
|
| 292 |
+
*
|
| 293 |
+
* The application will be responsible for synchronizing access to shared objects.
|
| 294 |
+
* The application must ensure that any pending operation which access the objects have completed
|
| 295 |
+
* before passing control to CUDA. This may be accomplished by issuing and waiting for
|
| 296 |
+
* glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
|
| 297 |
+
* The application will be also responsible for ensuring that any pending operation on the
|
| 298 |
+
* registered CUDA resource has completed prior to executing subsequent commands in other APIs
|
| 299 |
+
* accesing the same memory objects.
|
| 300 |
+
* This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
|
| 301 |
+
*
|
| 302 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 303 |
+
*
|
| 304 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
|
| 305 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 306 |
+
* read from and written to by CUDA. This is the default value.
|
| 307 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
|
| 308 |
+
* will not write to this resource.
|
| 309 |
+
* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
|
| 310 |
+
* CUDA will not read from this resource and will write over the
|
| 311 |
+
* entire contents of the resource, so none of the data previously
|
| 312 |
+
* stored in the resource will be preserved.
|
| 313 |
+
*
|
| 314 |
+
* The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
|
| 315 |
+
* typedef void* EGLImageKHR
|
| 316 |
+
*
|
| 317 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 318 |
+
* \param image - An EGLImageKHR image which can be used to create target resource.
|
| 319 |
+
* \param flags - Map flags
|
| 320 |
+
*
|
| 321 |
+
* \return
|
| 322 |
+
* ::CUDA_SUCCESS,
|
| 323 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 324 |
+
* ::CUDA_ERROR_ALREADY_MAPPED,
|
| 325 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 326 |
+
*
|
| 327 |
+
* \sa ::cuGraphicsEGLRegisterImage, ::cuGraphicsUnregisterResource,
|
| 328 |
+
* ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
|
| 329 |
+
* ::cuGraphicsUnmapResources,
|
| 330 |
+
* ::cudaGraphicsEGLRegisterImage
|
| 331 |
+
*/
|
| 332 |
+
CUresult CUDAAPI cuGraphicsEGLRegisterImage(CUgraphicsResource *pCudaResource, EGLImageKHR image, unsigned int flags);
|
| 333 |
+
|
| 334 |
+
/**
|
| 335 |
+
* \brief Connect CUDA to EGLStream as a consumer.
|
| 336 |
+
*
|
| 337 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p stream.
|
| 338 |
+
*
|
| 339 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 340 |
+
* API to another.
|
| 341 |
+
*
|
| 342 |
+
* \param conn - Pointer to the returned connection handle
|
| 343 |
+
* \param stream - EGLStreamKHR handle
|
| 344 |
+
*
|
| 345 |
+
* \return
|
| 346 |
+
* ::CUDA_SUCCESS,
|
| 347 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 348 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 349 |
+
*
|
| 350 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 351 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 352 |
+
* ::cudaEGLStreamConsumerConnect
|
| 353 |
+
*/
|
| 354 |
+
CUresult CUDAAPI cuEGLStreamConsumerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream);
|
| 355 |
+
|
| 356 |
+
/**
|
| 357 |
+
* \brief Connect CUDA to EGLStream as a consumer with given flags.
|
| 358 |
+
*
|
| 359 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p stream with specified \p flags defined by CUeglResourceLocationFlags.
|
| 360 |
+
*
|
| 361 |
+
* The flags specify whether the consumer wants to access frames from system memory or video memory.
|
| 362 |
+
* Default is ::CU_EGL_RESOURCE_LOCATION_VIDMEM.
|
| 363 |
+
*
|
| 364 |
+
* \param conn - Pointer to the returned connection handle
|
| 365 |
+
* \param stream - EGLStreamKHR handle
|
| 366 |
+
* \param flags - Flags denote intended location - system or video.
|
| 367 |
+
*
|
| 368 |
+
* \return
|
| 369 |
+
* ::CUDA_SUCCESS,
|
| 370 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 371 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 372 |
+
*
|
| 373 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 374 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 375 |
+
* ::cudaEGLStreamConsumerConnectWithFlags
|
| 376 |
+
*/
|
| 377 |
+
|
| 378 |
+
CUresult CUDAAPI cuEGLStreamConsumerConnectWithFlags(CUeglStreamConnection *conn, EGLStreamKHR stream, unsigned int flags);
|
| 379 |
+
|
| 380 |
+
/**
|
| 381 |
+
* \brief Disconnect CUDA as a consumer to EGLStream .
|
| 382 |
+
*
|
| 383 |
+
* Disconnect CUDA as a consumer to EGLStreamKHR.
|
| 384 |
+
*
|
| 385 |
+
* \param conn - Conection to disconnect.
|
| 386 |
+
*
|
| 387 |
+
* \return
|
| 388 |
+
* ::CUDA_SUCCESS,
|
| 389 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 390 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 391 |
+
*
|
| 392 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 393 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 394 |
+
* ::cudaEGLStreamConsumerDisconnect
|
| 395 |
+
*/
|
| 396 |
+
CUresult CUDAAPI cuEGLStreamConsumerDisconnect(CUeglStreamConnection *conn);
|
| 397 |
+
|
| 398 |
+
/**
|
| 399 |
+
* \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
|
| 400 |
+
*
|
| 401 |
+
* Acquire an image frame from EGLStreamKHR. This API can also acquire an old frame presented
|
| 402 |
+
* by the producer unless explicitly disabled by setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE
|
| 403 |
+
* during stream initialization. By default, EGLStream is created with this flag set to EGL_TRUE.
|
| 404 |
+
* ::cuGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
|
| 405 |
+
* ::CUeglFrame.
|
| 406 |
+
*
|
| 407 |
+
* \param conn - Connection on which to acquire
|
| 408 |
+
* \param pCudaResource - CUDA resource on which the stream frame will be mapped for use.
|
| 409 |
+
* \param pStream - CUDA stream for synchronization and any data migrations
|
| 410 |
+
* implied by ::CUeglResourceLocationFlags.
|
| 411 |
+
* \param timeout - Desired timeout in usec for a new frame to be acquired.
|
| 412 |
+
* If set as ::CUDA_EGL_INFINITE_TIMEOUT, acquire waits infinitely.
|
| 413 |
+
* After timeout occurs CUDA consumer tries to acquire an old frame
|
| 414 |
+
* if available and EGL_SUPPORT_REUSE_NV flag is set.
|
| 415 |
+
*
|
| 416 |
+
* \return
|
| 417 |
+
* ::CUDA_SUCCESS,
|
| 418 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 419 |
+
* ::CUDA_ERROR_LAUNCH_TIMEOUT,
|
| 420 |
+
*
|
| 421 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 422 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 423 |
+
* ::cudaEGLStreamConsumerAcquireFrame
|
| 424 |
+
*/
|
| 425 |
+
CUresult CUDAAPI cuEGLStreamConsumerAcquireFrame(CUeglStreamConnection *conn,
|
| 426 |
+
CUgraphicsResource *pCudaResource, CUstream *pStream, unsigned int timeout);
|
| 427 |
+
/**
|
| 428 |
+
* \brief Releases the last frame acquired from the EGLStream.
|
| 429 |
+
*
|
| 430 |
+
* Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
|
| 431 |
+
* If EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE, at the time of EGL creation
|
| 432 |
+
* this API doesn't release the last frame acquired on the EGLStream.
|
| 433 |
+
* By default, EGLStream is created with this flag set to EGL_TRUE.
|
| 434 |
+
*
|
| 435 |
+
* \param conn - Connection on which to release
|
| 436 |
+
* \param pCudaResource - CUDA resource whose corresponding frame is to be released
|
| 437 |
+
* \param pStream - CUDA stream on which release will be done.
|
| 438 |
+
*
|
| 439 |
+
* \return
|
| 440 |
+
* ::CUDA_SUCCESS,
|
| 441 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 442 |
+
*
|
| 443 |
+
* \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
|
| 444 |
+
* ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
|
| 445 |
+
* ::cudaEGLStreamConsumerReleaseFrame
|
| 446 |
+
*/
|
| 447 |
+
CUresult CUDAAPI cuEGLStreamConsumerReleaseFrame(CUeglStreamConnection *conn,
|
| 448 |
+
CUgraphicsResource pCudaResource, CUstream *pStream);
|
| 449 |
+
|
| 450 |
+
/**
|
| 451 |
+
* \brief Connect CUDA to EGLStream as a producer.
|
| 452 |
+
*
|
| 453 |
+
* Connect CUDA as a producer to EGLStreamKHR specified by \p stream.
|
| 454 |
+
*
|
| 455 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 456 |
+
* API to another.
|
| 457 |
+
*
|
| 458 |
+
* \param conn - Pointer to the returned connection handle
|
| 459 |
+
* \param stream - EGLStreamKHR handle
|
| 460 |
+
* \param width - width of the image to be submitted to the stream
|
| 461 |
+
* \param height - height of the image to be submitted to the stream
|
| 462 |
+
*
|
| 463 |
+
* \return
|
| 464 |
+
* ::CUDA_SUCCESS,
|
| 465 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 466 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 467 |
+
*
|
| 468 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 469 |
+
* ::cuEGLStreamProducerPresentFrame,
|
| 470 |
+
* ::cudaEGLStreamProducerConnect
|
| 471 |
+
*/
|
| 472 |
+
CUresult CUDAAPI cuEGLStreamProducerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream,
|
| 473 |
+
EGLint width, EGLint height);
|
| 474 |
+
|
| 475 |
+
/**
|
| 476 |
+
* \brief Disconnect CUDA as a producer to EGLStream .
|
| 477 |
+
*
|
| 478 |
+
* Disconnect CUDA as a producer to EGLStreamKHR.
|
| 479 |
+
*
|
| 480 |
+
* \param conn - Conection to disconnect.
|
| 481 |
+
*
|
| 482 |
+
* \return
|
| 483 |
+
* ::CUDA_SUCCESS,
|
| 484 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 485 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 486 |
+
*
|
| 487 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 488 |
+
* ::cuEGLStreamProducerPresentFrame,
|
| 489 |
+
* ::cudaEGLStreamProducerDisconnect
|
| 490 |
+
*/
|
| 491 |
+
CUresult CUDAAPI cuEGLStreamProducerDisconnect(CUeglStreamConnection *conn);
|
| 492 |
+
|
| 493 |
+
/**
|
| 494 |
+
* \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
|
| 495 |
+
*
|
| 496 |
+
* When a frame is presented by the producer, it gets associated with the EGLStream
|
| 497 |
+
* and thus it is illegal to free the frame before the producer is disconnected.
|
| 498 |
+
* If a frame is freed and reused it may lead to undefined behavior.
|
| 499 |
+
*
|
| 500 |
+
* If producer and consumer are on different GPUs (iGPU and dGPU) then frametype
|
| 501 |
+
* ::CU_EGL_FRAME_TYPE_ARRAY is not supported. ::CU_EGL_FRAME_TYPE_PITCH can be used for
|
| 502 |
+
* such cross-device applications.
|
| 503 |
+
*
|
| 504 |
+
* The ::CUeglFrame is defined as:
|
| 505 |
+
* \code
|
| 506 |
+
* typedef struct CUeglFrame_st {
|
| 507 |
+
* union {
|
| 508 |
+
* CUarray pArray[MAX_PLANES];
|
| 509 |
+
* void* pPitch[MAX_PLANES];
|
| 510 |
+
* } frame;
|
| 511 |
+
* unsigned int width;
|
| 512 |
+
* unsigned int height;
|
| 513 |
+
* unsigned int depth;
|
| 514 |
+
* unsigned int pitch;
|
| 515 |
+
* unsigned int planeCount;
|
| 516 |
+
* unsigned int numChannels;
|
| 517 |
+
* CUeglFrameType frameType;
|
| 518 |
+
* CUeglColorFormat eglColorFormat;
|
| 519 |
+
* CUarray_format cuFormat;
|
| 520 |
+
* } CUeglFrame;
|
| 521 |
+
* \endcode
|
| 522 |
+
*
|
| 523 |
+
* For ::CUeglFrame of type ::CU_EGL_FRAME_TYPE_PITCH, the application may present sub-region of a memory
|
| 524 |
+
* allocation. In that case, the pitched pointer will specify the start address of the sub-region in
|
| 525 |
+
* the allocation and corresponding ::CUeglFrame fields will specify the dimensions of the sub-region.
|
| 526 |
+
*
|
| 527 |
+
* \param conn - Connection on which to present the CUDA array
|
| 528 |
+
* \param eglframe - CUDA Eglstream Proucer Frame handle to be sent to the consumer over EglStream.
|
| 529 |
+
* \param pStream - CUDA stream on which to present the frame.
|
| 530 |
+
*
|
| 531 |
+
* \return
|
| 532 |
+
* ::CUDA_SUCCESS,
|
| 533 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 534 |
+
*
|
| 535 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 536 |
+
* ::cuEGLStreamProducerReturnFrame,
|
| 537 |
+
* ::cudaEGLStreamProducerPresentFrame
|
| 538 |
+
*/
|
| 539 |
+
CUresult CUDAAPI cuEGLStreamProducerPresentFrame(CUeglStreamConnection *conn,
|
| 540 |
+
CUeglFrame eglframe, CUstream *pStream);
|
| 541 |
+
|
| 542 |
+
/**
|
| 543 |
+
* \brief Return the CUDA eglFrame to the EGLStream released by the consumer.
|
| 544 |
+
*
|
| 545 |
+
* This API can potentially return CUDA_ERROR_LAUNCH_TIMEOUT if the consumer has not
|
| 546 |
+
* returned a frame to EGL stream. If timeout is returned the application can retry.
|
| 547 |
+
*
|
| 548 |
+
* \param conn - Connection on which to return
|
| 549 |
+
* \param eglframe - CUDA Eglstream Proucer Frame handle returned from the consumer over EglStream.
|
| 550 |
+
* \param pStream - CUDA stream on which to return the frame.
|
| 551 |
+
*
|
| 552 |
+
* \return
|
| 553 |
+
* ::CUDA_SUCCESS,
|
| 554 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 555 |
+
* ::CUDA_ERROR_LAUNCH_TIMEOUT
|
| 556 |
+
*
|
| 557 |
+
* \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
|
| 558 |
+
* ::cuEGLStreamProducerPresentFrame,
|
| 559 |
+
* ::cudaEGLStreamProducerReturnFrame
|
| 560 |
+
*/
|
| 561 |
+
CUresult CUDAAPI cuEGLStreamProducerReturnFrame(CUeglStreamConnection *conn,
|
| 562 |
+
CUeglFrame *eglframe, CUstream *pStream);
|
| 563 |
+
|
| 564 |
+
/**
|
| 565 |
+
* \brief Get an eglFrame through which to access a registered EGL graphics resource.
|
| 566 |
+
*
|
| 567 |
+
* Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
|
| 568 |
+
* \p resource may be accessed.
|
| 569 |
+
* This API can only be called for registered EGL graphics resources.
|
| 570 |
+
*
|
| 571 |
+
* The ::CUeglFrame is defined as:
|
| 572 |
+
* \code
|
| 573 |
+
* typedef struct CUeglFrame_st {
|
| 574 |
+
* union {
|
| 575 |
+
* CUarray pArray[MAX_PLANES];
|
| 576 |
+
* void* pPitch[MAX_PLANES];
|
| 577 |
+
* } frame;
|
| 578 |
+
* unsigned int width;
|
| 579 |
+
* unsigned int height;
|
| 580 |
+
* unsigned int depth;
|
| 581 |
+
* unsigned int pitch;
|
| 582 |
+
* unsigned int planeCount;
|
| 583 |
+
* unsigned int numChannels;
|
| 584 |
+
* CUeglFrameType frameType;
|
| 585 |
+
* CUeglColorFormat eglColorFormat;
|
| 586 |
+
* CUarray_format cuFormat;
|
| 587 |
+
* } CUeglFrame;
|
| 588 |
+
* \endcode
|
| 589 |
+
*
|
| 590 |
+
* If \p resource is not registered then ::CUDA_ERROR_NOT_MAPPED is returned.
|
| 591 |
+
* *
|
| 592 |
+
* \param eglFrame - Returned eglFrame.
|
| 593 |
+
* \param resource - Registered resource to access.
|
| 594 |
+
* \param index - Index for cubemap surfaces.
|
| 595 |
+
* \param mipLevel - Mipmap level for the subresource to access.
|
| 596 |
+
*
|
| 597 |
+
* \return
|
| 598 |
+
* ::CUDA_SUCCESS,
|
| 599 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 600 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 601 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 602 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 603 |
+
* ::CUDA_ERROR_INVALID_HANDLE,
|
| 604 |
+
* ::CUDA_ERROR_NOT_MAPPED
|
| 605 |
+
*
|
| 606 |
+
* \sa
|
| 607 |
+
* ::cuGraphicsMapResources,
|
| 608 |
+
* ::cuGraphicsSubResourceGetMappedArray,
|
| 609 |
+
* ::cuGraphicsResourceGetMappedPointer,
|
| 610 |
+
* ::cudaGraphicsResourceGetMappedEglFrame
|
| 611 |
+
*/
|
| 612 |
+
CUresult CUDAAPI cuGraphicsResourceGetMappedEglFrame(CUeglFrame* eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
|
| 613 |
+
|
| 614 |
+
/**
|
| 615 |
+
* \brief Creates an event from EGLSync object
|
| 616 |
+
*
|
| 617 |
+
* Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
|
| 618 |
+
* via \p flags. Valid flags include:
|
| 619 |
+
* - ::CU_EVENT_DEFAULT: Default event creation flag.
|
| 620 |
+
* - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
|
| 621 |
+
* synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on
|
| 622 |
+
* an event created with this flag will block until the event has actually
|
| 623 |
+
* been completed.
|
| 624 |
+
*
|
| 625 |
+
* Once the \p eglSync gets destroyed, ::cuEventDestroy is the only API
|
| 626 |
+
* that can be invoked on the event.
|
| 627 |
+
*
|
| 628 |
+
* ::cuEventRecord and TimingData are not supported for events created from EGLSync.
|
| 629 |
+
*
|
| 630 |
+
* The EGLSyncKHR is an opaque handle to an EGL sync object.
|
| 631 |
+
* typedef void* EGLSyncKHR
|
| 632 |
+
*
|
| 633 |
+
* \param phEvent - Returns newly created event
|
| 634 |
+
* \param eglSync - Opaque handle to EGLSync object
|
| 635 |
+
* \param flags - Event creation flags
|
| 636 |
+
*
|
| 637 |
+
* \return
|
| 638 |
+
* ::CUDA_SUCCESS,
|
| 639 |
+
* ::CUDA_ERROR_DEINITIALIZED,
|
| 640 |
+
* ::CUDA_ERROR_NOT_INITIALIZED,
|
| 641 |
+
* ::CUDA_ERROR_INVALID_CONTEXT,
|
| 642 |
+
* ::CUDA_ERROR_INVALID_VALUE,
|
| 643 |
+
* ::CUDA_ERROR_OUT_OF_MEMORY
|
| 644 |
+
*
|
| 645 |
+
* \sa
|
| 646 |
+
* ::cuEventQuery,
|
| 647 |
+
* ::cuEventSynchronize,
|
| 648 |
+
* ::cuEventDestroy
|
| 649 |
+
*/
|
| 650 |
+
CUresult CUDAAPI cuEventCreateFromEGLSync(CUevent *phEvent, EGLSyncKHR eglSync, unsigned int flags);
|
| 651 |
+
|
| 652 |
+
/** @} */ /* END CUDA_EGL */
|
| 653 |
+
|
| 654 |
+
#ifdef __cplusplus
|
| 655 |
+
};
|
| 656 |
+
#endif
|
| 657 |
+
|
| 658 |
+
#endif
|
| 659 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h
ADDED
|
@@ -0,0 +1,642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_EGL_INTEROP_H__)
|
| 51 |
+
#define __CUDA_EGL_INTEROP_H__
|
| 52 |
+
|
| 53 |
+
#include "cuda_runtime_api.h"
|
| 54 |
+
#include "cuda_runtime.h"
|
| 55 |
+
#include "cudart_platform.h"
|
| 56 |
+
#include "EGL/egl.h"
|
| 57 |
+
#include "EGL/eglext.h"
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus)
|
| 60 |
+
extern "C" {
|
| 61 |
+
#endif /* __cplusplus */
|
| 62 |
+
|
| 63 |
+
/**
|
| 64 |
+
* \addtogroup CUDART_TYPES
|
| 65 |
+
* @{
|
| 66 |
+
*/
|
| 67 |
+
|
| 68 |
+
/**
|
| 69 |
+
* Maximum number of planes per frame
|
| 70 |
+
*/
|
| 71 |
+
#define CUDA_EGL_MAX_PLANES 3
|
| 72 |
+
|
| 73 |
+
/**
|
| 74 |
+
* CUDA EglFrame type - array or pointer
|
| 75 |
+
*/
|
| 76 |
+
typedef enum cudaEglFrameType_enum
|
| 77 |
+
{
|
| 78 |
+
cudaEglFrameTypeArray = 0, /**< Frame type CUDA array */
|
| 79 |
+
cudaEglFrameTypePitch = 1, /**< Frame type CUDA pointer */
|
| 80 |
+
} cudaEglFrameType;
|
| 81 |
+
|
| 82 |
+
/**
|
| 83 |
+
* Resource location flags- sysmem or vidmem
|
| 84 |
+
*
|
| 85 |
+
* For CUDA context on iGPU, since video and system memory are equivalent -
|
| 86 |
+
* these flags will not have an effect on the execution.
|
| 87 |
+
*
|
| 88 |
+
* For CUDA context on dGPU, applications can use the flag ::cudaEglResourceLocationFlags
|
| 89 |
+
* to give a hint about the desired location.
|
| 90 |
+
*
|
| 91 |
+
* ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory
|
| 92 |
+
* to be accessed by CUDA.
|
| 93 |
+
*
|
| 94 |
+
* ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated
|
| 95 |
+
* video memory to be accessed by CUDA.
|
| 96 |
+
*
|
| 97 |
+
* There may be an additional latency due to new allocation and data migration,
|
| 98 |
+
* if the frame is produced on a different memory.
|
| 99 |
+
*/
|
| 100 |
+
typedef enum cudaEglResourceLocationFlags_enum {
|
| 101 |
+
cudaEglResourceLocationSysmem = 0x00, /**< Resource location sysmem */
|
| 102 |
+
cudaEglResourceLocationVidmem = 0x01, /**< Resource location vidmem */
|
| 103 |
+
} cudaEglResourceLocationFlags;
|
| 104 |
+
|
| 105 |
+
/**
|
| 106 |
+
* CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
|
| 107 |
+
*/
|
| 108 |
+
typedef enum cudaEglColorFormat_enum {
|
| 109 |
+
cudaEglColorFormatYUV420Planar = 0, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 110 |
+
cudaEglColorFormatYUV420SemiPlanar = 1, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
|
| 111 |
+
cudaEglColorFormatYUV422Planar = 2, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 112 |
+
cudaEglColorFormatYUV422SemiPlanar = 3, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
|
| 113 |
+
cudaEglColorFormatARGB = 6, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
|
| 114 |
+
cudaEglColorFormatRGBA = 7, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
|
| 115 |
+
cudaEglColorFormatL = 8, /**< single luminance channel in one surface. */
|
| 116 |
+
cudaEglColorFormatR = 9, /**< single color channel in one surface. */
|
| 117 |
+
cudaEglColorFormatYUV444Planar = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 118 |
+
cudaEglColorFormatYUV444SemiPlanar = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
|
| 119 |
+
cudaEglColorFormatYUYV422 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 120 |
+
cudaEglColorFormatUYVY422 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 121 |
+
cudaEglColorFormatABGR = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
|
| 122 |
+
cudaEglColorFormatBGRA = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
|
| 123 |
+
cudaEglColorFormatA = 16, /**< Alpha color format - one channel in one surface. */
|
| 124 |
+
cudaEglColorFormatRG = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
|
| 125 |
+
cudaEglColorFormatAYUV = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 126 |
+
cudaEglColorFormatYVU444SemiPlanar = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 127 |
+
cudaEglColorFormatYVU422SemiPlanar = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 128 |
+
cudaEglColorFormatYVU420SemiPlanar = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 129 |
+
cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 130 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 131 |
+
cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 132 |
+
cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 133 |
+
cudaEglColorFormatVYUY_ER = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 134 |
+
cudaEglColorFormatUYVY_ER = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
|
| 135 |
+
cudaEglColorFormatYUYV_ER = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
|
| 136 |
+
cudaEglColorFormatYVYU_ER = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 137 |
+
cudaEglColorFormatYUVA_ER = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 138 |
+
cudaEglColorFormatAYUV_ER = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
|
| 139 |
+
cudaEglColorFormatYUV444Planar_ER = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 140 |
+
cudaEglColorFormatYUV422Planar_ER = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 141 |
+
cudaEglColorFormatYUV420Planar_ER = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 142 |
+
cudaEglColorFormatYUV444SemiPlanar_ER = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 143 |
+
cudaEglColorFormatYUV422SemiPlanar_ER = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 144 |
+
cudaEglColorFormatYUV420SemiPlanar_ER = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 145 |
+
cudaEglColorFormatYVU444Planar_ER = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
|
| 146 |
+
cudaEglColorFormatYVU422Planar_ER = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 147 |
+
cudaEglColorFormatYVU420Planar_ER = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 148 |
+
cudaEglColorFormatYVU444SemiPlanar_ER = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
|
| 149 |
+
cudaEglColorFormatYVU422SemiPlanar_ER = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 150 |
+
cudaEglColorFormatYVU420SemiPlanar_ER = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 151 |
+
cudaEglColorFormatBayerRGGB = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
|
| 152 |
+
cudaEglColorFormatBayerBGGR = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
|
| 153 |
+
cudaEglColorFormatBayerGRBG = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
|
| 154 |
+
cudaEglColorFormatBayerGBRG = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
|
| 155 |
+
cudaEglColorFormatBayer10RGGB = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 156 |
+
cudaEglColorFormatBayer10BGGR = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 157 |
+
cudaEglColorFormatBayer10GRBG = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 158 |
+
cudaEglColorFormatBayer10GBRG = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 159 |
+
cudaEglColorFormatBayer12RGGB = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 160 |
+
cudaEglColorFormatBayer12BGGR = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 161 |
+
cudaEglColorFormatBayer12GRBG = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 162 |
+
cudaEglColorFormatBayer12GBRG = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 163 |
+
cudaEglColorFormatBayer14RGGB = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 164 |
+
cudaEglColorFormatBayer14BGGR = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 165 |
+
cudaEglColorFormatBayer14GRBG = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 166 |
+
cudaEglColorFormatBayer14GBRG = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
|
| 167 |
+
cudaEglColorFormatBayer20RGGB = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 168 |
+
cudaEglColorFormatBayer20BGGR = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 169 |
+
cudaEglColorFormatBayer20GRBG = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 170 |
+
cudaEglColorFormatBayer20GBRG = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
|
| 171 |
+
cudaEglColorFormatYVU444Planar = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
|
| 172 |
+
cudaEglColorFormatYVU422Planar = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
|
| 173 |
+
cudaEglColorFormatYVU420Planar = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 174 |
+
cudaEglColorFormatBayerIspRGGB = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
|
| 175 |
+
cudaEglColorFormatBayerIspBGGR = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
|
| 176 |
+
cudaEglColorFormatBayerIspGRBG = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
|
| 177 |
+
cudaEglColorFormatBayerIspGBRG = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
|
| 178 |
+
cudaEglColorFormatBayerBCCR = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
|
| 179 |
+
cudaEglColorFormatBayerRCCB = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
|
| 180 |
+
cudaEglColorFormatBayerCRBC = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
|
| 181 |
+
cudaEglColorFormatBayerCBRC = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
|
| 182 |
+
cudaEglColorFormatBayer10CCCC = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
|
| 183 |
+
cudaEglColorFormatBayer12BCCR = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 184 |
+
cudaEglColorFormatBayer12RCCB = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 185 |
+
cudaEglColorFormatBayer12CRBC = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 186 |
+
cudaEglColorFormatBayer12CBRC = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 187 |
+
cudaEglColorFormatBayer12CCCC = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
|
| 188 |
+
cudaEglColorFormatY = 82, /**< Color format for single Y plane. */
|
| 189 |
+
cudaEglColorFormatYUV420SemiPlanar_2020 = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 190 |
+
cudaEglColorFormatYVU420SemiPlanar_2020 = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 191 |
+
cudaEglColorFormatYUV420Planar_2020 = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 192 |
+
cudaEglColorFormatYVU420Planar_2020 = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 193 |
+
cudaEglColorFormatYUV420SemiPlanar_709 = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 194 |
+
cudaEglColorFormatYVU420SemiPlanar_709 = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 195 |
+
cudaEglColorFormatYUV420Planar_709 = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 196 |
+
cudaEglColorFormatYVU420Planar_709 = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 197 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_709 = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 198 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 199 |
+
cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 200 |
+
cudaEglColorFormatY10V10U10_422SemiPlanar = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 201 |
+
cudaEglColorFormatY10V10U10_422SemiPlanar_709 = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
|
| 202 |
+
cudaEglColorFormatY_ER = 96, /**< Extended Range Color format for single Y plane. */
|
| 203 |
+
cudaEglColorFormatY_709_ER = 97, /**< Extended Range Color format for single Y plane. */
|
| 204 |
+
cudaEglColorFormatY10_ER = 98, /**< Extended Range Color format for single Y10 plane. */
|
| 205 |
+
cudaEglColorFormatY10_709_ER = 99, /**< Extended Range Color format for single Y10 plane. */
|
| 206 |
+
cudaEglColorFormatY12_ER = 100, /**< Extended Range Color format for single Y12 plane. */
|
| 207 |
+
cudaEglColorFormatY12_709_ER = 101, /**< Extended Range Color format for single Y12 plane. */
|
| 208 |
+
cudaEglColorFormatYUVA = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
|
| 209 |
+
cudaEglColorFormatYVYU = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
|
| 210 |
+
cudaEglColorFormatVYUY = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
|
| 211 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_ER = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 212 |
+
cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 213 |
+
cudaEglColorFormatY10V10U10_444SemiPlanar_ER = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 214 |
+
cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 215 |
+
cudaEglColorFormatY12V12U12_420SemiPlanar_ER = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 216 |
+
cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
|
| 217 |
+
cudaEglColorFormatY12V12U12_444SemiPlanar_ER = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 218 |
+
cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
|
| 219 |
+
} cudaEglColorFormat;
|
| 220 |
+
|
| 221 |
+
/**
|
| 222 |
+
* CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
|
| 223 |
+
*/
|
| 224 |
+
typedef struct cudaEglPlaneDesc_st {
|
| 225 |
+
unsigned int width; /**< Width of plane */
|
| 226 |
+
unsigned int height; /**< Height of plane */
|
| 227 |
+
unsigned int depth; /**< Depth of plane */
|
| 228 |
+
unsigned int pitch; /**< Pitch of plane */
|
| 229 |
+
unsigned int numChannels; /**< Number of channels for the plane */
|
| 230 |
+
struct cudaChannelFormatDesc channelDesc; /**< Channel Format Descriptor */
|
| 231 |
+
unsigned int reserved[4]; /**< Reserved for future use */
|
| 232 |
+
} cudaEglPlaneDesc;
|
| 233 |
+
|
| 234 |
+
/**
|
| 235 |
+
* CUDA EGLFrame Descriptor - structure defining one frame of EGL.
|
| 236 |
+
*
|
| 237 |
+
* Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
|
| 238 |
+
* Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
|
| 239 |
+
* \code
|
| 240 |
+
* typedef struct cudaEglPlaneDesc_st {
|
| 241 |
+
* unsigned int width;
|
| 242 |
+
* unsigned int height;
|
| 243 |
+
* unsigned int depth;
|
| 244 |
+
* unsigned int pitch;
|
| 245 |
+
* unsigned int numChannels;
|
| 246 |
+
* struct cudaChannelFormatDesc channelDesc;
|
| 247 |
+
* unsigned int reserved[4];
|
| 248 |
+
* } cudaEglPlaneDesc;
|
| 249 |
+
* \endcode
|
| 250 |
+
|
| 251 |
+
*/
|
| 252 |
+
typedef struct cudaEglFrame_st {
|
| 253 |
+
union {
|
| 254 |
+
cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; /**< Array of CUDA arrays corresponding to each plane*/
|
| 255 |
+
struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
|
| 256 |
+
} frame;
|
| 257 |
+
cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
|
| 258 |
+
unsigned int planeCount; /**< Number of planes */
|
| 259 |
+
cudaEglFrameType frameType; /**< Array or Pitch */
|
| 260 |
+
cudaEglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
|
| 261 |
+
} cudaEglFrame;
|
| 262 |
+
|
| 263 |
+
/**
|
| 264 |
+
* CUDA EGLSream Connection
|
| 265 |
+
*/
|
| 266 |
+
typedef struct CUeglStreamConnection_st *cudaEglStreamConnection;
|
| 267 |
+
|
| 268 |
+
/** @} */ /* END CUDART_TYPES */
|
| 269 |
+
|
| 270 |
+
/**
|
| 271 |
+
* \addtogroup CUDART_EGL EGL Interoperability
|
| 272 |
+
* This section describes the EGL interoperability functions of the CUDA
|
| 273 |
+
* runtime application programming interface.
|
| 274 |
+
*
|
| 275 |
+
* @{
|
| 276 |
+
*/
|
| 277 |
+
|
| 278 |
+
/**
|
| 279 |
+
* \brief Registers an EGL image
|
| 280 |
+
*
|
| 281 |
+
* Registers the EGLImageKHR specified by \p image for access by
|
| 282 |
+
* CUDA. A handle to the registered object is returned as \p pCudaResource.
|
| 283 |
+
* Additional Mapping/Unmapping is not required for the registered resource and
|
| 284 |
+
* ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
|
| 285 |
+
*
|
| 286 |
+
* The application will be responsible for synchronizing access to shared objects.
|
| 287 |
+
* The application must ensure that any pending operation which access the objects have completed
|
| 288 |
+
* before passing control to CUDA. This may be accomplished by issuing and waiting for
|
| 289 |
+
* glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
|
| 290 |
+
* The application will be also responsible for ensuring that any pending operation on the
|
| 291 |
+
* registered CUDA resource has completed prior to executing subsequent commands in other APIs
|
| 292 |
+
* accesing the same memory objects.
|
| 293 |
+
* This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
|
| 294 |
+
*
|
| 295 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 296 |
+
*
|
| 297 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 298 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 299 |
+
* read from and written to by CUDA. This is the default value.
|
| 300 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 301 |
+
* will not write to this resource.
|
| 302 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 303 |
+
* CUDA will not read from this resource and will write over the
|
| 304 |
+
* entire contents of the resource, so none of the data previously
|
| 305 |
+
* stored in the resource will be preserved.
|
| 306 |
+
*
|
| 307 |
+
* The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
|
| 308 |
+
* typedef void* EGLImageKHR
|
| 309 |
+
*
|
| 310 |
+
* \param pCudaResource - Pointer to the returned object handle
|
| 311 |
+
* \param image - An EGLImageKHR image which can be used to create target resource.
|
| 312 |
+
* \param flags - Map flags
|
| 313 |
+
*
|
| 314 |
+
* \return
|
| 315 |
+
* ::cudaSuccess,
|
| 316 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 317 |
+
* ::cudaErrorInvalidValue,
|
| 318 |
+
* ::cudaErrorUnknown
|
| 319 |
+
*
|
| 320 |
+
* \sa
|
| 321 |
+
* ::cudaGraphicsUnregisterResource,
|
| 322 |
+
* ::cudaGraphicsResourceGetMappedEglFrame,
|
| 323 |
+
* ::cuGraphicsEGLRegisterImage
|
| 324 |
+
*/
|
| 325 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
|
| 326 |
+
|
| 327 |
+
/**
|
| 328 |
+
* \brief Connect CUDA to EGLStream as a consumer.
|
| 329 |
+
*
|
| 330 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
|
| 331 |
+
*
|
| 332 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 333 |
+
* API to another.
|
| 334 |
+
*
|
| 335 |
+
* \param conn - Pointer to the returned connection handle
|
| 336 |
+
* \param eglStream - EGLStreamKHR handle
|
| 337 |
+
*
|
| 338 |
+
* \return
|
| 339 |
+
* ::cudaSuccess,
|
| 340 |
+
* ::cudaErrorInvalidValue,
|
| 341 |
+
* ::cudaErrorUnknown
|
| 342 |
+
*
|
| 343 |
+
* \sa
|
| 344 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 345 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 346 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 347 |
+
* ::cuEGLStreamConsumerConnect
|
| 348 |
+
*/
|
| 349 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
|
| 350 |
+
|
| 351 |
+
/**
|
| 352 |
+
* \brief Connect CUDA to EGLStream as a consumer with given flags.
|
| 353 |
+
*
|
| 354 |
+
* Connect CUDA as a consumer to EGLStreamKHR specified by \p stream with specified \p flags defined by
|
| 355 |
+
* ::cudaEglResourceLocationFlags.
|
| 356 |
+
*
|
| 357 |
+
* The flags specify whether the consumer wants to access frames from system memory or video memory.
|
| 358 |
+
* Default is ::cudaEglResourceLocationVidmem.
|
| 359 |
+
*
|
| 360 |
+
* \param conn - Pointer to the returned connection handle
|
| 361 |
+
* \param eglStream - EGLStreamKHR handle
|
| 362 |
+
* \param flags - Flags denote intended location - system or video.
|
| 363 |
+
*
|
| 364 |
+
* \return
|
| 365 |
+
* ::cudaSuccess,
|
| 366 |
+
* ::cudaErrorInvalidValue,
|
| 367 |
+
* ::cudaErrorUnknown
|
| 368 |
+
*
|
| 369 |
+
* \sa
|
| 370 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 371 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 372 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 373 |
+
* ::cuEGLStreamConsumerConnectWithFlags
|
| 374 |
+
*/
|
| 375 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
|
| 376 |
+
|
| 377 |
+
/**
|
| 378 |
+
* \brief Disconnect CUDA as a consumer to EGLStream .
|
| 379 |
+
*
|
| 380 |
+
* Disconnect CUDA as a consumer to EGLStreamKHR.
|
| 381 |
+
*
|
| 382 |
+
* \param conn - Conection to disconnect.
|
| 383 |
+
*
|
| 384 |
+
* \return
|
| 385 |
+
* ::cudaSuccess,
|
| 386 |
+
* ::cudaErrorInvalidValue,
|
| 387 |
+
* ::cudaErrorUnknown
|
| 388 |
+
*
|
| 389 |
+
* \sa
|
| 390 |
+
* ::cudaEGLStreamConsumerConnect,
|
| 391 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 392 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 393 |
+
* ::cuEGLStreamConsumerDisconnect
|
| 394 |
+
*/
|
| 395 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
|
| 396 |
+
|
| 397 |
+
/**
|
| 398 |
+
* \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
|
| 399 |
+
*
|
| 400 |
+
* Acquire an image frame from EGLStreamKHR.
|
| 401 |
+
* ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
|
| 402 |
+
* ::cudaEglFrame.
|
| 403 |
+
*
|
| 404 |
+
* \param conn - Connection on which to acquire
|
| 405 |
+
* \param pCudaResource - CUDA resource on which the EGLStream frame will be mapped for use.
|
| 406 |
+
* \param pStream - CUDA stream for synchronization and any data migrations
|
| 407 |
+
* implied by ::cudaEglResourceLocationFlags.
|
| 408 |
+
* \param timeout - Desired timeout in usec.
|
| 409 |
+
*
|
| 410 |
+
* \return
|
| 411 |
+
* ::cudaSuccess,
|
| 412 |
+
* ::cudaErrorInvalidValue,
|
| 413 |
+
* ::cudaErrorUnknown,
|
| 414 |
+
* ::cudaErrorLaunchTimeout
|
| 415 |
+
*
|
| 416 |
+
* \sa
|
| 417 |
+
* ::cudaEGLStreamConsumerConnect,
|
| 418 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 419 |
+
* ::cudaEGLStreamConsumerReleaseFrame,
|
| 420 |
+
* ::cuEGLStreamConsumerAcquireFrame
|
| 421 |
+
*/
|
| 422 |
+
|
| 423 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
|
| 424 |
+
cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
|
| 425 |
+
/**
|
| 426 |
+
* \brief Releases the last frame acquired from the EGLStream.
|
| 427 |
+
*
|
| 428 |
+
* Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
|
| 429 |
+
*
|
| 430 |
+
* \param conn - Connection on which to release
|
| 431 |
+
* \param pCudaResource - CUDA resource whose corresponding frame is to be released
|
| 432 |
+
* \param pStream - CUDA stream on which release will be done.
|
| 433 |
+
*
|
| 434 |
+
* \return
|
| 435 |
+
* ::cudaSuccess,
|
| 436 |
+
* ::cudaErrorInvalidValue,
|
| 437 |
+
* ::cudaErrorUnknown
|
| 438 |
+
*
|
| 439 |
+
* \sa
|
| 440 |
+
* ::cudaEGLStreamConsumerConnect,
|
| 441 |
+
* ::cudaEGLStreamConsumerDisconnect,
|
| 442 |
+
* ::cudaEGLStreamConsumerAcquireFrame,
|
| 443 |
+
* ::cuEGLStreamConsumerReleaseFrame
|
| 444 |
+
*/
|
| 445 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
|
| 446 |
+
cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
|
| 447 |
+
|
| 448 |
+
/**
|
| 449 |
+
* \brief Connect CUDA to EGLStream as a producer.
|
| 450 |
+
*
|
| 451 |
+
* Connect CUDA as a producer to EGLStreamKHR specified by \p stream.
|
| 452 |
+
*
|
| 453 |
+
* The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
|
| 454 |
+
* API to another.
|
| 455 |
+
*
|
| 456 |
+
* \param conn - Pointer to the returned connection handle
|
| 457 |
+
* \param eglStream - EGLStreamKHR handle
|
| 458 |
+
* \param width - width of the image to be submitted to the stream
|
| 459 |
+
* \param height - height of the image to be submitted to the stream
|
| 460 |
+
*
|
| 461 |
+
* \return
|
| 462 |
+
* ::cudaSuccess,
|
| 463 |
+
* ::cudaErrorInvalidValue,
|
| 464 |
+
* ::cudaErrorUnknown
|
| 465 |
+
*
|
| 466 |
+
* \sa
|
| 467 |
+
* ::cudaEGLStreamProducerDisconnect,
|
| 468 |
+
* ::cudaEGLStreamProducerPresentFrame,
|
| 469 |
+
* ::cudaEGLStreamProducerReturnFrame,
|
| 470 |
+
* ::cuEGLStreamProducerConnect
|
| 471 |
+
*/
|
| 472 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
|
| 473 |
+
EGLStreamKHR eglStream, EGLint width, EGLint height);
|
| 474 |
+
|
| 475 |
+
/**
|
| 476 |
+
* \brief Disconnect CUDA as a producer to EGLStream .
|
| 477 |
+
*
|
| 478 |
+
* Disconnect CUDA as a producer to EGLStreamKHR.
|
| 479 |
+
*
|
| 480 |
+
* \param conn - Conection to disconnect.
|
| 481 |
+
*
|
| 482 |
+
* \return
|
| 483 |
+
* ::cudaSuccess,
|
| 484 |
+
* ::cudaErrorInvalidValue,
|
| 485 |
+
* ::cudaErrorUnknown
|
| 486 |
+
*
|
| 487 |
+
* \sa
|
| 488 |
+
* ::cudaEGLStreamProducerConnect,
|
| 489 |
+
* ::cudaEGLStreamProducerPresentFrame,
|
| 490 |
+
* ::cudaEGLStreamProducerReturnFrame,
|
| 491 |
+
* ::cuEGLStreamProducerDisconnect
|
| 492 |
+
*/
|
| 493 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
|
| 494 |
+
|
| 495 |
+
/**
|
| 496 |
+
* \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
|
| 497 |
+
*
|
| 498 |
+
* The ::cudaEglFrame is defined as:
|
| 499 |
+
* \code
|
| 500 |
+
* typedef struct cudaEglFrame_st {
|
| 501 |
+
* union {
|
| 502 |
+
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
|
| 503 |
+
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
|
| 504 |
+
* } frame;
|
| 505 |
+
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
|
| 506 |
+
* unsigned int planeCount;
|
| 507 |
+
* cudaEglFrameType frameType;
|
| 508 |
+
* cudaEglColorFormat eglColorFormat;
|
| 509 |
+
* } cudaEglFrame;
|
| 510 |
+
* \endcode
|
| 511 |
+
*
|
| 512 |
+
* For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory
|
| 513 |
+
* allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
|
| 514 |
+
* the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
|
| 515 |
+
*
|
| 516 |
+
* \param conn - Connection on which to present the CUDA array
|
| 517 |
+
* \param eglframe - CUDA Eglstream Proucer Frame handle to be sent to the consumer over EglStream.
|
| 518 |
+
* \param pStream - CUDA stream on which to present the frame.
|
| 519 |
+
*
|
| 520 |
+
* \return
|
| 521 |
+
* ::cudaSuccess,
|
| 522 |
+
* ::cudaErrorInvalidValue,
|
| 523 |
+
* ::cudaErrorUnknown
|
| 524 |
+
*
|
| 525 |
+
* \sa
|
| 526 |
+
* ::cudaEGLStreamProducerConnect,
|
| 527 |
+
* ::cudaEGLStreamProducerDisconnect,
|
| 528 |
+
* ::cudaEGLStreamProducerReturnFrame,
|
| 529 |
+
* ::cuEGLStreamProducerPresentFrame
|
| 530 |
+
*/
|
| 531 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
|
| 532 |
+
cudaEglFrame eglframe, cudaStream_t *pStream);
|
| 533 |
+
|
| 534 |
+
/**
|
| 535 |
+
* \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
|
| 536 |
+
*
|
| 537 |
+
* This API can potentially return cudaErrorLaunchTimeout if the consumer has not
|
| 538 |
+
* returned a frame to EGL stream. If timeout is returned the application can retry.
|
| 539 |
+
*
|
| 540 |
+
* \param conn - Connection on which to present the CUDA array
|
| 541 |
+
* \param eglframe - CUDA Eglstream Proucer Frame handle returned from the consumer over EglStream.
|
| 542 |
+
* \param pStream - CUDA stream on which to return the frame.
|
| 543 |
+
*
|
| 544 |
+
* \return
|
| 545 |
+
* ::cudaSuccess,
|
| 546 |
+
* ::cudaErrorLaunchTimeout,
|
| 547 |
+
* ::cudaErrorInvalidValue,
|
| 548 |
+
* ::cudaErrorUnknown
|
| 549 |
+
*
|
| 550 |
+
* \sa
|
| 551 |
+
* ::cudaEGLStreamProducerConnect,
|
| 552 |
+
* ::cudaEGLStreamProducerDisconnect,
|
| 553 |
+
* ::cudaEGLStreamProducerPresentFrame,
|
| 554 |
+
* ::cuEGLStreamProducerReturnFrame
|
| 555 |
+
*/
|
| 556 |
+
extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
|
| 557 |
+
cudaEglFrame *eglframe, cudaStream_t *pStream);
|
| 558 |
+
|
| 559 |
+
/**
|
| 560 |
+
* \brief Get an eglFrame through which to access a registered EGL graphics resource.
|
| 561 |
+
*
|
| 562 |
+
* Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
|
| 563 |
+
* \p resource may be accessed.
|
| 564 |
+
* This API can only be called for EGL graphics resources.
|
| 565 |
+
*
|
| 566 |
+
* The ::cudaEglFrame is defined as
|
| 567 |
+
* \code
|
| 568 |
+
* typedef struct cudaEglFrame_st {
|
| 569 |
+
* union {
|
| 570 |
+
* cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
|
| 571 |
+
* struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
|
| 572 |
+
* } frame;
|
| 573 |
+
* cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
|
| 574 |
+
* unsigned int planeCount;
|
| 575 |
+
* cudaEglFrameType frameType;
|
| 576 |
+
* cudaEglColorFormat eglColorFormat;
|
| 577 |
+
* } cudaEglFrame;
|
| 578 |
+
* \endcode
|
| 579 |
+
*
|
| 580 |
+
*
|
| 581 |
+
* \param eglFrame - Returned eglFrame.
|
| 582 |
+
* \param resource - Registered resource to access.
|
| 583 |
+
* \param index - Index for cubemap surfaces.
|
| 584 |
+
* \param mipLevel - Mipmap level for the subresource to access.
|
| 585 |
+
*
|
| 586 |
+
* \return
|
| 587 |
+
* ::cudaSuccess,
|
| 588 |
+
* ::cudaErrorInvalidValue,
|
| 589 |
+
* ::cudaErrorUnknown
|
| 590 |
+
*
|
| 591 |
+
* \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application.
|
| 592 |
+
*
|
| 593 |
+
* \sa
|
| 594 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 595 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 596 |
+
* ::cuGraphicsResourceGetMappedEglFrame
|
| 597 |
+
*/
|
| 598 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
|
| 599 |
+
cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
|
| 600 |
+
|
| 601 |
+
/**
|
| 602 |
+
* \brief Creates an event from EGLSync object
|
| 603 |
+
*
|
| 604 |
+
* Creates an event *phEvent from an EGLSyncKHR eglSync with the flages specified
|
| 605 |
+
* via \p flags. Valid flags include:
|
| 606 |
+
* - ::cudaEventDefault: Default event creation flag.
|
| 607 |
+
* - ::cudaEventBlockingSync: Specifies that the created event should use blocking
|
| 608 |
+
* synchronization. A CPU thread that uses ::cudaEventSynchronize() to wait on
|
| 609 |
+
* an event created with this flag will block until the event has actually
|
| 610 |
+
* been completed.
|
| 611 |
+
*
|
| 612 |
+
* ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
|
| 613 |
+
*
|
| 614 |
+
* The EGLSyncKHR is an opaque handle to an EGL sync object.
|
| 615 |
+
* typedef void* EGLSyncKHR
|
| 616 |
+
*
|
| 617 |
+
* \param phEvent - Returns newly created event
|
| 618 |
+
* \param eglSync - Opaque handle to EGLSync object
|
| 619 |
+
* \param flags - Event creation flags
|
| 620 |
+
*
|
| 621 |
+
* \return
|
| 622 |
+
* ::cudaSuccess,
|
| 623 |
+
* ::cudaErrorInitializationError,
|
| 624 |
+
* ::cudaErrorInvalidValue,
|
| 625 |
+
* ::cudaErrorLaunchFailure,
|
| 626 |
+
* ::cudaErrorMemoryAllocation
|
| 627 |
+
*
|
| 628 |
+
* \sa
|
| 629 |
+
* ::cudaEventQuery,
|
| 630 |
+
* ::cudaEventSynchronize,
|
| 631 |
+
* ::cudaEventDestroy
|
| 632 |
+
*/
|
| 633 |
+
extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
|
| 634 |
+
|
| 635 |
+
/** @} */ /* END CUDART_EGL */
|
| 636 |
+
|
| 637 |
+
#if defined(__cplusplus)
|
| 638 |
+
}
|
| 639 |
+
#endif /* __cplusplus */
|
| 640 |
+
|
| 641 |
+
#endif /* __CUDA_EGL_INTEROP_H__ */
|
| 642 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp
ADDED
|
@@ -0,0 +1,1546 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_FP8_HPP__)
|
| 51 |
+
#define __CUDA_FP8_HPP__
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDA_FP8_H__)
|
| 54 |
+
#error "Do not include this file directly. Instead, include cuda_fp8.h."
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
/* C++ header for std::memcpy (used for type punning in host-side
|
| 58 |
+
* implementations). When compiling as a CUDA source file memcpy is provided
|
| 59 |
+
* implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
|
| 60 |
+
*/
|
| 61 |
+
#if defined(__cplusplus) && !defined(__CUDACC__)
|
| 62 |
+
#include <cstring>
|
| 63 |
+
#elif !defined(__cplusplus) && !defined(__CUDACC__)
|
| 64 |
+
#include <string.h>
|
| 65 |
+
#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
|
| 66 |
+
|
| 67 |
+
/* Set up structure-alignment attribute */
|
| 68 |
+
#if !(defined __CUDA_ALIGN__)
|
| 69 |
+
#if defined(__CUDACC__)
|
| 70 |
+
#define __CUDA_ALIGN__(align) __align__(align)
|
| 71 |
+
#else
|
| 72 |
+
/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
|
| 73 |
+
* is available) */
|
| 74 |
+
#if __cplusplus >= 201103L
|
| 75 |
+
#define __CUDA_ALIGN__(n) \
|
| 76 |
+
alignas(n) /* C++11 kindly gives us a keyword for this */
|
| 77 |
+
#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP8)*/
|
| 78 |
+
#if defined(__GNUC__)
|
| 79 |
+
#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
|
| 80 |
+
#elif defined(_MSC_VER)
|
| 81 |
+
#define __CUDA_ALIGN__(n) __declspec(align(n))
|
| 82 |
+
#else
|
| 83 |
+
#define __CUDA_ALIGN__(n)
|
| 84 |
+
#endif /* defined(__GNUC__) */
|
| 85 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 86 |
+
#endif /* defined(__CUDACC__) */
|
| 87 |
+
#endif /* !(defined __CUDA_ALIGN__) */
|
| 88 |
+
|
| 89 |
+
#if !(defined __CPP_VERSION_AT_LEAST_11_FP8)
|
| 90 |
+
/* need c++11 for explicit operators */
|
| 91 |
+
#define __CUDA_NO_FP8_CONVERSION_OPERATORS__
|
| 92 |
+
#endif
|
| 93 |
+
|
| 94 |
+
/* Convert a double-precision value to fp8 storage (E4M3 or E5M2) using
 * round-to-nearest-even. Handles zero/underflow, NaN, overflow (saturating
 * to maxnorm or mapping to NaN/Inf depending on `saturate`), the normal
 * range, and the denormal range of the target format. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
                       const __nv_fp8_interpretation_t fp8_interpretation) {
    unsigned char res;
    unsigned long long int xbits;

    /* Type-pun the double into its raw bit pattern. */
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&xbits, &x, sizeof(x));
#else
    (void)std::memcpy(&xbits, &x, sizeof(x));
#endif

    /* Format-dependent constants, selected from the requested interpretation. */
    unsigned char FP8_MAXNORM;
    unsigned char FP8_MANTISSA_MASK;
    unsigned short int FP8_EXP_BIAS;
    unsigned long long int FP8_SIGNIFICAND_BITS;
    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
    unsigned long long int FP8_MINDENORM_O2;
    unsigned long long int FP8_OVERFLOW_THRESHOLD;
    unsigned long long int FP8_MINNORM;

    if (fp8_interpretation == __NV_E4M3) {
        FP8_EXP_BIAS = 7U;
        FP8_SIGNIFICAND_BITS = 4ULL;
        FP8_MANTISSA_MASK = 0x7U;
        FP8_MINDENORM_O2 = 0x3F50000000000000ULL; // mindenorm/2 = 2^-10
        FP8_OVERFLOW_THRESHOLD =
            0x407D000000000000ULL; // maxnorm + 1/2ulp = 0x1.Cp+8 + 0x1p+4
        FP8_MAXNORM = 0x7EU;
        FP8_MINNORM = 0x3F90000000000000ULL; // minnorm = 2^-6
    } else { //__NV_E5M2
        FP8_EXP_BIAS = 15U;
        FP8_SIGNIFICAND_BITS = 3ULL;
        FP8_MANTISSA_MASK = 0x3U;
        FP8_MINDENORM_O2 = 0x3EE0000000000000ULL; // mindenorm/2 = 2^-17
        FP8_OVERFLOW_THRESHOLD =
            0x40EE000000000000ULL -
            1ULL; // maxnorm + 1/2ulp = 0x1.Ep+15, and -1 to have common code
        FP8_MAXNORM = 0x7BU;
        FP8_MINNORM = 0x3F10000000000000ULL; // minnorm = 2^-14
    }

    // 1/2 LSB of the target format, positioned in double precision mantissa
    // helpful in midpoints detection during round-to-nearest-even step
    const unsigned long long int FP8_DP_HALF_ULP =
        (unsigned long long int)1ULL << (53ULL - FP8_SIGNIFICAND_BITS - 1ULL);
    // prepare sign bit in target format
    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 7U);
    // prepare exponent field in target format (re-bias from DP to fp8)
    unsigned char exp =
        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
                        1023U + FP8_EXP_BIAS);
    // round mantissa to target format width, rounding towards zero
    unsigned char mantissa =
        (unsigned char)(xbits >> (53ULL - FP8_SIGNIFICAND_BITS)) &
        FP8_MANTISSA_MASK;
    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL;

    if (absx <= FP8_MINDENORM_O2) {
        // zero or underflow
        res = 0U;
    } else if (absx > DP_INF_BITS) {
        // NaN
        if (fp8_interpretation == __NV_E4M3) {
            res = 0x7FU;
        } else {
            // NaN --> QNaN
            res = 0x7EU | mantissa;
        }
    } else if (absx > FP8_OVERFLOW_THRESHOLD) {
        if (saturate == __NV_SATFINITE) {
            res = FP8_MAXNORM;
        } else {
            // __NV_NOSAT
            if (fp8_interpretation == __NV_E4M3) {
                // no Inf in E4M3
                res = 0x7FU; // NaN
            } else {
                res = 0x7CU; // Inf in E5M2
            }
        }
    } else if (absx >= FP8_MINNORM) {
        res = (unsigned char)((exp << (FP8_SIGNIFICAND_BITS - 1U)) | mantissa);
        // rounded-off bits
        unsigned long long int round =
            xbits & ((FP8_DP_HALF_ULP << 1ULL) - 1ULL);
        // round-to-nearest-even adjustment
        if ((round > FP8_DP_HALF_ULP) ||
            ((round == FP8_DP_HALF_ULP) && (mantissa & 1U))) {
            res = (unsigned char)(res + 1U);
        }
    } else // Denormal range
    {
        unsigned char shift = (unsigned char)(1U - exp);
        // add implicit leading bit
        mantissa |= (unsigned char)(1U << (FP8_SIGNIFICAND_BITS - 1U));
        // additional round-off due to denormalization
        res = (unsigned char)(mantissa >> shift);

        // rounded-off bits, including implicit leading bit
        unsigned long long int round =
            (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
            ((FP8_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
        // round-to-nearest-even adjustment
        if ((round > (FP8_DP_HALF_ULP << shift)) ||
            ((round == (FP8_DP_HALF_ULP << shift)) && (res & 1U))) {
            res = (unsigned char)(res + 1U);
        }
    }

    res |= sign;

    return (__nv_fp8_storage_t)res;
}
|
| 207 |
+
|
| 208 |
+
/* Convert a double2 to a packed pair of fp8 values: x.y goes into the high
 * byte, x.x into the low byte. Each lane uses the scalar conversion above. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
                          const __nv_fp8_interpretation_t fp8_interpretation) {
    __nv_fp8x2_storage_t storage = (__nv_fp8x2_storage_t)__nv_cvt_double_to_fp8(
        x.y, saturate, fp8_interpretation);
    storage = (__nv_fp8x2_storage_t)(storage << 8U);
    storage = (__nv_fp8x2_storage_t)(storage |
                                     __nv_cvt_double_to_fp8(
                                         x.x, saturate, fp8_interpretation));
    return storage;
}
|
| 219 |
+
|
| 220 |
+
/* Convert a single-precision value to fp8 storage. On SM 9.0+ devices the
 * satfinite case is a single PTX cvt instruction; otherwise the input NaN is
 * canonicalized, widened to double, and routed through the double converter
 * (float->double widening is exact, so rounding happens only once). */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
                      const __nv_fp8_interpretation_t fp8_interpretation) {
    __nv_fp8_storage_t res = 0U;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    if (saturate == __NV_SATFINITE) {
        __nv_fp8x2_storage_t storage;
        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
                : "=h"(storage)
                : "f"(x), "f"(0.0f));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
                : "=h"(storage)
                : "f"(x), "f"(0.0f));
        }
        res = (__nv_fp8_storage_t)storage;
    } else
#endif
    {
        unsigned int xbits;
#if defined(__CUDACC__) || (!defined __cplusplus)
        (void)memcpy(&xbits, &x, sizeof(x));
#else
        (void)std::memcpy(&xbits, &x, sizeof(x));
#endif

        // isnan: replace any NaN payload with the canonical NaN
        if ((xbits & 0x7FFFFFFFU) > 0x7F800000U) {
            // Canonical NaN
            xbits = 0x7FFFFFFFU;
        }

        float fx;
#if defined(__CUDACC__) || (!defined __cplusplus)
        (void)memcpy(&fx, &xbits, sizeof(xbits));
#else
        (void)std::memcpy(&fx, &xbits, sizeof(xbits));
#endif

        const double dx = (double)fx;
        res = __nv_cvt_double_to_fp8(dx, saturate, fp8_interpretation);
    }
    return res;
}
|
| 265 |
+
|
| 266 |
+
/* Convert a float2 to a packed pair of fp8 values (x.y high byte, x.x low
 * byte). The SM 9.0+ satfinite path converts both lanes with one PTX cvt. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
                         const __nv_fp8_interpretation_t fp8_interpretation) {
    __nv_fp8x2_storage_t storage;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    if (saturate == __NV_SATFINITE) {
        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
                : "=h"(storage)
                : "f"(x.x), "f"(x.y));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
                : "=h"(storage)
                : "f"(x.x), "f"(x.y));
        }
    } else
#endif
    {
        storage = (__nv_fp8x2_storage_t)__nv_cvt_float_to_fp8(
            x.y, saturate, fp8_interpretation);
        storage = (__nv_fp8x2_storage_t)(storage << 8U);
        storage = (__nv_fp8x2_storage_t)(storage | __nv_cvt_float_to_fp8(
                                                       x.x, saturate,
                                                       fp8_interpretation));
    }
    return storage;
}
|
| 293 |
+
|
| 294 |
+
/* Widen a raw fp16 bit pattern to float. On SM 5.3+ a hardware cvt is used;
 * the host/older-device fallback expands sign/exponent/mantissa by hand,
 * normalizing fp16 denormals and mapping NaN to the canonical float NaN
 * (with its sign discarded). This widening is exact for every fp16 value. */
__CUDA_HOSTDEVICE_FP8_DECL__ float
__internal_halfraw_to_float(const __half_raw x) {
    float f;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
    asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(f) : "h"(x.x));
#else
    const unsigned int ux = (unsigned int)x.x;
    unsigned int sign = (ux >> 15U) & 1U;
    unsigned int exponent = (ux >> 10U) & 0x1fU;
    unsigned int mantissa = (ux & 0x3ffU) << 13U;
    if (exponent == 0x1fU) { /* NaN or Inf */
        /* discard sign of a NaN */
        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
        exponent = 0xffU;
    } else if (exponent == 0U) { /* Denorm or Zero */
        if (mantissa != 0U) {
            unsigned int msb;
            exponent = 0x71U;
            do {
                msb = (mantissa & 0x400000U);
                mantissa <<= 1U; /* normalize */
                --exponent;
            } while (msb == 0U);
            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
        }
    } else {
        exponent += 0x70U; /* re-bias: fp16 bias 15 -> fp32 bias 127 */
    }
    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&f, &u, sizeof(u));
#else
    (void)std::memcpy(&f, &u, sizeof(u));
#endif
#endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */
    return f;
}
|
| 332 |
+
|
| 333 |
+
/* Widen both halves of a raw fp16x2 pair to a float2, lane by lane. */
__CUDA_HOSTDEVICE_FP8_DECL__ float2
__internal_halfraw2_to_float2(const __half2_raw x) {
    __half_raw raw;
    float2 res;
    raw.x = x.x;
    res.x = __internal_halfraw_to_float(raw);
    raw.x = x.y;
    res.y = __internal_halfraw_to_float(raw);
    return res;
}
|
| 343 |
+
|
| 344 |
+
/* Convert a raw fp16 value to fp8 storage. The SM 9.0+ satfinite case packs
 * the half into an f16x2 register and uses one PTX cvt; otherwise the value
 * is widened to float (exact) and converted through the float path. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
                        const __nv_fp8_interpretation_t fp8_interpretation) {
    __nv_fp8_storage_t res = 0U;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    if (saturate == __NV_SATFINITE) {
        unsigned int half2_storage = (unsigned int)(x.x);
        __nv_fp8x2_storage_t tmp;
        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
                : "=h"(tmp)
                : "r"(half2_storage));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
                : "=h"(tmp)
                : "r"(half2_storage));
        }
        res = (__nv_fp8_storage_t)tmp;
    } else
#endif
    {
        float fx = __internal_halfraw_to_float(x);
        res = __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
    }
    return res;
}
|
| 370 |
+
|
| 371 |
+
/* Convert a raw fp16x2 pair to packed fp8x2 (x.y high byte, x.x low byte).
 * On SM 9.0+ the satfinite case converts both lanes with one PTX cvt. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
    const __half2_raw x, const __nv_saturation_t saturate,
    const __nv_fp8_interpretation_t fp8_interpretation) {
    __nv_fp8x2_storage_t tmp;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    if (saturate == __NV_SATFINITE) {
        unsigned int half2_storage;
        (void)memcpy(&half2_storage, &x, sizeof(x));

        if (fp8_interpretation == __NV_E5M2) {
            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
                : "=h"(tmp)
                : "r"(half2_storage));
        } else {
            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
                : "=h"(tmp)
                : "r"(half2_storage));
        }
    } else
#endif
    {
        __half_raw raw;
        raw.x = x.x;
        __nv_fp8_storage_t lo =
            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
        raw.x = x.y;
        __nv_fp8_storage_t hi =
            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
        tmp = hi;
        tmp = (__nv_fp8x2_storage_t)(tmp << 8U);
        tmp = (__nv_fp8x2_storage_t)(tmp | lo);
    }
    return tmp;
}
|
| 405 |
+
|
| 406 |
+
/* Widen a raw bfloat16 bit pattern to float. bf16 is the upper 16 bits of an
 * fp32, so shifting left by 16 and reinterpreting is exact. */
__CUDA_HOSTDEVICE_FP8_DECL__ float
__internal_bf16raw_to_float(const __nv_bfloat16_raw x) {
    const unsigned int ux = ((unsigned int)x.x) << 16U;
    float fx;
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&fx, &ux, sizeof(ux));
#else
    (void)std::memcpy(&fx, &ux, sizeof(ux));
#endif
    return fx;
}
|
| 417 |
+
|
| 418 |
+
/* Narrow a float to raw bfloat16 with round-toward-zero: keep the upper
 * 16 bits of the fp32 pattern and drop the rest. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
__internal_float_to_bf16raw_rz(const float x) {
    unsigned int ux;
    __nv_bfloat16_raw r;
#if defined(__CUDACC__) || (!defined __cplusplus)
    (void)memcpy(&ux, &x, sizeof(x));
#else
    (void)std::memcpy(&ux, &x, sizeof(x));
#endif
    r.x = (unsigned short int)(ux >> 16U);
    return r;
}
|
| 430 |
+
|
| 431 |
+
/* Convert a raw bfloat16 value to fp8 storage by widening exactly to float
 * and reusing the float conversion path. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
    const __nv_fp8_interpretation_t fp8_interpretation) {
    const float fx = __internal_bf16raw_to_float(x);
    const __nv_fp8_storage_t res =
        __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
    return res;
}
|
| 439 |
+
|
| 440 |
+
/* Convert a raw bfloat16x2 pair to packed fp8x2 (x.y high byte, x.x low
 * byte), one lane at a time. */
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
__nv_cvt_bfloat16raw2_to_fp8x2(
    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
    const __nv_fp8_interpretation_t fp8_interpretation) {
    __nv_bfloat16_raw raw;
    raw.x = x.y;
    __nv_fp8x2_storage_t storage =
        (__nv_fp8x2_storage_t)__nv_cvt_bfloat16raw_to_fp8(raw, saturate,
                                                          fp8_interpretation);
    storage = (__nv_fp8x2_storage_t)(storage << 8U);
    raw.x = x.x;
    storage = (__nv_fp8x2_storage_t)(storage |
                                     __nv_cvt_bfloat16raw_to_fp8(
                                         raw, saturate, fp8_interpretation));
    return storage;
}
|
| 456 |
+
|
| 457 |
+
/* Forward declaration: the scalar converter below reuses the vector one on
 * SM 9.0+ devices. */
__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
                           const __nv_fp8_interpretation_t fp8_interpretation);

/* Convert one fp8 value to a raw fp16 bit pattern. E5M2 widening only needs
 * a left shift by 8 (same exponent width), plus NaN canonicalization. E4M3
 * widening re-biases the exponent (7 -> 15), normalizes fp8 denormals, and
 * maps fp8 NaN (0x7F) to the fp16 canonical NaN. The conversion is exact
 * for every non-NaN fp8 value. */
__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
                        const __nv_fp8_interpretation_t fp8_interpretation) {
    __half_raw res;
    res.x = 0U;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    res.x =
        __nv_cvt_fp8x2_to_halfraw2((__nv_fp8x2_storage_t)x, fp8_interpretation)
            .x;
#else
    unsigned short int ur = (unsigned short int)x;
    ur = (unsigned short int)(ur << 8U);

    if (fp8_interpretation == __NV_E5M2) {
        if ((ur & 0x7FFFU) > 0x7C00U) {
            /* If NaN, return canonical NaN */
            ur = 0x7FFFU;
        }
    } else { // __NV_E4M3
        unsigned short int sign = ur & 0x8000U;
        unsigned short int exponent =
            (unsigned short int)(((ur & 0x7800U) >> 1U) + 0x2000U);
        unsigned short int mantissa = (ur & 0x0700U) >> 1U;
        unsigned char absx = 0x7FU & (unsigned char)x;

        if (absx == 0x7FU) // NaN
        {
            ur = 0x7FFFU; // fp16 canonical NaN, discard sign
        } else if (exponent == 0x2000U) {
            // zero or denormal
            if (mantissa != 0U) {
                // normalize
                mantissa = (unsigned short int)(mantissa << 1U);
                while ((mantissa & 0x0400U) == 0U) {
                    mantissa = (unsigned short int)(mantissa << 1U);
                    exponent = (unsigned short int)(exponent - 0x0400U);
                }
                // discard implicit leading bit
                mantissa &= 0x03FFU;
            } else { // Zero
                exponent = 0U;
            }

            ur = (sign | exponent) | mantissa;
        } else {
            ur = (sign | exponent) | mantissa;
        }
    }
    res.x = ur;
#endif
    return res;
}
|
| 512 |
+
|
| 513 |
+
/* Convert a packed fp8x2 pair to a raw fp16x2 pair. SM 9.0+ devices use one
 * PTX cvt; the fallback expands each byte with the scalar converter (low
 * byte -> res.x, high byte -> res.y). */
__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
                           const __nv_fp8_interpretation_t fp8_interpretation) {
    __half2_raw res;
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    unsigned int half2_storage;
    if (fp8_interpretation == __NV_E5M2) {
        asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
    } else {
        asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
    }
    (void)memcpy(&res, &half2_storage, sizeof(half2_storage));
#else
    res.x =
        __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)x, fp8_interpretation).x;
    res.y = __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)(x >> 8U),
                                    fp8_interpretation)
                .x;
#endif
    return res;
}
|
| 534 |
+
|
| 535 |
+
/* All other definitions in this file are only visible to C++ compilers */
|
| 536 |
+
#if defined(__cplusplus)
|
| 537 |
+
|
| 538 |
+
/**
|
| 539 |
+
* \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind.
|
| 540 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 541 |
+
*/
|
| 542 |
+
|
| 543 |
+
/**
|
| 544 |
+
* \ingroup CUDA_MATH_FP8_E5M2_STRUCT
|
| 545 |
+
* \brief __nv_fp8_e5m2 datatype
|
| 546 |
+
*
|
| 547 |
+
* \details This structure implements the datatype for handling
|
| 548 |
+
* \p fp8 floating-point numbers of \p e5m2 kind:
|
| 549 |
+
* with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
|
| 550 |
+
*
|
| 551 |
+
* The structure implements converting constructors and operators.
|
| 552 |
+
*/
|
| 553 |
+
struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 {
  public:
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Storage variable contains the \p fp8 floating-point data.
     */
    __nv_fp8_storage_t __x;

    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor by default.
     */
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
    __nv_fp8_e5m2() = default;
#else
    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {}
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */

#if !defined(__CUDA_NO_FP8_CONVERSIONS__)

    /* Construct from wider FP types. Note: constructor init-lists are avoided
     * because of special host/device compilation rules. */

    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
     * behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) {
        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
                                      __NV_SATFINITE, __NV_E5M2);
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
     * behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) {
        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
                                          __NV_SATFINITE, __NV_E5M2);
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p float data type, relies on \p __NV_SATFINITE
     * behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) {
        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p double data type, relies on \p __NV_SATFINITE
     * behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) {
        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
    }

    /* Converts from integral types: each goes through float first, then the
     * satfinite float->fp8 conversion. */

    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p unsigned \p short \p int data type, relies on \p
     * __NV_SATFINITE behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__
    __nv_fp8_e5m2(const unsigned short int val) {
        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p unsigned \p int data type, relies on \p
     * __NV_SATFINITE behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) {
        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p unsigned \p long \p long \p int data type, relies
     * on \p __NV_SATFINITE behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__
    __nv_fp8_e5m2(const unsigned long long int val) {
        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
    }

    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p short \p int data type.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) {
        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p int data type, relies on \p __NV_SATFINITE
     * behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) {
        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Constructor from \p long \p long \p int data type, relies on \p
     * __NV_SATFINITE behavior for out-of-range values.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) {
        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
    }

#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
    /* Widening FP converts */
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p __half data type.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p float data type.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
        return __internal_halfraw_to_float(
            __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p __nv_bfloat16 data type.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
        return static_cast<__nv_bfloat16>(
            __internal_float_to_bf16raw_rz(float(*this)));
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p double data type.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
        return static_cast<double>(float(*this));
    }

    /* Convert to integral */

    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p unsigned \p char data type.
     * Clamps negative and too large inputs to the output range.
     * \p NaN inputs convert to \p zero.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
        unsigned char i;
        const float f = float(*this);
        const unsigned char max_val = 0xFFU;
        const unsigned char min_val = 0U;
        const unsigned char bits = (*this).__x;
        // saturation fixup: exponent field all-ones with nonzero mantissa
        if ((bits & 0x7FU) > 0x7CU) {
            // NaN
            i = 0;
        } else if (f > static_cast<float>(max_val)) {
            // saturate maximum
            i = max_val;
        } else if (f < static_cast<float>(min_val)) {
            // saturate minimum
            i = min_val;
        } else {
            // normal value
            i = static_cast<unsigned char>(f);
        }
        return i;
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p unsigned \p short \p int data type.
     * Clamps negative and too large inputs to the output range.
     * \p NaN inputs convert to \p zero.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
        return __half2ushort_rz(__half(*this));
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p unsigned \p int data type.
     * Clamps negative and too large inputs to the output range.
     * \p NaN inputs convert to \p zero.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
        return __half2uint_rz(__half(*this));
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p unsigned \p long \p long \p int data type.
     * Clamps negative and too large inputs to the output range.
     * \p NaN inputs convert to \p 0x8000000000000000ULL.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
        return __half2ull_rz(__half(*this));
    }

    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p signed \p char data type.
     * Clamps too large inputs to the output range.
     * \p NaN inputs convert to \p zero.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
        signed char i;
        const float f = float(*this);
        const signed char max_val = (signed char)0x7FU;
        const signed char min_val = (signed char)0x80U;
        const unsigned char bits = (*this).__x;
        // saturation fixup: exponent field all-ones with nonzero mantissa
        if ((bits & 0x7FU) > 0x7CU) {
            // NaN
            i = 0;
        } else if (f > static_cast<float>(max_val)) {
            // saturate maximum
            i = max_val;
        } else if (f < static_cast<float>(min_val)) {
            // saturate minimum
            i = min_val;
        } else {
            // normal value
            i = static_cast<signed char>(f);
        }
        return i;
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p short \p int data type.
     * Clamps too large inputs to the output range.
     * \p NaN inputs convert to \p zero.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
        return __half2short_rz(__half(*this));
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p int data type.
     * Clamps too large inputs to the output range.
     * \p NaN inputs convert to \p zero.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
        return __half2int_rz(__half(*this));
    }
    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p long \p long \p int data type.
     * Clamps too large inputs to the output range.
     * \p NaN inputs convert to \p 0x8000000000000000LL.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
        return __half2ll_rz(__half(*this));
    }

    /**
     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
     * Conversion operator to \p bool data type.
     * +0 and -0 inputs convert to \p false.
     * Non-zero inputs convert to \p true.
     */
    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
        return (__x & 0x7FU) != 0U;
    }
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
};
|
| 824 |
+
|
| 825 |
+
/**
|
| 826 |
+
* \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind.
|
| 827 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 828 |
+
*/
|
| 829 |
+
|
| 830 |
+
/**
|
| 831 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 832 |
+
* \brief __nv_fp8x2_e5m2 datatype
|
| 833 |
+
*
|
| 834 |
+
* \details This structure implements the datatype for handling two
|
| 835 |
+
* \p fp8 floating-point numbers of \p e5m2 kind each:
|
| 836 |
+
* with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
|
| 837 |
+
*
|
| 838 |
+
* The structure implements converting constructors and operators.
|
| 839 |
+
*/
|
| 840 |
+
struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 {
|
| 841 |
+
public:
|
| 842 |
+
/**
|
| 843 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 844 |
+
* Storage variable contains the vector of two \p fp8 floating-point data
|
| 845 |
+
* values.
|
| 846 |
+
*/
|
| 847 |
+
__nv_fp8x2_storage_t __x;
|
| 848 |
+
|
| 849 |
+
/**
|
| 850 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 851 |
+
* Constructor by default.
|
| 852 |
+
*/
|
| 853 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 854 |
+
__nv_fp8x2_e5m2() = default;
|
| 855 |
+
#else
|
| 856 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {}
|
| 857 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 858 |
+
|
| 859 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 860 |
+
|
| 861 |
+
/* Construct from wider types */
|
| 862 |
+
|
| 863 |
+
/**
|
| 864 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 865 |
+
* Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
|
| 866 |
+
* behavior for out-of-range values.
|
| 867 |
+
*/
|
| 868 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) {
|
| 869 |
+
__x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
|
| 870 |
+
__NV_SATFINITE, __NV_E5M2);
|
| 871 |
+
}
|
| 872 |
+
/**
|
| 873 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 874 |
+
* Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
|
| 875 |
+
* behavior for out-of-range values.
|
| 876 |
+
*/
|
| 877 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) {
|
| 878 |
+
__x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
|
| 879 |
+
__NV_SATFINITE, __NV_E5M2);
|
| 880 |
+
}
|
| 881 |
+
/**
|
| 882 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 883 |
+
* Constructor from \p float2 data type, relies on \p __NV_SATFINITE
|
| 884 |
+
* behavior for out-of-range values.
|
| 885 |
+
*/
|
| 886 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) {
|
| 887 |
+
__x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
|
| 888 |
+
}
|
| 889 |
+
/**
|
| 890 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 891 |
+
* Constructor from \p double2 data type, relies on \p __NV_SATFINITE
|
| 892 |
+
* behavior for out-of-range values.
|
| 893 |
+
*/
|
| 894 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) {
|
| 895 |
+
__x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
|
| 896 |
+
}
|
| 897 |
+
|
| 898 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 899 |
+
/* Widening converts */
|
| 900 |
+
/**
|
| 901 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 902 |
+
* Conversion operator to \p __half2 data type.
|
| 903 |
+
*/
|
| 904 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
|
| 905 |
+
return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
|
| 906 |
+
}
|
| 907 |
+
/**
|
| 908 |
+
* \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
|
| 909 |
+
* Conversion operator to \p float2 data type.
|
| 910 |
+
*/
|
| 911 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
|
| 912 |
+
return __internal_halfraw2_to_float2(
|
| 913 |
+
__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
|
| 914 |
+
}
|
| 915 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 916 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 917 |
+
};
|
| 918 |
+
|
| 919 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int
|
| 920 |
+
__internal_pack_u16x2_to_u32(const unsigned short int src_lo,
|
| 921 |
+
const unsigned short int src_hi) {
|
| 922 |
+
unsigned int dst;
|
| 923 |
+
#if (defined __CUDACC__) && (defined __CUDA_ARCH__)
|
| 924 |
+
asm("{ mov.b32 %0, {%1,%2};}\n" : "=r"(dst) : "h"(src_lo), "h"(src_hi));
|
| 925 |
+
#else
|
| 926 |
+
dst = (static_cast<unsigned int>(src_hi) << 16U) |
|
| 927 |
+
static_cast<unsigned int>(src_lo);
|
| 928 |
+
#endif
|
| 929 |
+
return dst;
|
| 930 |
+
}
|
| 931 |
+
|
| 932 |
+
/**
|
| 933 |
+
* \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind.
|
| 934 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 935 |
+
*/
|
| 936 |
+
|
| 937 |
+
/**
|
| 938 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 939 |
+
* \brief __nv_fp8x4_e5m2 datatype
|
| 940 |
+
*
|
| 941 |
+
* \details This structure implements the datatype for handling four
|
| 942 |
+
* \p fp8 floating-point numbers of \p e5m2 kind each:
|
| 943 |
+
* with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
|
| 944 |
+
*
|
| 945 |
+
* The structure implements converting constructors and operators.
|
| 946 |
+
*/
|
| 947 |
+
struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 {
|
| 948 |
+
public:
|
| 949 |
+
/**
|
| 950 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 951 |
+
* Storage variable contains the vector of four \p fp8 floating-point data
|
| 952 |
+
* values.
|
| 953 |
+
*/
|
| 954 |
+
__nv_fp8x4_storage_t __x;
|
| 955 |
+
|
| 956 |
+
/**
|
| 957 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 958 |
+
* Constructor by default.
|
| 959 |
+
*/
|
| 960 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 961 |
+
__nv_fp8x4_e5m2() = default;
|
| 962 |
+
#else
|
| 963 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {}
|
| 964 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 965 |
+
|
| 966 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 967 |
+
|
| 968 |
+
/* Construct from wider types */
|
| 969 |
+
|
| 970 |
+
/**
|
| 971 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 972 |
+
* Constructor from a pair of \p __half2 data type values,
|
| 973 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 974 |
+
*/
|
| 975 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo,
|
| 976 |
+
const __half2 fhi) {
|
| 977 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
|
| 978 |
+
static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E5M2);
|
| 979 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
|
| 980 |
+
static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
|
| 981 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 982 |
+
}
|
| 983 |
+
/**
|
| 984 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 985 |
+
* Constructor from a pair of \p __nv_bfloat162 data type values,
|
| 986 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 987 |
+
*/
|
| 988 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo,
|
| 989 |
+
const __nv_bfloat162 fhi) {
|
| 990 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 991 |
+
static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E5M2);
|
| 992 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 993 |
+
static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
|
| 994 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 995 |
+
}
|
| 996 |
+
/**
|
| 997 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 998 |
+
* Constructor from \p float4 vector data type,
|
| 999 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1000 |
+
*/
|
| 1001 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) {
|
| 1002 |
+
const float2 flo = {f.x, f.y};
|
| 1003 |
+
const float2 fhi = {f.z, f.w};
|
| 1004 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1005 |
+
__nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
|
| 1006 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1007 |
+
__nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
|
| 1008 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1009 |
+
}
|
| 1010 |
+
/**
|
| 1011 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1012 |
+
* Constructor from \p double4 vector data type,
|
| 1013 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1014 |
+
*/
|
| 1015 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) {
|
| 1016 |
+
const double2 flo = {f.x, f.y};
|
| 1017 |
+
const double2 fhi = {f.z, f.w};
|
| 1018 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1019 |
+
__nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
|
| 1020 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1021 |
+
__nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
|
| 1022 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1023 |
+
}
|
| 1024 |
+
|
| 1025 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1026 |
+
/* Widening converts */
|
| 1027 |
+
|
| 1028 |
+
/**
|
| 1029 |
+
* \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
|
| 1030 |
+
* Conversion operator to \p float4 vector data type.
|
| 1031 |
+
*/
|
| 1032 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
|
| 1033 |
+
const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
|
| 1034 |
+
const __nv_fp8x2_storage_t shi =
|
| 1035 |
+
static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
|
| 1036 |
+
float2 rlo = __internal_halfraw2_to_float2(
|
| 1037 |
+
__nv_cvt_fp8x2_to_halfraw2(slo, __NV_E5M2));
|
| 1038 |
+
float2 rhi = __internal_halfraw2_to_float2(
|
| 1039 |
+
__nv_cvt_fp8x2_to_halfraw2(shi, __NV_E5M2));
|
| 1040 |
+
float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
|
| 1041 |
+
return res;
|
| 1042 |
+
}
|
| 1043 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1044 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1045 |
+
};
|
| 1046 |
+
|
| 1047 |
+
/**
|
| 1048 |
+
* \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind.
|
| 1049 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 1050 |
+
*/
|
| 1051 |
+
|
| 1052 |
+
/**
|
| 1053 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1054 |
+
* \brief __nv_fp8_e4m3 datatype
|
| 1055 |
+
*
|
| 1056 |
+
* \details This structure implements the datatype for storing
|
| 1057 |
+
* \p fp8 floating-point numbers of \p e4m3 kind:
|
| 1058 |
+
* with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
|
| 1059 |
+
* The encoding doesn't support Infinity.
|
| 1060 |
+
* NaNs are limited to 0x7F and 0xFF values.
|
| 1061 |
+
*
|
| 1062 |
+
* The structure implements converting constructors and operators.
|
| 1063 |
+
*/
|
| 1064 |
+
struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 {
|
| 1065 |
+
public:
|
| 1066 |
+
/**
|
| 1067 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1068 |
+
* Storage variable contains the \p fp8 floating-point data.
|
| 1069 |
+
*/
|
| 1070 |
+
__nv_fp8_storage_t __x;
|
| 1071 |
+
|
| 1072 |
+
/**
|
| 1073 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1074 |
+
* Constructor by default.
|
| 1075 |
+
*/
|
| 1076 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 1077 |
+
__nv_fp8_e4m3() = default;
|
| 1078 |
+
#else
|
| 1079 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {}
|
| 1080 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 1081 |
+
|
| 1082 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 1083 |
+
|
| 1084 |
+
/* Construct from wider FP types */
|
| 1085 |
+
/* Note we do avoid constructor init-list because of special host/device
|
| 1086 |
+
* compilation rules */
|
| 1087 |
+
|
| 1088 |
+
/**
|
| 1089 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1090 |
+
* Constructor from \p __half data type, relies on \p __NV_SATFINITE
|
| 1091 |
+
* behavior for out-of-range values.
|
| 1092 |
+
*/
|
| 1093 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) {
|
| 1094 |
+
__x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
|
| 1095 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1096 |
+
}
|
| 1097 |
+
/**
|
| 1098 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1099 |
+
* Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
|
| 1100 |
+
* behavior for out-of-range values.
|
| 1101 |
+
*/
|
| 1102 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) {
|
| 1103 |
+
__x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
|
| 1104 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1105 |
+
}
|
| 1106 |
+
/**
|
| 1107 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1108 |
+
* Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
|
| 1109 |
+
* for out-of-range values.
|
| 1110 |
+
*/
|
| 1111 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) {
|
| 1112 |
+
__x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
|
| 1113 |
+
}
|
| 1114 |
+
/**
|
| 1115 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1116 |
+
* Constructor from \p double data type, relies on \p __NV_SATFINITE
|
| 1117 |
+
* behavior for out-of-range values.
|
| 1118 |
+
*/
|
| 1119 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) {
|
| 1120 |
+
__x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
|
| 1121 |
+
}
|
| 1122 |
+
|
| 1123 |
+
/* Converts from integral */
|
| 1124 |
+
|
| 1125 |
+
/**
|
| 1126 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1127 |
+
* Constructor from \p unsigned \p short \p int data type, relies on \p
|
| 1128 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1129 |
+
*/
|
| 1130 |
+
explicit __CUDA_HOSTDEVICE_FP8__
|
| 1131 |
+
__nv_fp8_e4m3(const unsigned short int val) {
|
| 1132 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1133 |
+
}
|
| 1134 |
+
/**
|
| 1135 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1136 |
+
* Constructor from \p unsigned \p int data type, relies on \p
|
| 1137 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1138 |
+
*/
|
| 1139 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) {
|
| 1140 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1141 |
+
}
|
| 1142 |
+
/**
|
| 1143 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1144 |
+
* Constructor from \p unsigned \p long \p long \p int data type, relies on
|
| 1145 |
+
* \p __NV_SATFINITE behavior for out-of-range values.
|
| 1146 |
+
*/
|
| 1147 |
+
explicit __CUDA_HOSTDEVICE_FP8__
|
| 1148 |
+
__nv_fp8_e4m3(const unsigned long long int val) {
|
| 1149 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1150 |
+
}
|
| 1151 |
+
|
| 1152 |
+
/**
|
| 1153 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1154 |
+
* Constructor from \p short \p int data type, relies on \p
|
| 1155 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1156 |
+
*/
|
| 1157 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) {
|
| 1158 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1159 |
+
}
|
| 1160 |
+
/**
|
| 1161 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1162 |
+
* Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
|
| 1163 |
+
* for out-of-range values.
|
| 1164 |
+
*/
|
| 1165 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) {
|
| 1166 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1167 |
+
}
|
| 1168 |
+
/**
|
| 1169 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1170 |
+
* Constructor from \p long \p long \p int data type, relies on \p
|
| 1171 |
+
* __NV_SATFINITE behavior for out-of-range values.
|
| 1172 |
+
*/
|
| 1173 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) {
|
| 1174 |
+
__x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
|
| 1175 |
+
}
|
| 1176 |
+
|
| 1177 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1178 |
+
/* Widening FP converts */
|
| 1179 |
+
/**
|
| 1180 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1181 |
+
* Conversion operator to \p __half data type.
|
| 1182 |
+
*/
|
| 1183 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
|
| 1184 |
+
return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
|
| 1185 |
+
}
|
| 1186 |
+
/**
|
| 1187 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1188 |
+
* Conversion operator to \p float data type.
|
| 1189 |
+
*/
|
| 1190 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
|
| 1191 |
+
return __internal_halfraw_to_float(
|
| 1192 |
+
__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
|
| 1193 |
+
}
|
| 1194 |
+
/**
|
| 1195 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1196 |
+
* Conversion operator to \p __nv_bfloat16 data type.
|
| 1197 |
+
*/
|
| 1198 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
|
| 1199 |
+
return static_cast<__nv_bfloat16>(
|
| 1200 |
+
__internal_float_to_bf16raw_rz(float(*this)));
|
| 1201 |
+
}
|
| 1202 |
+
/**
|
| 1203 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1204 |
+
* Conversion operator to \p double data type.
|
| 1205 |
+
*/
|
| 1206 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
|
| 1207 |
+
return static_cast<double>(float(*this));
|
| 1208 |
+
}
|
| 1209 |
+
|
| 1210 |
+
/* Convert to integral */
|
| 1211 |
+
|
| 1212 |
+
/**
|
| 1213 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1214 |
+
* Conversion operator to \p unsigned \p char data type.
|
| 1215 |
+
* Clamps negative and too large inputs to the output range.
|
| 1216 |
+
* \p NaN inputs convert to \p zero.
|
| 1217 |
+
*/
|
| 1218 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
|
| 1219 |
+
unsigned char i;
|
| 1220 |
+
const float f = float(*this);
|
| 1221 |
+
const unsigned char max_val = 0xFFU;
|
| 1222 |
+
const unsigned char min_val = 0U;
|
| 1223 |
+
const unsigned char bits = (*this).__x;
|
| 1224 |
+
// saturation fixup
|
| 1225 |
+
if ((bits & 0x7FU) == 0x7FU) {
|
| 1226 |
+
// NaN
|
| 1227 |
+
i = 0;
|
| 1228 |
+
} else if (f > static_cast<float>(max_val)) {
|
| 1229 |
+
// saturate maximum
|
| 1230 |
+
i = max_val;
|
| 1231 |
+
} else if (f < static_cast<float>(min_val)) {
|
| 1232 |
+
// saturate minimum
|
| 1233 |
+
i = min_val;
|
| 1234 |
+
} else {
|
| 1235 |
+
// normal value
|
| 1236 |
+
i = static_cast<unsigned char>(f);
|
| 1237 |
+
}
|
| 1238 |
+
return i;
|
| 1239 |
+
}
|
| 1240 |
+
|
| 1241 |
+
/**
|
| 1242 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1243 |
+
* Conversion operator to \p unsigned \p short \p int data type.
|
| 1244 |
+
* Clamps negative inputs to zero.
|
| 1245 |
+
* \p NaN inputs convert to \p zero.
|
| 1246 |
+
*/
|
| 1247 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
|
| 1248 |
+
return __half2ushort_rz(__half(*this));
|
| 1249 |
+
}
|
| 1250 |
+
/**
|
| 1251 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1252 |
+
* Conversion operator to \p unsigned \p int data type.
|
| 1253 |
+
* Clamps negative inputs to zero.
|
| 1254 |
+
* \p NaN inputs convert to \p zero.
|
| 1255 |
+
*/
|
| 1256 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
|
| 1257 |
+
return __half2uint_rz(__half(*this));
|
| 1258 |
+
}
|
| 1259 |
+
/**
|
| 1260 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1261 |
+
* Conversion operator to \p unsigned \p long \p long \p int data type.
|
| 1262 |
+
* Clamps negative inputs to zero.
|
| 1263 |
+
* \p NaN inputs convert to \p 0x8000000000000000ULL.
|
| 1264 |
+
*/
|
| 1265 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
|
| 1266 |
+
return __half2ull_rz(__half(*this));
|
| 1267 |
+
}
|
| 1268 |
+
|
| 1269 |
+
/**
|
| 1270 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1271 |
+
* Conversion operator to \p signed \p char data type.
|
| 1272 |
+
* Clamps too large inputs to the output range.
|
| 1273 |
+
* \p NaN inputs convert to \p zero.
|
| 1274 |
+
*/
|
| 1275 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
|
| 1276 |
+
signed char i;
|
| 1277 |
+
const float f = float(*this);
|
| 1278 |
+
const signed char max_val = (signed char)0x7FU;
|
| 1279 |
+
const signed char min_val = (signed char)0x80U;
|
| 1280 |
+
const unsigned char bits = (*this).__x;
|
| 1281 |
+
// saturation fixup
|
| 1282 |
+
if ((bits & 0x7FU) == 0x7FU) {
|
| 1283 |
+
// NaN
|
| 1284 |
+
i = 0;
|
| 1285 |
+
} else if (f > static_cast<float>(max_val)) {
|
| 1286 |
+
// saturate maximum
|
| 1287 |
+
i = max_val;
|
| 1288 |
+
} else if (f < static_cast<float>(min_val)) {
|
| 1289 |
+
// saturate minimum
|
| 1290 |
+
i = min_val;
|
| 1291 |
+
} else {
|
| 1292 |
+
// normal value
|
| 1293 |
+
i = static_cast<signed char>(f);
|
| 1294 |
+
}
|
| 1295 |
+
return i;
|
| 1296 |
+
}
|
| 1297 |
+
/**
|
| 1298 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1299 |
+
* Conversion operator to \p short \p int data type.
|
| 1300 |
+
* \p NaN inputs convert to \p zero.
|
| 1301 |
+
*/
|
| 1302 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
|
| 1303 |
+
return __half2short_rz(__half(*this));
|
| 1304 |
+
}
|
| 1305 |
+
/**
|
| 1306 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1307 |
+
* Conversion operator to \p int data type.
|
| 1308 |
+
* \p NaN inputs convert to \p zero.
|
| 1309 |
+
*/
|
| 1310 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
|
| 1311 |
+
return __half2int_rz(__half(*this));
|
| 1312 |
+
}
|
| 1313 |
+
/**
|
| 1314 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1315 |
+
* Conversion operator to \p long \p long \p int data type.
|
| 1316 |
+
* \p NaN inputs convert to \p 0x8000000000000000LL.
|
| 1317 |
+
*/
|
| 1318 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
|
| 1319 |
+
return __half2ll_rz(__half(*this));
|
| 1320 |
+
}
|
| 1321 |
+
|
| 1322 |
+
/**
|
| 1323 |
+
* \ingroup CUDA_MATH_FP8_E4M3_STRUCT
|
| 1324 |
+
* Conversion operator to \p bool data type.
|
| 1325 |
+
* +0 and -0 inputs convert to \p false.
|
| 1326 |
+
* Non-zero inputs convert to \p true.
|
| 1327 |
+
*/
|
| 1328 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
|
| 1329 |
+
return (__x & 0x7FU) != 0U;
|
| 1330 |
+
}
|
| 1331 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1332 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1333 |
+
};
|
| 1334 |
+
|
| 1335 |
+
/**
|
| 1336 |
+
* \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind.
|
| 1337 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 1338 |
+
*/
|
| 1339 |
+
|
| 1340 |
+
/**
|
| 1341 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1342 |
+
* \brief __nv_fp8x2_e4m3 datatype
|
| 1343 |
+
*
|
| 1344 |
+
* \details This structure implements the datatype for storage
|
| 1345 |
+
* and operations on the vector of two \p fp8 values of \p e4m3 kind each:
|
| 1346 |
+
* with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
|
| 1347 |
+
* The encoding doesn't support Infinity.
|
| 1348 |
+
* NaNs are limited to 0x7F and 0xFF values.
|
| 1349 |
+
*/
|
| 1350 |
+
struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 {
|
| 1351 |
+
public:
|
| 1352 |
+
/**
|
| 1353 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1354 |
+
* Storage variable contains the vector of two \p fp8 floating-point data
|
| 1355 |
+
* values.
|
| 1356 |
+
*/
|
| 1357 |
+
__nv_fp8x2_storage_t __x;
|
| 1358 |
+
|
| 1359 |
+
/**
|
| 1360 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1361 |
+
* Constructor by default.
|
| 1362 |
+
*/
|
| 1363 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 1364 |
+
__nv_fp8x2_e4m3() = default;
|
| 1365 |
+
#else
|
| 1366 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {}
|
| 1367 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 1368 |
+
|
| 1369 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 1370 |
+
|
| 1371 |
+
/* Construct from wider types */
|
| 1372 |
+
|
| 1373 |
+
/**
|
| 1374 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1375 |
+
* Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
|
| 1376 |
+
* behavior for out-of-range values.
|
| 1377 |
+
*/
|
| 1378 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) {
|
| 1379 |
+
__x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
|
| 1380 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1381 |
+
}
|
| 1382 |
+
/**
|
| 1383 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1384 |
+
* Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
|
| 1385 |
+
* behavior for out-of-range values.
|
| 1386 |
+
*/
|
| 1387 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) {
|
| 1388 |
+
__x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
|
| 1389 |
+
__NV_SATFINITE, __NV_E4M3);
|
| 1390 |
+
}
|
| 1391 |
+
/**
|
| 1392 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1393 |
+
* Constructor from \p float2 data type, relies on \p __NV_SATFINITE
|
| 1394 |
+
* behavior for out-of-range values.
|
| 1395 |
+
*/
|
| 1396 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) {
|
| 1397 |
+
__x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
|
| 1398 |
+
}
|
| 1399 |
+
/**
|
| 1400 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1401 |
+
* Constructor from \p double2 data type, relies on \p __NV_SATFINITE
|
| 1402 |
+
* behavior for out-of-range values.
|
| 1403 |
+
*/
|
| 1404 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) {
|
| 1405 |
+
__x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
|
| 1406 |
+
}
|
| 1407 |
+
|
| 1408 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1409 |
+
/* Widening converts */
|
| 1410 |
+
/**
|
| 1411 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1412 |
+
* Conversion operator to \p __half2 data type.
|
| 1413 |
+
*/
|
| 1414 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
|
| 1415 |
+
return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
|
| 1416 |
+
}
|
| 1417 |
+
/**
|
| 1418 |
+
* \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
|
| 1419 |
+
* Conversion operator to \p float2 data type.
|
| 1420 |
+
*/
|
| 1421 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
|
| 1422 |
+
return __internal_halfraw2_to_float2(
|
| 1423 |
+
__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
|
| 1424 |
+
}
|
| 1425 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1426 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1427 |
+
};
|
| 1428 |
+
|
| 1429 |
+
/**
|
| 1430 |
+
* \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind.
|
| 1431 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 1432 |
+
*/
|
| 1433 |
+
|
| 1434 |
+
/**
|
| 1435 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1436 |
+
* \brief __nv_fp8x4_e4m3 datatype
|
| 1437 |
+
*
|
| 1438 |
+
* \details This structure implements the datatype for storage
|
| 1439 |
+
* and operations on the vector of four \p fp8 values of \p e4m3 kind each:
|
| 1440 |
+
* with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
|
| 1441 |
+
* The encoding doesn't support Infinity.
|
| 1442 |
+
* NaNs are limited to 0x7F and 0xFF values.
|
| 1443 |
+
*/
|
| 1444 |
+
struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 {
|
| 1445 |
+
public:
|
| 1446 |
+
/**
|
| 1447 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1448 |
+
* Storage variable contains the vector of four \p fp8 floating-point data
|
| 1449 |
+
* values.
|
| 1450 |
+
*/
|
| 1451 |
+
__nv_fp8x4_storage_t __x;
|
| 1452 |
+
|
| 1453 |
+
/**
|
| 1454 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1455 |
+
* Constructor by default.
|
| 1456 |
+
*/
|
| 1457 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 1458 |
+
__nv_fp8x4_e4m3() = default;
|
| 1459 |
+
#else
|
| 1460 |
+
__CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {}
|
| 1461 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 1462 |
+
|
| 1463 |
+
#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
|
| 1464 |
+
|
| 1465 |
+
/* Construct from wider types */
|
| 1466 |
+
|
| 1467 |
+
/**
|
| 1468 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1469 |
+
* Constructor from a pair of \p __half2 data type values,
|
| 1470 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1471 |
+
*/
|
| 1472 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo,
|
| 1473 |
+
const __half2 fhi) {
|
| 1474 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
|
| 1475 |
+
static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E4M3);
|
| 1476 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
|
| 1477 |
+
static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
|
| 1478 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1479 |
+
}
|
| 1480 |
+
/**
|
| 1481 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1482 |
+
* Constructor from a pair of \p __nv_bfloat162 data type values,
|
| 1483 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1484 |
+
*/
|
| 1485 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo,
|
| 1486 |
+
const __nv_bfloat162 fhi) {
|
| 1487 |
+
const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 1488 |
+
static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E4M3);
|
| 1489 |
+
const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
|
| 1490 |
+
static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
|
| 1491 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1492 |
+
}
|
| 1493 |
+
/**
|
| 1494 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1495 |
+
* Constructor from \p float4 vector data type,
|
| 1496 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1497 |
+
*/
|
| 1498 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) {
|
| 1499 |
+
const float2 flo = {f.x, f.y};
|
| 1500 |
+
const float2 fhi = {f.z, f.w};
|
| 1501 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1502 |
+
__nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
|
| 1503 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1504 |
+
__nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
|
| 1505 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1506 |
+
}
|
| 1507 |
+
/**
|
| 1508 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1509 |
+
* Constructor from \p double4 vector data type,
|
| 1510 |
+
* relies on \p __NV_SATFINITE behavior for out-of-range values.
|
| 1511 |
+
*/
|
| 1512 |
+
explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) {
|
| 1513 |
+
const double2 flo = {f.x, f.y};
|
| 1514 |
+
const double2 fhi = {f.z, f.w};
|
| 1515 |
+
const __nv_fp8x2_storage_t rlo =
|
| 1516 |
+
__nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
|
| 1517 |
+
const __nv_fp8x2_storage_t rhi =
|
| 1518 |
+
__nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
|
| 1519 |
+
__x = __internal_pack_u16x2_to_u32(rlo, rhi);
|
| 1520 |
+
}
|
| 1521 |
+
|
| 1522 |
+
#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
|
| 1523 |
+
/* Widening converts */
|
| 1524 |
+
|
| 1525 |
+
/**
|
| 1526 |
+
* \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
|
| 1527 |
+
* Conversion operator to \p float4 vector data type.
|
| 1528 |
+
*/
|
| 1529 |
+
explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
|
| 1530 |
+
const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
|
| 1531 |
+
const __nv_fp8x2_storage_t shi =
|
| 1532 |
+
static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
|
| 1533 |
+
float2 rlo = __internal_halfraw2_to_float2(
|
| 1534 |
+
__nv_cvt_fp8x2_to_halfraw2(slo, __NV_E4M3));
|
| 1535 |
+
float2 rhi = __internal_halfraw2_to_float2(
|
| 1536 |
+
__nv_cvt_fp8x2_to_halfraw2(shi, __NV_E4M3));
|
| 1537 |
+
float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
|
| 1538 |
+
return res;
|
| 1539 |
+
}
|
| 1540 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
|
| 1541 |
+
#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
|
| 1542 |
+
};
|
| 1543 |
+
|
| 1544 |
+
#endif /* defined(__cplusplus) */
|
| 1545 |
+
|
| 1546 |
+
#endif /* end of include guard: __CUDA_FP8_HPP__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_PIPELINE_HELPERS_H_
|
| 51 |
+
# define _CUDA_PIPELINE_HELPERS_H_
|
| 52 |
+
|
| 53 |
+
# define _CUDA_PIPELINE_NAMESPACE nvcuda::experimental
|
| 54 |
+
# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
|
| 55 |
+
# define _CUDA_PIPELINE_END_NAMESPACE } }
|
| 56 |
+
|
| 57 |
+
# define _CUDA_PIPELINE_INTERNAL_NAMESPACE _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
|
| 58 |
+
# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
|
| 59 |
+
# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE } _CUDA_PIPELINE_END_NAMESPACE
|
| 60 |
+
|
| 61 |
+
# if !defined(_CUDA_PIPELINE_QUALIFIER)
|
| 62 |
+
# define _CUDA_PIPELINE_QUALIFIER inline __device__
|
| 63 |
+
# endif
|
| 64 |
+
# if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER)
|
| 65 |
+
# define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
|
| 66 |
+
# endif
|
| 67 |
+
|
| 68 |
+
# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
|
| 69 |
+
# define _CUDA_PIPELINE_ARCH_700_OR_LATER
|
| 70 |
+
# endif
|
| 71 |
+
|
| 72 |
+
# if (__CUDA_ARCH__ >= 800)
|
| 73 |
+
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
|
| 74 |
+
# else
|
| 75 |
+
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
|
| 76 |
+
# endif
|
| 77 |
+
|
| 78 |
+
# if !defined(_CUDA_PIPELINE_MAX_STAGES)
|
| 79 |
+
# define _CUDA_PIPELINE_MAX_STAGES 8
|
| 80 |
+
# endif
|
| 81 |
+
|
| 82 |
+
# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
|
| 83 |
+
# define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
|
| 84 |
+
# endif
|
| 85 |
+
|
| 86 |
+
# if !defined(_CUDA_PIPELINE_DEBUG)
|
| 87 |
+
# if defined(__CUDACC_DEBUG__)
|
| 88 |
+
# define _CUDA_PIPELINE_DEBUG 1
|
| 89 |
+
# else
|
| 90 |
+
# define _CUDA_PIPELINE_DEBUG 0
|
| 91 |
+
# endif
|
| 92 |
+
# endif
|
| 93 |
+
|
| 94 |
+
# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
|
| 95 |
+
# if !defined(__CUDACC_RTC__)
|
| 96 |
+
# include <cassert>
|
| 97 |
+
# endif
|
| 98 |
+
# define _CUDA_PIPELINE_ASSERT(x) assert((x));
|
| 99 |
+
# define _CUDA_PIPELINE_ABORT() assert(0);
|
| 100 |
+
# else
|
| 101 |
+
# define _CUDA_PIPELINE_ASSERT(x)
|
| 102 |
+
# define _CUDA_PIPELINE_ABORT() __trap();
|
| 103 |
+
# endif
|
| 104 |
+
|
| 105 |
+
# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
|
| 106 |
+
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
|
| 107 |
+
# else
|
| 108 |
+
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
|
| 109 |
+
# endif
|
| 110 |
+
|
| 111 |
+
# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
|
| 112 |
+
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
|
| 113 |
+
# else
|
| 114 |
+
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
|
| 115 |
+
# endif
|
| 116 |
+
|
| 117 |
+
# if defined(__CUDACC_RTC__)
|
| 118 |
+
typedef unsigned int uint32_t;
|
| 119 |
+
typedef unsigned long long uint64_t;
|
| 120 |
+
typedef uint64_t uintptr_t;
|
| 121 |
+
# else
|
| 122 |
+
# include <stdint.h>
|
| 123 |
+
# endif
|
| 124 |
+
|
| 125 |
+
_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE
|
| 126 |
+
|
| 127 |
+
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) == 2, "Size mismatch for type 'short'");
|
| 128 |
+
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int) == 4, "Size mismatch for type 'int'");
|
| 129 |
+
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2) == 8, "Size mismatch for type 'int2'");
|
| 130 |
+
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4) == 16, "Size mismatch for type 'int4'");
|
| 131 |
+
|
| 132 |
+
extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
|
| 133 |
+
|
| 134 |
+
template<size_t CopySize, size_t SourceSize>
|
| 135 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 136 |
+
void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
|
| 137 |
+
{
|
| 138 |
+
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
|
| 139 |
+
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
|
| 140 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
|
| 141 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
|
| 142 |
+
|
| 143 |
+
char* const d = reinterpret_cast<char*>(dst);
|
| 144 |
+
const char* const s = reinterpret_cast<const char*>(src);
|
| 145 |
+
|
| 146 |
+
size_t copy_step_size;
|
| 147 |
+
if (SourceSize == 0) {
|
| 148 |
+
copy_step_size = CopySize;
|
| 149 |
+
} else if (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize == 16) {
|
| 150 |
+
copy_step_size = SourceSize;
|
| 151 |
+
} else {
|
| 152 |
+
copy_step_size = 1;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
for (size_t i = 0; i < CopySize; i += copy_step_size) {
|
| 156 |
+
const bool copy_source = SourceSize && (i < SourceSize);
|
| 157 |
+
|
| 158 |
+
switch (copy_step_size) {
|
| 159 |
+
case 1:
|
| 160 |
+
d[i] = copy_source ? s[i] : char();
|
| 161 |
+
break;
|
| 162 |
+
case 2:
|
| 163 |
+
*reinterpret_cast<short*>(d + i) = copy_source ? *reinterpret_cast<const short*>(s + i) : short();
|
| 164 |
+
break;
|
| 165 |
+
case 4:
|
| 166 |
+
*reinterpret_cast<int*>(d + i) = copy_source ? *reinterpret_cast<const int*>(s + i) : int();
|
| 167 |
+
break;
|
| 168 |
+
case 8:
|
| 169 |
+
*reinterpret_cast<int2*>(d + i) = copy_source ? *reinterpret_cast<const int2*>(s + i) : int2();
|
| 170 |
+
break;
|
| 171 |
+
case 16:
|
| 172 |
+
*reinterpret_cast<int4*>(d + i) = copy_source ? *reinterpret_cast<const int4*>(s + i) : int4();
|
| 173 |
+
break;
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
template<bool UseHwAsyncCopy>
|
| 179 |
+
struct ImplementationChooser;
|
| 180 |
+
|
| 181 |
+
template<>
|
| 182 |
+
struct ImplementationChooser<true> {
|
| 183 |
+
template<size_t CopySize, size_t SourceSize>
|
| 184 |
+
struct CpAsyncChooser {
|
| 185 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 186 |
+
void cp_async(void* __restrict__ dst, const void* __restrict__ src)
|
| 187 |
+
{
|
| 188 |
+
asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
|
| 189 |
+
:
|
| 190 |
+
: "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),
|
| 191 |
+
"n"(SourceSize)
|
| 192 |
+
: "memory");
|
| 193 |
+
}
|
| 194 |
+
};
|
| 195 |
+
|
| 196 |
+
template<size_t SourceSize>
|
| 197 |
+
struct CpAsyncChooser<16, SourceSize> {
|
| 198 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 199 |
+
void cp_async(void* __restrict__ dst, const void* __restrict__ src)
|
| 200 |
+
{
|
| 201 |
+
asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
|
| 202 |
+
:
|
| 203 |
+
: "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
|
| 204 |
+
: "memory");
|
| 205 |
+
}
|
| 206 |
+
};
|
| 207 |
+
|
| 208 |
+
template<size_t CopySize, size_t SourceSize>
|
| 209 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 210 |
+
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
|
| 211 |
+
{
|
| 212 |
+
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
|
| 213 |
+
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
|
| 214 |
+
_CUDA_PIPELINE_ASSERT(__isShared(dst));
|
| 215 |
+
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
|
| 216 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
|
| 217 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
|
| 218 |
+
|
| 219 |
+
CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 223 |
+
void pipeline_commit()
|
| 224 |
+
{
|
| 225 |
+
asm volatile ("cp.async.commit_group;");
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
template<unsigned N>
|
| 229 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 230 |
+
void pipeline_wait_prior()
|
| 231 |
+
{
|
| 232 |
+
asm volatile ("cp.async.wait_group %0;"
|
| 233 |
+
:
|
| 234 |
+
: "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 238 |
+
void pipeline_arrive_on(uint64_t* barrier)
|
| 239 |
+
{
|
| 240 |
+
_CUDA_PIPELINE_ASSERT(__isShared(barrier));
|
| 241 |
+
|
| 242 |
+
asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
|
| 243 |
+
:
|
| 244 |
+
: "r"(__nvvm_get_smem_pointer(barrier)));
|
| 245 |
+
}
|
| 246 |
+
};
|
| 247 |
+
|
| 248 |
+
template<>
|
| 249 |
+
struct ImplementationChooser<false> {
|
| 250 |
+
template<size_t CopySize, size_t SourceSize>
|
| 251 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 252 |
+
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
|
| 253 |
+
{
|
| 254 |
+
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
|
| 255 |
+
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
|
| 256 |
+
_CUDA_PIPELINE_ASSERT(__isShared(dst));
|
| 257 |
+
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
|
| 258 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
|
| 259 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
|
| 260 |
+
|
| 261 |
+
pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 265 |
+
void pipeline_commit()
|
| 266 |
+
{
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
template<unsigned N>
|
| 270 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 271 |
+
void pipeline_wait_prior()
|
| 272 |
+
{
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 276 |
+
void pipeline_arrive_on(uint64_t* barrier)
|
| 277 |
+
{
|
| 278 |
+
}
|
| 279 |
+
};
|
| 280 |
+
|
| 281 |
+
template<size_t CopySize, size_t SourceSize>
|
| 282 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 283 |
+
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
|
| 284 |
+
{
|
| 285 |
+
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
|
| 286 |
+
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
|
| 287 |
+
_CUDA_PIPELINE_ASSERT(__isShared(dst));
|
| 288 |
+
_CUDA_PIPELINE_ASSERT(__isGlobal(src));
|
| 289 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
|
| 290 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
|
| 291 |
+
|
| 292 |
+
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 296 |
+
void pipeline_commit()
|
| 297 |
+
{
|
| 298 |
+
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit();
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
template<unsigned N>
|
| 302 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 303 |
+
void pipeline_wait_prior()
|
| 304 |
+
{
|
| 305 |
+
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>();
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 309 |
+
void pipeline_arrive_on(uint64_t* barrier)
|
| 310 |
+
{
|
| 311 |
+
ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier);
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
template<size_t CopySize, size_t SourceSize>
|
| 315 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 316 |
+
void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
|
| 317 |
+
{
|
| 318 |
+
_CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
|
| 319 |
+
_CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
|
| 320 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
|
| 321 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
|
| 322 |
+
|
| 323 |
+
if (__isGlobal(src) && __isShared(dst)) {
|
| 324 |
+
pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
|
| 325 |
+
} else {
|
| 326 |
+
pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
|
| 327 |
+
}
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
template<size_t CopySize, size_t Align>
|
| 331 |
+
_CUDA_PIPELINE_QUALIFIER
|
| 332 |
+
void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
|
| 333 |
+
{
|
| 334 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
|
| 335 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));
|
| 336 |
+
|
| 337 |
+
const char* s = reinterpret_cast<const char*>(src);
|
| 338 |
+
char* d = reinterpret_cast<char*>(dst);
|
| 339 |
+
size_t remaining = CopySize;
|
| 340 |
+
|
| 341 |
+
while (remaining) {
|
| 342 |
+
if ((Align >= 16) && (remaining >= 16)) {
|
| 343 |
+
pipeline_copy_strict<16, 16>(dst, src);
|
| 344 |
+
d += 16;
|
| 345 |
+
s += 16;
|
| 346 |
+
remaining -= 16;
|
| 347 |
+
} else if ((Align >= 8) && (remaining >= 8)) {
|
| 348 |
+
pipeline_copy_strict<8, 8>(dst, src);
|
| 349 |
+
d += 8;
|
| 350 |
+
s += 8;
|
| 351 |
+
remaining -= 8;
|
| 352 |
+
} else if ((Align >= 4) && (remaining >= 4)) {
|
| 353 |
+
pipeline_copy_strict<4, 4>(dst, src);
|
| 354 |
+
d += 4;
|
| 355 |
+
s += 4;
|
| 356 |
+
remaining -= 4;
|
| 357 |
+
} else if ((Align >= 2) && (remaining >= 2)) {
|
| 358 |
+
*reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
|
| 359 |
+
d += 2;
|
| 360 |
+
s += 2;
|
| 361 |
+
remaining -= 2;
|
| 362 |
+
} else {
|
| 363 |
+
*d = *s;
|
| 364 |
+
d += 1;
|
| 365 |
+
s += 1;
|
| 366 |
+
remaining -= 1;
|
| 367 |
+
}
|
| 368 |
+
}
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
_CUDA_PIPELINE_END_INTERNAL_NAMESPACE
|
| 372 |
+
|
| 373 |
+
#endif /* !_CUDA_PIPELINE_HELPERS_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
|
| 51 |
+
# define _CUDA_PIPELINE_PRIMITIVES_H_
|
| 52 |
+
|
| 53 |
+
# include "cuda_pipeline_helpers.h"
|
| 54 |
+
|
| 55 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 56 |
+
void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
|
| 57 |
+
size_t zfill = 0)
|
| 58 |
+
{
|
| 59 |
+
_CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
|
| 60 |
+
_CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
|
| 61 |
+
_CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
|
| 62 |
+
_CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
|
| 63 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
|
| 64 |
+
_CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
|
| 65 |
+
|
| 66 |
+
switch (size_and_align) {
|
| 67 |
+
case 16:
|
| 68 |
+
switch (zfill) {
|
| 69 |
+
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
|
| 70 |
+
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
|
| 71 |
+
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
|
| 72 |
+
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
|
| 73 |
+
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
|
| 74 |
+
case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
|
| 75 |
+
case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
|
| 76 |
+
case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 9>(dst_shared, src_global); return;
|
| 77 |
+
case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 8>(dst_shared, src_global); return;
|
| 78 |
+
case 9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 7>(dst_shared, src_global); return;
|
| 79 |
+
case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 6>(dst_shared, src_global); return;
|
| 80 |
+
case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 5>(dst_shared, src_global); return;
|
| 81 |
+
case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 4>(dst_shared, src_global); return;
|
| 82 |
+
case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 3>(dst_shared, src_global); return;
|
| 83 |
+
case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 2>(dst_shared, src_global); return;
|
| 84 |
+
case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 1>(dst_shared, src_global); return;
|
| 85 |
+
case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 0>(dst_shared, src_global); return;
|
| 86 |
+
default: _CUDA_PIPELINE_ABORT(); return;
|
| 87 |
+
}
|
| 88 |
+
case 8:
|
| 89 |
+
switch (zfill) {
|
| 90 |
+
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 8>(dst_shared, src_global); return;
|
| 91 |
+
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 7>(dst_shared, src_global); return;
|
| 92 |
+
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 6>(dst_shared, src_global); return;
|
| 93 |
+
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 5>(dst_shared, src_global); return;
|
| 94 |
+
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 4>(dst_shared, src_global); return;
|
| 95 |
+
case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 3>(dst_shared, src_global); return;
|
| 96 |
+
case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 2>(dst_shared, src_global); return;
|
| 97 |
+
case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 1>(dst_shared, src_global); return;
|
| 98 |
+
case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 0>(dst_shared, src_global); return;
|
| 99 |
+
default: _CUDA_PIPELINE_ABORT(); return;
|
| 100 |
+
}
|
| 101 |
+
case 4:
|
| 102 |
+
switch (zfill) {
|
| 103 |
+
case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 4>(dst_shared, src_global); return;
|
| 104 |
+
case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 3>(dst_shared, src_global); return;
|
| 105 |
+
case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 2>(dst_shared, src_global); return;
|
| 106 |
+
case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 1>(dst_shared, src_global); return;
|
| 107 |
+
case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 0>(dst_shared, src_global); return;
|
| 108 |
+
default: _CUDA_PIPELINE_ABORT(); return;
|
| 109 |
+
}
|
| 110 |
+
default:
|
| 111 |
+
_CUDA_PIPELINE_ABORT();
|
| 112 |
+
return;
|
| 113 |
+
}
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 117 |
+
void __pipeline_commit()
|
| 118 |
+
{
|
| 119 |
+
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 123 |
+
void __pipeline_wait_prior(size_t prior)
|
| 124 |
+
{
|
| 125 |
+
switch (prior) {
|
| 126 |
+
case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); return;
|
| 127 |
+
case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); return;
|
| 128 |
+
case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); return;
|
| 129 |
+
case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); return;
|
| 130 |
+
case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); return;
|
| 131 |
+
case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); return;
|
| 132 |
+
case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); return;
|
| 133 |
+
case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); return;
|
| 134 |
+
default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); return;
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
|
| 139 |
+
# include "cuda_awbarrier_primitives.h"
|
| 140 |
+
|
| 141 |
+
_CUDA_PIPELINE_STATIC_QUALIFIER
|
| 142 |
+
void __pipeline_arrive_on(__mbarrier_t* barrier)
|
| 143 |
+
{
|
| 144 |
+
_CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier);
|
| 145 |
+
}
|
| 146 |
+
# endif
|
| 147 |
+
|
| 148 |
+
#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__DEVICE_TYPES_H__)
|
| 51 |
+
#define __DEVICE_TYPES_H__
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 54 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 55 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
|
| 56 |
+
#endif
|
| 57 |
+
|
| 58 |
+
#ifndef __DOXYGEN_ONLY__
|
| 59 |
+
#include "crt/host_defines.h"
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
/*******************************************************************************
|
| 63 |
+
* *
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
*******************************************************************************/
|
| 67 |
+
|
| 68 |
+
enum __device_builtin__ cudaRoundMode
|
| 69 |
+
{
|
| 70 |
+
cudaRoundNearest,
|
| 71 |
+
cudaRoundZero,
|
| 72 |
+
cudaRoundPosInf,
|
| 73 |
+
cudaRoundMinInf
|
| 74 |
+
};
|
| 75 |
+
|
| 76 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
|
| 77 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 78 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
|
| 79 |
+
#endif
|
| 80 |
+
|
| 81 |
+
#endif /* !__DEVICE_TYPES_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/host_defines.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
|
| 65 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 52 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
|
| 53 |
+
#endif
|
| 54 |
+
|
| 55 |
+
#include "crt/mma.h"
|
| 56 |
+
|
| 57 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__)
|
| 58 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 59 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
|
| 60 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.h
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2016 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_61_INTRINSICS_H__)
|
| 51 |
+
#define __SM_61_INTRINSICS_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_61_INTRINSICS_DECL__ __device__
|
| 55 |
+
#else /* !__CUDACC_RTC__ */
|
| 56 |
+
#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#include "cuda_runtime_api.h"
|
| 70 |
+
|
| 71 |
+
#ifndef __CUDA_ARCH__
|
| 72 |
+
#define __DEF_IF_HOST { }
|
| 73 |
+
#else /* !__CUDA_ARCH__ */
|
| 74 |
+
#define __DEF_IF_HOST ;
|
| 75 |
+
#endif /* __CUDA_ARCH__ */
|
| 76 |
+
|
| 77 |
+
/*******************************************************************************
|
| 78 |
+
* *
|
| 79 |
+
* Below are declarations of SM-6.1 intrinsics which are included as *
|
| 80 |
+
* source (instead of being built in to the compiler) *
|
| 81 |
+
* *
|
| 82 |
+
*******************************************************************************/
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
/******************************************************************************
|
| 86 |
+
* __dp2a *
|
| 87 |
+
******************************************************************************/
|
| 88 |
+
// Generic [_lo]
|
| 89 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) __DEF_IF_HOST
|
| 90 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
|
| 91 |
+
// Vector-style [_lo]
|
| 92 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
|
| 93 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
|
| 94 |
+
// Generic [_hi]
|
| 95 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) __DEF_IF_HOST
|
| 96 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
|
| 97 |
+
// Vector-style [_hi]
|
| 98 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
|
| 99 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
/******************************************************************************
|
| 103 |
+
* __dp4a *
|
| 104 |
+
******************************************************************************/
|
| 105 |
+
// Generic
|
| 106 |
+
__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) __DEF_IF_HOST
|
| 107 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
|
| 108 |
+
// Vector-style
|
| 109 |
+
__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) __DEF_IF_HOST
|
| 110 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
|
| 111 |
+
|
| 112 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
|
| 113 |
+
|
| 114 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 115 |
+
|
| 116 |
+
#undef __DEF_IF_HOST
|
| 117 |
+
#undef __SM_61_INTRINSICS_DECL__
|
| 118 |
+
|
| 119 |
+
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
|
| 120 |
+
#include "sm_61_intrinsics.hpp"
|
| 121 |
+
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
|
| 122 |
+
|
| 123 |
+
#endif /* !__SM_61_INTRINSICS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__TEXTURE_TYPES_H__)
|
| 51 |
+
#define __TEXTURE_TYPES_H__
|
| 52 |
+
|
| 53 |
+
/*******************************************************************************
|
| 54 |
+
* *
|
| 55 |
+
* *
|
| 56 |
+
* *
|
| 57 |
+
*******************************************************************************/
|
| 58 |
+
|
| 59 |
+
#include "driver_types.h"
|
| 60 |
+
|
| 61 |
+
/**
|
| 62 |
+
* \addtogroup CUDART_TYPES
|
| 63 |
+
*
|
| 64 |
+
* @{
|
| 65 |
+
*/
|
| 66 |
+
|
| 67 |
+
/*******************************************************************************
|
| 68 |
+
* *
|
| 69 |
+
* *
|
| 70 |
+
* *
|
| 71 |
+
*******************************************************************************/
|
| 72 |
+
|
| 73 |
+
#define cudaTextureType1D 0x01
|
| 74 |
+
#define cudaTextureType2D 0x02
|
| 75 |
+
#define cudaTextureType3D 0x03
|
| 76 |
+
#define cudaTextureTypeCubemap 0x0C
|
| 77 |
+
#define cudaTextureType1DLayered 0xF1
|
| 78 |
+
#define cudaTextureType2DLayered 0xF2
|
| 79 |
+
#define cudaTextureTypeCubemapLayered 0xFC
|
| 80 |
+
|
| 81 |
+
/**
|
| 82 |
+
* CUDA texture address modes
|
| 83 |
+
*/
|
| 84 |
+
enum __device_builtin__ cudaTextureAddressMode
|
| 85 |
+
{
|
| 86 |
+
cudaAddressModeWrap = 0, /**< Wrapping address mode */
|
| 87 |
+
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
|
| 88 |
+
cudaAddressModeMirror = 2, /**< Mirror address mode */
|
| 89 |
+
cudaAddressModeBorder = 3 /**< Border address mode */
|
| 90 |
+
};
|
| 91 |
+
|
| 92 |
+
/**
|
| 93 |
+
* CUDA texture filter modes
|
| 94 |
+
*/
|
| 95 |
+
enum __device_builtin__ cudaTextureFilterMode
|
| 96 |
+
{
|
| 97 |
+
cudaFilterModePoint = 0, /**< Point filter mode */
|
| 98 |
+
cudaFilterModeLinear = 1 /**< Linear filter mode */
|
| 99 |
+
};
|
| 100 |
+
|
| 101 |
+
/**
|
| 102 |
+
* CUDA texture read modes
|
| 103 |
+
*/
|
| 104 |
+
enum __device_builtin__ cudaTextureReadMode
|
| 105 |
+
{
|
| 106 |
+
cudaReadModeElementType = 0, /**< Read texture as specified element type */
|
| 107 |
+
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
|
| 108 |
+
};
|
| 109 |
+
|
| 110 |
+
/**
|
| 111 |
+
* CUDA texture reference
|
| 112 |
+
*/
|
| 113 |
+
struct __device_builtin__ textureReference
|
| 114 |
+
{
|
| 115 |
+
/**
|
| 116 |
+
* Indicates whether texture reads are normalized or not
|
| 117 |
+
*/
|
| 118 |
+
int normalized;
|
| 119 |
+
/**
|
| 120 |
+
* Texture filter mode
|
| 121 |
+
*/
|
| 122 |
+
enum cudaTextureFilterMode filterMode;
|
| 123 |
+
/**
|
| 124 |
+
* Texture address mode for up to 3 dimensions
|
| 125 |
+
*/
|
| 126 |
+
enum cudaTextureAddressMode addressMode[3];
|
| 127 |
+
/**
|
| 128 |
+
* Channel descriptor for the texture reference
|
| 129 |
+
*/
|
| 130 |
+
struct cudaChannelFormatDesc channelDesc;
|
| 131 |
+
/**
|
| 132 |
+
* Perform sRGB->linear conversion during texture read
|
| 133 |
+
*/
|
| 134 |
+
int sRGB;
|
| 135 |
+
/**
|
| 136 |
+
* Limit to the anisotropy ratio
|
| 137 |
+
*/
|
| 138 |
+
unsigned int maxAnisotropy;
|
| 139 |
+
/**
|
| 140 |
+
* Mipmap filter mode
|
| 141 |
+
*/
|
| 142 |
+
enum cudaTextureFilterMode mipmapFilterMode;
|
| 143 |
+
/**
|
| 144 |
+
* Offset applied to the supplied mipmap level
|
| 145 |
+
*/
|
| 146 |
+
float mipmapLevelBias;
|
| 147 |
+
/**
|
| 148 |
+
* Lower end of the mipmap level range to clamp access to
|
| 149 |
+
*/
|
| 150 |
+
float minMipmapLevelClamp;
|
| 151 |
+
/**
|
| 152 |
+
* Upper end of the mipmap level range to clamp access to
|
| 153 |
+
*/
|
| 154 |
+
float maxMipmapLevelClamp;
|
| 155 |
+
/**
|
| 156 |
+
* Disable any trilinear filtering optimizations.
|
| 157 |
+
*/
|
| 158 |
+
int disableTrilinearOptimization;
|
| 159 |
+
int __cudaReserved[14];
|
| 160 |
+
};
|
| 161 |
+
|
| 162 |
+
/**
|
| 163 |
+
* CUDA texture descriptor
|
| 164 |
+
*/
|
| 165 |
+
struct __device_builtin__ cudaTextureDesc
|
| 166 |
+
{
|
| 167 |
+
/**
|
| 168 |
+
* Texture address mode for up to 3 dimensions
|
| 169 |
+
*/
|
| 170 |
+
enum cudaTextureAddressMode addressMode[3];
|
| 171 |
+
/**
|
| 172 |
+
* Texture filter mode
|
| 173 |
+
*/
|
| 174 |
+
enum cudaTextureFilterMode filterMode;
|
| 175 |
+
/**
|
| 176 |
+
* Texture read mode
|
| 177 |
+
*/
|
| 178 |
+
enum cudaTextureReadMode readMode;
|
| 179 |
+
/**
|
| 180 |
+
* Perform sRGB->linear conversion during texture read
|
| 181 |
+
*/
|
| 182 |
+
int sRGB;
|
| 183 |
+
/**
|
| 184 |
+
* Texture Border Color
|
| 185 |
+
*/
|
| 186 |
+
float borderColor[4];
|
| 187 |
+
/**
|
| 188 |
+
* Indicates whether texture reads are normalized or not
|
| 189 |
+
*/
|
| 190 |
+
int normalizedCoords;
|
| 191 |
+
/**
|
| 192 |
+
* Limit to the anisotropy ratio
|
| 193 |
+
*/
|
| 194 |
+
unsigned int maxAnisotropy;
|
| 195 |
+
/**
|
| 196 |
+
* Mipmap filter mode
|
| 197 |
+
*/
|
| 198 |
+
enum cudaTextureFilterMode mipmapFilterMode;
|
| 199 |
+
/**
|
| 200 |
+
* Offset applied to the supplied mipmap level
|
| 201 |
+
*/
|
| 202 |
+
float mipmapLevelBias;
|
| 203 |
+
/**
|
| 204 |
+
* Lower end of the mipmap level range to clamp access to
|
| 205 |
+
*/
|
| 206 |
+
float minMipmapLevelClamp;
|
| 207 |
+
/**
|
| 208 |
+
* Upper end of the mipmap level range to clamp access to
|
| 209 |
+
*/
|
| 210 |
+
float maxMipmapLevelClamp;
|
| 211 |
+
/**
|
| 212 |
+
* Disable any trilinear filtering optimizations.
|
| 213 |
+
*/
|
| 214 |
+
int disableTrilinearOptimization;
|
| 215 |
+
};
|
| 216 |
+
|
| 217 |
+
struct __device_builtin__ cudaTextureDesc_v2
|
| 218 |
+
{
|
| 219 |
+
/**
|
| 220 |
+
* Texture address mode for up to 3 dimensions
|
| 221 |
+
*/
|
| 222 |
+
enum cudaTextureAddressMode addressMode[3];
|
| 223 |
+
/**
|
| 224 |
+
* Texture filter mode
|
| 225 |
+
*/
|
| 226 |
+
enum cudaTextureFilterMode filterMode;
|
| 227 |
+
/**
|
| 228 |
+
* Texture read mode
|
| 229 |
+
*/
|
| 230 |
+
enum cudaTextureReadMode readMode;
|
| 231 |
+
/**
|
| 232 |
+
* Perform sRGB->linear conversion during texture read
|
| 233 |
+
*/
|
| 234 |
+
int sRGB;
|
| 235 |
+
/**
|
| 236 |
+
* Texture Border Color
|
| 237 |
+
*/
|
| 238 |
+
float borderColor[4];
|
| 239 |
+
/**
|
| 240 |
+
* Indicates whether texture reads are normalized or not
|
| 241 |
+
*/
|
| 242 |
+
int normalizedCoords;
|
| 243 |
+
/**
|
| 244 |
+
* Limit to the anisotropy ratio
|
| 245 |
+
*/
|
| 246 |
+
unsigned int maxAnisotropy;
|
| 247 |
+
/**
|
| 248 |
+
* Mipmap filter mode
|
| 249 |
+
*/
|
| 250 |
+
enum cudaTextureFilterMode mipmapFilterMode;
|
| 251 |
+
/**
|
| 252 |
+
* Offset applied to the supplied mipmap level
|
| 253 |
+
*/
|
| 254 |
+
float mipmapLevelBias;
|
| 255 |
+
/**
|
| 256 |
+
* Lower end of the mipmap level range to clamp access to
|
| 257 |
+
*/
|
| 258 |
+
float minMipmapLevelClamp;
|
| 259 |
+
/**
|
| 260 |
+
* Upper end of the mipmap level range to clamp access to
|
| 261 |
+
*/
|
| 262 |
+
float maxMipmapLevelClamp;
|
| 263 |
+
/**
|
| 264 |
+
* Disable any trilinear filtering optimizations.
|
| 265 |
+
*/
|
| 266 |
+
int disableTrilinearOptimization;
|
| 267 |
+
/**
|
| 268 |
+
* Enable seamless cube map filtering.
|
| 269 |
+
*/
|
| 270 |
+
int seamlessCubemap;
|
| 271 |
+
};
|
| 272 |
+
|
| 273 |
+
/**
|
| 274 |
+
* An opaque value that represents a CUDA texture object
|
| 275 |
+
*/
|
| 276 |
+
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
|
| 277 |
+
|
| 278 |
+
/** @} */
|
| 279 |
+
/** @} */ /* END CUDART_TYPES */
|
| 280 |
+
|
| 281 |
+
#endif /* !__TEXTURE_TYPES_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.h
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__VECTOR_FUNCTIONS_H__)
|
| 51 |
+
#define __VECTOR_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
/*******************************************************************************
|
| 54 |
+
* *
|
| 55 |
+
* *
|
| 56 |
+
* *
|
| 57 |
+
*******************************************************************************/
|
| 58 |
+
|
| 59 |
+
#include "cuda_runtime_api.h"
|
| 60 |
+
|
| 61 |
+
#if defined(__CUDACC_RTC__)
|
| 62 |
+
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
|
| 63 |
+
#else /* !__CUDACC_RTC__ */
|
| 64 |
+
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
|
| 65 |
+
#endif /* __CUDACC_RTC__ */
|
| 66 |
+
|
| 67 |
+
/*******************************************************************************
|
| 68 |
+
* *
|
| 69 |
+
* *
|
| 70 |
+
* *
|
| 71 |
+
*******************************************************************************/
|
| 72 |
+
|
| 73 |
+
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
|
| 74 |
+
|
| 75 |
+
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
|
| 76 |
+
|
| 77 |
+
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
|
| 78 |
+
|
| 79 |
+
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
|
| 80 |
+
|
| 81 |
+
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
|
| 82 |
+
|
| 83 |
+
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
|
| 84 |
+
|
| 85 |
+
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
|
| 86 |
+
|
| 87 |
+
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
|
| 88 |
+
|
| 89 |
+
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
|
| 90 |
+
|
| 91 |
+
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
|
| 92 |
+
|
| 93 |
+
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
|
| 94 |
+
|
| 95 |
+
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
|
| 96 |
+
|
| 97 |
+
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
|
| 98 |
+
|
| 99 |
+
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
|
| 100 |
+
|
| 101 |
+
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
|
| 102 |
+
|
| 103 |
+
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
|
| 104 |
+
|
| 105 |
+
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
|
| 106 |
+
|
| 107 |
+
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
|
| 108 |
+
|
| 109 |
+
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
|
| 110 |
+
|
| 111 |
+
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
|
| 112 |
+
|
| 113 |
+
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
|
| 114 |
+
|
| 115 |
+
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
|
| 116 |
+
|
| 117 |
+
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
|
| 118 |
+
|
| 119 |
+
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
|
| 120 |
+
|
| 121 |
+
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
|
| 122 |
+
|
| 123 |
+
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
|
| 124 |
+
|
| 125 |
+
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
|
| 126 |
+
|
| 127 |
+
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
|
| 128 |
+
|
| 129 |
+
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
|
| 130 |
+
|
| 131 |
+
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
|
| 132 |
+
|
| 133 |
+
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
|
| 134 |
+
|
| 135 |
+
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
|
| 136 |
+
|
| 137 |
+
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
|
| 138 |
+
|
| 139 |
+
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
|
| 140 |
+
|
| 141 |
+
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
|
| 142 |
+
|
| 143 |
+
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
|
| 144 |
+
|
| 145 |
+
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
|
| 146 |
+
|
| 147 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
|
| 148 |
+
|
| 149 |
+
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
|
| 150 |
+
|
| 151 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
|
| 152 |
+
|
| 153 |
+
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
|
| 154 |
+
|
| 155 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
|
| 156 |
+
|
| 157 |
+
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
|
| 158 |
+
|
| 159 |
+
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
|
| 160 |
+
|
| 161 |
+
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
|
| 162 |
+
|
| 163 |
+
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
|
| 164 |
+
|
| 165 |
+
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
|
| 166 |
+
|
| 167 |
+
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
|
| 168 |
+
|
| 169 |
+
#undef __VECTOR_FUNCTIONS_DECL__
|
| 170 |
+
|
| 171 |
+
#if !defined(__CUDACC_RTC__)
|
| 172 |
+
#include "vector_functions.hpp"
|
| 173 |
+
#endif /* !__CUDACC_RTC__ */
|
| 174 |
+
|
| 175 |
+
#endif /* !__VECTOR_FUNCTIONS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
#ifndef CURAND_GLOBALS_H
|
| 49 |
+
#define CURAND_GLOBALS_H
|
| 50 |
+
|
| 51 |
+
#define MAX_XOR_N (5)
|
| 52 |
+
#define SKIPAHEAD_BLOCKSIZE (4)
|
| 53 |
+
#define SKIPAHEAD_MASK ((1<<SKIPAHEAD_BLOCKSIZE)-1)
|
| 54 |
+
#define CURAND_2POW32 (4294967296.f)
|
| 55 |
+
#define CURAND_2POW32_DOUBLE (4294967296.)
|
| 56 |
+
#define CURAND_2POW32_INV (2.3283064e-10f)
|
| 57 |
+
#define CURAND_2POW32_INV_DOUBLE (2.3283064365386963e-10)
|
| 58 |
+
#define CURAND_2POW53_INV_DOUBLE (1.1102230246251565e-16)
|
| 59 |
+
#define CURAND_2POW32_INV_2PI (2.3283064e-10f * 6.2831855f)
|
| 60 |
+
#define CURAND_2PI (6.2831855f)
|
| 61 |
+
#define CURAND_2POW53_INV_2PI_DOUBLE (1.1102230246251565e-16 * 6.2831853071795860)
|
| 62 |
+
#define CURAND_PI_DOUBLE (3.1415926535897932)
|
| 63 |
+
#define CURAND_2PI_DOUBLE (6.2831853071795860)
|
| 64 |
+
#define CURAND_SQRT2 (-1.4142135f)
|
| 65 |
+
#define CURAND_SQRT2_DOUBLE (-1.4142135623730951)
|
| 66 |
+
|
| 67 |
+
#define SOBOL64_ITR_BINARY_DIVIDE 2
|
| 68 |
+
#define SOBOL_M2_BINARY_DIVIDE 10
|
| 69 |
+
#define MTGP32_M2_BINARY_DIVIDE 32
|
| 70 |
+
#define MAX_LAMBDA 400000
|
| 71 |
+
#define MIN_GAUSS_LAMBDA 2000
|
| 72 |
+
|
| 73 |
+
struct normal_args_st {
|
| 74 |
+
float mean;
|
| 75 |
+
float stddev;
|
| 76 |
+
};
|
| 77 |
+
|
| 78 |
+
typedef struct normal_args_st normal_args_t;
|
| 79 |
+
|
| 80 |
+
struct normal_args_double_st {
|
| 81 |
+
double mean;
|
| 82 |
+
double stddev;
|
| 83 |
+
};
|
| 84 |
+
|
| 85 |
+
typedef struct normal_args_double_st normal_args_double_t;
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h
ADDED
|
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
#if !defined(CURAND_LOGNORMAL_H_)
|
| 52 |
+
#define CURAND_LOGNORMAL_H_
|
| 53 |
+
|
| 54 |
+
/**
|
| 55 |
+
* \defgroup DEVICE Device API
|
| 56 |
+
*
|
| 57 |
+
* @{
|
| 58 |
+
*/
|
| 59 |
+
|
| 60 |
+
#ifndef __CUDACC_RTC__
|
| 61 |
+
#include <math.h>
|
| 62 |
+
#endif // __CUDACC_RTC__
|
| 63 |
+
|
| 64 |
+
#include "curand_mrg32k3a.h"
|
| 65 |
+
#include "curand_mtgp32_kernel.h"
|
| 66 |
+
#include "curand_philox4x32_x.h"
|
| 67 |
+
|
| 68 |
+
/**
|
| 69 |
+
* \brief Return a log-normally distributed float from an XORWOW generator.
|
| 70 |
+
*
|
| 71 |
+
* Return a single log-normally distributed float derived from a normal
|
| 72 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 73 |
+
* from the XORWOW generator in \p state,
|
| 74 |
+
* increment position of generator by one.
|
| 75 |
+
*
|
| 76 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 77 |
+
* normally distributed results, transforms them to log-normal distribution,
|
| 78 |
+
* then returns them one at a time.
|
| 79 |
+
* See ::curand_log_normal2() for a more efficient version that returns
|
| 80 |
+
* both results at once.
|
| 81 |
+
*
|
| 82 |
+
* \param state - Pointer to state to update
|
| 83 |
+
* \param mean - Mean of the related normal distribution
|
| 84 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 85 |
+
*
|
| 86 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 87 |
+
*/
|
| 88 |
+
QUALIFIERS float curand_log_normal(curandStateXORWOW_t *state, float mean, float stddev)
|
| 89 |
+
{
|
| 90 |
+
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
|
| 91 |
+
unsigned int x, y;
|
| 92 |
+
x = curand(state);
|
| 93 |
+
y = curand(state);
|
| 94 |
+
float2 v = _curand_box_muller(x, y);
|
| 95 |
+
state->boxmuller_extra = expf(mean + (stddev * v.y));
|
| 96 |
+
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
|
| 97 |
+
return expf(mean + (stddev * v.x));
|
| 98 |
+
}
|
| 99 |
+
state->boxmuller_flag = 0;
|
| 100 |
+
return state->boxmuller_extra;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
/**
|
| 104 |
+
* \brief Return a log-normally distributed float from an Philox4_32_10 generator.
|
| 105 |
+
*
|
| 106 |
+
* Return a single log-normally distributed float derived from a normal
|
| 107 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 108 |
+
* from the Philox4_32_10 generator in \p state,
|
| 109 |
+
* increment position of generator by one.
|
| 110 |
+
*
|
| 111 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 112 |
+
* normally distributed results, transforms them to log-normal distribution,
|
| 113 |
+
* then returns them one at a time.
|
| 114 |
+
* See ::curand_log_normal2() for a more efficient version that returns
|
| 115 |
+
* both results at once.
|
| 116 |
+
*
|
| 117 |
+
* \param state - Pointer to state to update
|
| 118 |
+
* \param mean - Mean of the related normal distribution
|
| 119 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 120 |
+
*
|
| 121 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 122 |
+
*/
|
| 123 |
+
|
| 124 |
+
QUALIFIERS float curand_log_normal(curandStatePhilox4_32_10_t *state, float mean, float stddev)
|
| 125 |
+
{
|
| 126 |
+
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
|
| 127 |
+
unsigned int x, y;
|
| 128 |
+
x = curand(state);
|
| 129 |
+
y = curand(state);
|
| 130 |
+
float2 v = _curand_box_muller(x, y);
|
| 131 |
+
state->boxmuller_extra = expf(mean + (stddev * v.y));
|
| 132 |
+
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
|
| 133 |
+
return expf(mean + (stddev * v.x));
|
| 134 |
+
}
|
| 135 |
+
state->boxmuller_flag = 0;
|
| 136 |
+
return state->boxmuller_extra;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
/**
|
| 140 |
+
* \brief Return two normally distributed floats from an XORWOW generator.
|
| 141 |
+
*
|
| 142 |
+
* Return two log-normally distributed floats derived from a normal
|
| 143 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 144 |
+
* from the XORWOW generator in \p state,
|
| 145 |
+
* increment position of generator by two.
|
| 146 |
+
*
|
| 147 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 148 |
+
* normally distributed results, then transforms them to log-normal.
|
| 149 |
+
*
|
| 150 |
+
* \param state - Pointer to state to update
|
| 151 |
+
* \param mean - Mean of the related normal distribution
|
| 152 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 153 |
+
*
|
| 154 |
+
* \return Log-normally distributed float2 where each element is from a
|
| 155 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 156 |
+
*/
|
| 157 |
+
QUALIFIERS float2 curand_log_normal2(curandStateXORWOW_t *state, float mean, float stddev)
|
| 158 |
+
{
|
| 159 |
+
float2 v = curand_box_muller(state);
|
| 160 |
+
v.x = expf(mean + (stddev * v.x));
|
| 161 |
+
v.y = expf(mean + (stddev * v.y));
|
| 162 |
+
return v;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
/**
|
| 166 |
+
* \brief Return two normally distributed floats from an Philox4_32_10 generator.
|
| 167 |
+
*
|
| 168 |
+
* Return two log-normally distributed floats derived from a normal
|
| 169 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 170 |
+
* from the Philox4_32_10 generator in \p state,
|
| 171 |
+
* increment position of generator by two.
|
| 172 |
+
*
|
| 173 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 174 |
+
* normally distributed results, then transforms them to log-normal.
|
| 175 |
+
*
|
| 176 |
+
* \param state - Pointer to state to update
|
| 177 |
+
* \param mean - Mean of the related normal distribution
|
| 178 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 179 |
+
*
|
| 180 |
+
* \return Log-normally distributed float2 where each element is from a
|
| 181 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 182 |
+
*/
|
| 183 |
+
QUALIFIERS float2 curand_log_normal2(curandStatePhilox4_32_10_t *state, float mean, float stddev)
|
| 184 |
+
{
|
| 185 |
+
float2 v = curand_box_muller(state);
|
| 186 |
+
v.x = expf(mean + (stddev * v.x));
|
| 187 |
+
v.y = expf(mean + (stddev * v.y));
|
| 188 |
+
return v;
|
| 189 |
+
}
|
| 190 |
+
/**
|
| 191 |
+
* \brief Return four normally distributed floats from an Philox4_32_10 generator.
|
| 192 |
+
*
|
| 193 |
+
* Return four log-normally distributed floats derived from a normal
|
| 194 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 195 |
+
* from the Philox4_32_10 generator in \p state,
|
| 196 |
+
* increment position of generator by four.
|
| 197 |
+
*
|
| 198 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 199 |
+
* normally distributed results, then transforms them to log-normal.
|
| 200 |
+
*
|
| 201 |
+
* \param state - Pointer to state to update
|
| 202 |
+
* \param mean - Mean of the related normal distribution
|
| 203 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 204 |
+
*
|
| 205 |
+
* \return Log-normally distributed float4 where each element is from a
|
| 206 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 207 |
+
*/
|
| 208 |
+
QUALIFIERS float4 curand_log_normal4(curandStatePhilox4_32_10_t *state, float mean, float stddev)
|
| 209 |
+
{
|
| 210 |
+
float4 v = curand_box_muller4(state);
|
| 211 |
+
v.x = expf(mean + (stddev * v.x));
|
| 212 |
+
v.y = expf(mean + (stddev * v.y));
|
| 213 |
+
v.z = expf(mean + (stddev * v.z));
|
| 214 |
+
v.w = expf(mean + (stddev * v.w));
|
| 215 |
+
return v;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
/**
|
| 219 |
+
* \brief Return a log-normally distributed float from an MRG32k3a generator.
|
| 220 |
+
*
|
| 221 |
+
* Return a single log-normally distributed float derived from a normal
|
| 222 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 223 |
+
* from the MRG32k3a generator in \p state,
|
| 224 |
+
* increment position of generator by one.
|
| 225 |
+
*
|
| 226 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 227 |
+
* normally distributed results, transforms them to log-normal distribution,
|
| 228 |
+
* then returns them one at a time.
|
| 229 |
+
* See ::curand_log_normal2() for a more efficient version that returns
|
| 230 |
+
* both results at once.
|
| 231 |
+
*
|
| 232 |
+
* \param state - Pointer to state to update
|
| 233 |
+
* \param mean - Mean of the related normal distribution
|
| 234 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 235 |
+
*
|
| 236 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 237 |
+
*/
|
| 238 |
+
QUALIFIERS float curand_log_normal(curandStateMRG32k3a_t *state, float mean, float stddev)
|
| 239 |
+
{
|
| 240 |
+
if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
|
| 241 |
+
float2 v = curand_box_muller_mrg(state);
|
| 242 |
+
state->boxmuller_extra = expf(mean + (stddev * v.y));
|
| 243 |
+
state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
|
| 244 |
+
return expf(mean + (stddev * v.x));
|
| 245 |
+
}
|
| 246 |
+
state->boxmuller_flag = 0;
|
| 247 |
+
return state->boxmuller_extra;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
/**
|
| 251 |
+
* \brief Return two normally distributed floats from an MRG32k3a generator.
|
| 252 |
+
*
|
| 253 |
+
* Return two log-normally distributed floats derived from a normal
|
| 254 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 255 |
+
* from the MRG32k3a generator in \p state,
|
| 256 |
+
* increment position of generator by two.
|
| 257 |
+
*
|
| 258 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 259 |
+
* normally distributed results, then transforms them to log-normal.
|
| 260 |
+
*
|
| 261 |
+
* \param state - Pointer to state to update
|
| 262 |
+
* \param mean - Mean of the related normal distribution
|
| 263 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 264 |
+
*
|
| 265 |
+
* \return Log-normally distributed float2 where each element is from a
|
| 266 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 267 |
+
*/
|
| 268 |
+
QUALIFIERS float2 curand_log_normal2(curandStateMRG32k3a_t *state, float mean, float stddev)
|
| 269 |
+
{
|
| 270 |
+
float2 v = curand_box_muller_mrg(state);
|
| 271 |
+
v.x = expf(mean + (stddev * v.x));
|
| 272 |
+
v.y = expf(mean + (stddev * v.y));
|
| 273 |
+
return v;
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
/**
|
| 277 |
+
* \brief Return a log-normally distributed float from an MTGP32 generator.
|
| 278 |
+
*
|
| 279 |
+
* Return a single log-normally distributed float derived from a normal
|
| 280 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 281 |
+
* from the MTGP32 generator in \p state,
|
| 282 |
+
* increment position of generator.
|
| 283 |
+
*
|
| 284 |
+
* The implementation uses the inverse cumulative distribution function
|
| 285 |
+
* to generate a normally distributed result, then transforms the result
|
| 286 |
+
* to log-normal.
|
| 287 |
+
*
|
| 288 |
+
* \param state - Pointer to state to update
|
| 289 |
+
* \param mean - Mean of the related normal distribution
|
| 290 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 291 |
+
*
|
| 292 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 293 |
+
*/
|
| 294 |
+
QUALIFIERS float curand_log_normal(curandStateMtgp32_t *state, float mean, float stddev)
|
| 295 |
+
{
|
| 296 |
+
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
/**
|
| 300 |
+
* \brief Return a log-normally distributed float from a Sobol32 generator.
|
| 301 |
+
*
|
| 302 |
+
* Return a single log-normally distributed float derived from a normal
|
| 303 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 304 |
+
* from the Sobol32 generator in \p state,
|
| 305 |
+
* increment position of generator by one.
|
| 306 |
+
*
|
| 307 |
+
* The implementation uses the inverse cumulative distribution function
|
| 308 |
+
* to generate a normally distributed result, then transforms the result
|
| 309 |
+
* to log-normal.
|
| 310 |
+
*
|
| 311 |
+
* \param state - Pointer to state to update
|
| 312 |
+
* \param mean - Mean of the related normal distribution
|
| 313 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 314 |
+
*
|
| 315 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 316 |
+
*/
|
| 317 |
+
QUALIFIERS float curand_log_normal(curandStateSobol32_t *state, float mean, float stddev)
|
| 318 |
+
{
|
| 319 |
+
return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
|
| 320 |
+
}
|
| 321 |
+
/**
|
| 322 |
+
* \brief Return a log-normally distributed float from a scrambled Sobol32 generator.
|
| 323 |
+
*
|
| 324 |
+
* Return a single log-normally distributed float derived from a normal
|
| 325 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 326 |
+
* from the scrambled Sobol32 generator in \p state,
|
| 327 |
+
* increment position of generator by one.
|
| 328 |
+
*
|
| 329 |
+
* The implementation uses the inverse cumulative distribution function
|
| 330 |
+
* to generate a normally distributed result, then transforms the result
|
| 331 |
+
* to log-normal.
|
| 332 |
+
*
|
| 333 |
+
* \param state - Pointer to state to update
|
| 334 |
+
* \param mean - Mean of the related normal distribution
|
| 335 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 336 |
+
*
|
| 337 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 338 |
+
*/
|
| 339 |
+
QUALIFIERS float curand_log_normal(curandStateScrambledSobol32_t *state, float mean, float stddev)
{
    // Inverse-CDF normal draw from the scrambled Sobol32 sequence, then exponentiate.
    float z = _curand_normal_icdf(curand(state));
    return expf(mean + (stddev * z));
}
|
| 343 |
+
|
| 344 |
+
/**
|
| 345 |
+
* \brief Return a log-normally distributed float from a Sobol64 generator.
|
| 346 |
+
*
|
| 347 |
+
* Return a single log-normally distributed float derived from a normal
|
| 348 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 349 |
+
* from the Sobol64 generator in \p state,
|
| 350 |
+
* increment position of generator by one.
|
| 351 |
+
*
|
| 352 |
+
* The implementation uses the inverse cumulative distribution function
|
| 353 |
+
* to generate normally distributed results, then converts to log-normal
|
| 354 |
+
* distribution.
|
| 355 |
+
*
|
| 356 |
+
* \param state - Pointer to state to update
|
| 357 |
+
* \param mean - Mean of the related normal distribution
|
| 358 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 359 |
+
*
|
| 360 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 361 |
+
*/
|
| 362 |
+
QUALIFIERS float curand_log_normal(curandStateSobol64_t *state, float mean, float stddev)
{
    // Inverse-CDF normal draw from the Sobol64 sequence, transformed to log-normal.
    float z = _curand_normal_icdf(curand(state));
    return expf(mean + (stddev * z));
}
|
| 366 |
+
|
| 367 |
+
/**
|
| 368 |
+
* \brief Return a log-normally distributed float from a scrambled Sobol64 generator.
|
| 369 |
+
*
|
| 370 |
+
* Return a single log-normally distributed float derived from a normal
|
| 371 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 372 |
+
* from the scrambled Sobol64 generator in \p state,
|
| 373 |
+
* increment position of generator by one.
|
| 374 |
+
*
|
| 375 |
+
* The implementation uses the inverse cumulative distribution function
|
| 376 |
+
* to generate normally distributed results, then converts to log-normal
|
| 377 |
+
* distribution.
|
| 378 |
+
*
|
| 379 |
+
* \param state - Pointer to state to update
|
| 380 |
+
* \param mean - Mean of the related normal distribution
|
| 381 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 382 |
+
*
|
| 383 |
+
* \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
|
| 384 |
+
*/
|
| 385 |
+
QUALIFIERS float curand_log_normal(curandStateScrambledSobol64_t *state, float mean, float stddev)
{
    // Inverse-CDF normal draw from the scrambled Sobol64 sequence, then exponentiate.
    float z = _curand_normal_icdf(curand(state));
    return expf(mean + (stddev * z));
}
|
| 389 |
+
|
| 390 |
+
/**
|
| 391 |
+
* \brief Return a log-normally distributed double from an XORWOW generator.
|
| 392 |
+
*
|
| 393 |
+
* Return a single normally distributed double derived from a normal
|
| 394 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 395 |
+
* from the XORWOW generator in \p state,
|
| 396 |
+
* increment position of generator.
|
| 397 |
+
*
|
| 398 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 399 |
+
* normally distributed results, transforms them to log-normal distribution,
|
| 400 |
+
* then returns them one at a time.
|
| 401 |
+
* See ::curand_log_normal2_double() for a more efficient version that returns
|
| 402 |
+
* both results at once.
|
| 403 |
+
*
|
| 404 |
+
* \param state - Pointer to state to update
|
| 405 |
+
* \param mean - Mean of the related normal distribution
|
| 406 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 407 |
+
*
|
| 408 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 409 |
+
*/
|
| 410 |
+
|
| 411 |
+
QUALIFIERS double curand_log_normal_double(curandStateXORWOW_t *state, double mean, double stddev)
{
    // If the second value of a previous Box-Muller pair is cached, return it
    // and clear the cache flag.
    if(state->boxmuller_flag_double == EXTRA_FLAG_LOG_NORMAL) {
        state->boxmuller_flag_double = 0;
        return state->boxmuller_extra_double;
    }
    // Fresh pair: four 32-bit draws feed the double-precision Box-Muller transform.
    unsigned int a0 = curand(state);
    unsigned int a1 = curand(state);
    unsigned int b0 = curand(state);
    unsigned int b1 = curand(state);
    double2 r = _curand_box_muller_double(a0, a1, b0, b1);
    // Cache the second log-normal value for the next call; return the first.
    state->boxmuller_extra_double = exp(mean + (stddev * r.y));
    state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
    return exp(mean + (stddev * r.x));
}
|
| 427 |
+
|
| 428 |
+
/**
|
| 429 |
+
 * \brief Return a log-normally distributed double from a Philox4_32_10 generator.
|
| 430 |
+
*
|
| 431 |
+
* Return a single normally distributed double derived from a normal
|
| 432 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 433 |
+
* from the Philox4_32_10 generator in \p state,
|
| 434 |
+
* increment position of generator.
|
| 435 |
+
*
|
| 436 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 437 |
+
* normally distributed results, transforms them to log-normal distribution,
|
| 438 |
+
* then returns them one at a time.
|
| 439 |
+
* See ::curand_log_normal2_double() for a more efficient version that returns
|
| 440 |
+
* both results at once.
|
| 441 |
+
*
|
| 442 |
+
* \param state - Pointer to state to update
|
| 443 |
+
* \param mean - Mean of the related normal distribution
|
| 444 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 445 |
+
*
|
| 446 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 447 |
+
*/
|
| 448 |
+
|
| 449 |
+
QUALIFIERS double curand_log_normal_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
    // Return the cached second value of a previous Box-Muller pair, if present.
    if(state->boxmuller_flag_double == EXTRA_FLAG_LOG_NORMAL) {
        state->boxmuller_flag_double = 0;
        return state->boxmuller_extra_double;
    }
    // One Philox draw yields four 32-bit words, enough for one double-precision pair.
    uint4 raw = curand4(state);
    double2 r = _curand_box_muller_double(raw.x, raw.y, raw.z, raw.w);
    // Cache the second log-normal value for the next call; return the first.
    state->boxmuller_extra_double = exp(mean + (stddev * r.y));
    state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
    return exp(mean + (stddev * r.x));
}
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
/**
|
| 465 |
+
* \brief Return two log-normally distributed doubles from an XORWOW generator.
|
| 466 |
+
*
|
| 467 |
+
* Return two log-normally distributed doubles derived from a normal
|
| 468 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 469 |
+
* from the XORWOW generator in \p state,
|
| 470 |
+
* increment position of generator by two.
|
| 471 |
+
*
|
| 472 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 473 |
+
 * normally distributed results, and transforms them to the log-normal distribution.
|
| 474 |
+
*
|
| 475 |
+
* \param state - Pointer to state to update
|
| 476 |
+
* \param mean - Mean of the related normal distribution
|
| 477 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 478 |
+
*
|
| 479 |
+
* \return Log-normally distributed double2 where each element is from a
|
| 480 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 481 |
+
*/
|
| 482 |
+
QUALIFIERS double2 curand_log_normal2_double(curandStateXORWOW_t *state, double mean, double stddev)
{
    // Transform both components of one Box-Muller normal pair to log-normal.
    double2 result = curand_box_muller_double(state);
    result.x = exp(mean + (stddev * result.x));
    result.y = exp(mean + (stddev * result.y));
    return result;
}
|
| 489 |
+
|
| 490 |
+
/**
|
| 491 |
+
 * \brief Return two log-normally distributed doubles from a Philox4_32_10 generator.
|
| 492 |
+
*
|
| 493 |
+
* Return two log-normally distributed doubles derived from a normal
|
| 494 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 495 |
+
* from the Philox4_32_10 generator in \p state,
|
| 496 |
+
* increment position of generator by four.
|
| 497 |
+
*
|
| 498 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 499 |
+
 * normally distributed results, and transforms them to the log-normal distribution.
|
| 500 |
+
*
|
| 501 |
+
* \param state - Pointer to state to update
|
| 502 |
+
* \param mean - Mean of the related normal distribution
|
| 503 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 504 |
+
*
|
| 505 |
+
* \return Log-normally distributed double4 where each element is from a
|
| 506 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 507 |
+
*/
|
| 508 |
+
QUALIFIERS double2 curand_log_normal2_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
    // Transform both components of one Box-Muller normal pair to log-normal.
    double2 result = curand_box_muller2_double(state);
    result.x = exp(mean + (stddev * result.x));
    result.y = exp(mean + (stddev * result.y));
    return result;
}
|
| 515 |
+
// not part of API
|
| 516 |
+
QUALIFIERS double4 curand_log_normal4_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
{
    // Apply the log-normal transform to each of the four Box-Muller variates.
    double4 result = curand_box_muller4_double(state);
    result.x = exp(mean + (stddev * result.x));
    result.y = exp(mean + (stddev * result.y));
    result.z = exp(mean + (stddev * result.z));
    result.w = exp(mean + (stddev * result.w));
    return result;
}
|
| 525 |
+
|
| 526 |
+
/**
|
| 527 |
+
* \brief Return a log-normally distributed double from an MRG32k3a generator.
|
| 528 |
+
*
|
| 529 |
+
* Return a single normally distributed double derived from a normal
|
| 530 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 531 |
+
* from the MRG32k3a generator in \p state,
|
| 532 |
+
* increment position of generator.
|
| 533 |
+
*
|
| 534 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 535 |
+
* normally distributed results, transforms them to log-normal distribution,
|
| 536 |
+
* then returns them one at a time.
|
| 537 |
+
* See ::curand_log_normal2_double() for a more efficient version that returns
|
| 538 |
+
* both results at once.
|
| 539 |
+
*
|
| 540 |
+
* \param state - Pointer to state to update
|
| 541 |
+
* \param mean - Mean of the related normal distribution
|
| 542 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 543 |
+
*
|
| 544 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 545 |
+
*/
|
| 546 |
+
QUALIFIERS double curand_log_normal_double(curandStateMRG32k3a_t *state, double mean, double stddev)
{
    // Return the cached second value of a previous Box-Muller pair, if present.
    if(state->boxmuller_flag_double == EXTRA_FLAG_LOG_NORMAL) {
        state->boxmuller_flag_double = 0;
        return state->boxmuller_extra_double;
    }
    // Fresh pair from the MRG32k3a-specific Box-Muller helper.
    double2 r = curand_box_muller_mrg_double(state);
    // Cache the second log-normal value for the next call; return the first.
    state->boxmuller_extra_double = exp(mean + (stddev * r.y));
    state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
    return exp(mean + (stddev * r.x));
}
|
| 557 |
+
|
| 558 |
+
/**
|
| 559 |
+
* \brief Return two log-normally distributed doubles from an MRG32k3a generator.
|
| 560 |
+
*
|
| 561 |
+
* Return two log-normally distributed doubles derived from a normal
|
| 562 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 563 |
+
* from the MRG32k3a generator in \p state,
|
| 564 |
+
* increment position of generator by two.
|
| 565 |
+
*
|
| 566 |
+
* The implementation uses a Box-Muller transform to generate two
|
| 567 |
+
 * normally distributed results, and transforms them to the log-normal distribution.
|
| 568 |
+
*
|
| 569 |
+
* \param state - Pointer to state to update
|
| 570 |
+
* \param mean - Mean of the related normal distribution
|
| 571 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 572 |
+
*
|
| 573 |
+
* \return Log-normally distributed double2 where each element is from a
|
| 574 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 575 |
+
*/
|
| 576 |
+
QUALIFIERS double2 curand_log_normal2_double(curandStateMRG32k3a_t *state, double mean, double stddev)
{
    // Transform both components of one Box-Muller normal pair to log-normal.
    double2 result = curand_box_muller_mrg_double(state);
    result.x = exp(mean + (stddev * result.x));
    result.y = exp(mean + (stddev * result.y));
    return result;
}
|
| 583 |
+
|
| 584 |
+
/**
|
| 585 |
+
* \brief Return a log-normally distributed double from an MTGP32 generator.
|
| 586 |
+
*
|
| 587 |
+
* Return a single log-normally distributed double derived from a normal
|
| 588 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 589 |
+
* from the MTGP32 generator in \p state,
|
| 590 |
+
* increment position of generator.
|
| 591 |
+
*
|
| 592 |
+
* The implementation uses the inverse cumulative distribution function
|
| 593 |
+
* to generate normally distributed results, and transforms them into
|
| 594 |
+
* log-normal distribution.
|
| 595 |
+
*
|
| 596 |
+
* \param state - Pointer to state to update
|
| 597 |
+
* \param mean - Mean of the related normal distribution
|
| 598 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 599 |
+
*
|
| 600 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 601 |
+
*/
|
| 602 |
+
QUALIFIERS double curand_log_normal_double(curandStateMtgp32_t *state, double mean, double stddev)
{
    // Inverse-CDF normal draw in double precision, transformed to log-normal.
    double z = _curand_normal_icdf_double(curand(state));
    return exp(mean + (stddev * z));
}
|
| 606 |
+
|
| 607 |
+
/**
|
| 608 |
+
* \brief Return a log-normally distributed double from a Sobol32 generator.
|
| 609 |
+
*
|
| 610 |
+
* Return a single log-normally distributed double derived from a normal
|
| 611 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 612 |
+
* from the Sobol32 generator in \p state,
|
| 613 |
+
* increment position of generator by one.
|
| 614 |
+
*
|
| 615 |
+
* The implementation uses the inverse cumulative distribution function
|
| 616 |
+
* to generate normally distributed results, and transforms them into
|
| 617 |
+
* log-normal distribution.
|
| 618 |
+
*
|
| 619 |
+
* \param state - Pointer to state to update
|
| 620 |
+
* \param mean - Mean of the related normal distribution
|
| 621 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 622 |
+
*
|
| 623 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 624 |
+
*/
|
| 625 |
+
QUALIFIERS double curand_log_normal_double(curandStateSobol32_t *state, double mean, double stddev)
{
    // Inverse-CDF normal draw from the Sobol32 sequence, then exponentiate.
    double z = _curand_normal_icdf_double(curand(state));
    return exp(mean + (stddev * z));
}
|
| 629 |
+
|
| 630 |
+
/**
|
| 631 |
+
* \brief Return a log-normally distributed double from a scrambled Sobol32 generator.
|
| 632 |
+
*
|
| 633 |
+
* Return a single log-normally distributed double derived from a normal
|
| 634 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 635 |
+
* from the scrambled Sobol32 generator in \p state,
|
| 636 |
+
* increment position of generator by one.
|
| 637 |
+
*
|
| 638 |
+
* The implementation uses the inverse cumulative distribution function
|
| 639 |
+
* to generate normally distributed results, and transforms them into
|
| 640 |
+
* log-normal distribution.
|
| 641 |
+
*
|
| 642 |
+
* \param state - Pointer to state to update
|
| 643 |
+
* \param mean - Mean of the related normal distribution
|
| 644 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 645 |
+
*
|
| 646 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 647 |
+
*/
|
| 648 |
+
QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol32_t *state, double mean, double stddev)
{
    // Inverse-CDF normal draw from the scrambled Sobol32 sequence, then exponentiate.
    double z = _curand_normal_icdf_double(curand(state));
    return exp(mean + (stddev * z));
}
|
| 652 |
+
|
| 653 |
+
/**
|
| 654 |
+
* \brief Return a log-normally distributed double from a Sobol64 generator.
|
| 655 |
+
*
|
| 656 |
+
* Return a single normally distributed double derived from a normal
|
| 657 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 658 |
+
* from the Sobol64 generator in \p state,
|
| 659 |
+
* increment position of generator by one.
|
| 660 |
+
*
|
| 661 |
+
* The implementation uses the inverse cumulative distribution function
|
| 662 |
+
* to generate normally distributed results.
|
| 663 |
+
*
|
| 664 |
+
* \param state - Pointer to state to update
|
| 665 |
+
* \param mean - Mean of the related normal distribution
|
| 666 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 667 |
+
*
|
| 668 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 669 |
+
*/
|
| 670 |
+
QUALIFIERS double curand_log_normal_double(curandStateSobol64_t *state, double mean, double stddev)
{
    // Inverse-CDF normal draw from the Sobol64 sequence, then exponentiate.
    double z = _curand_normal_icdf_double(curand(state));
    return exp(mean + (stddev * z));
}
|
| 674 |
+
|
| 675 |
+
/**
|
| 676 |
+
* \brief Return a log-normally distributed double from a scrambled Sobol64 generator.
|
| 677 |
+
*
|
| 678 |
+
* Return a single normally distributed double derived from a normal
|
| 679 |
+
* distribution with mean \p mean and standard deviation \p stddev
|
| 680 |
+
* from the scrambled Sobol64 generator in \p state,
|
| 681 |
+
* increment position of generator by one.
|
| 682 |
+
*
|
| 683 |
+
* The implementation uses the inverse cumulative distribution function
|
| 684 |
+
* to generate normally distributed results.
|
| 685 |
+
*
|
| 686 |
+
* \param state - Pointer to state to update
|
| 687 |
+
* \param mean - Mean of the related normal distribution
|
| 688 |
+
* \param stddev - Standard deviation of the related normal distribution
|
| 689 |
+
*
|
| 690 |
+
* \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
|
| 691 |
+
*/
|
| 692 |
+
QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol64_t *state, double mean, double stddev)
{
    // Inverse-CDF normal draw from the scrambled Sobol64 sequence, then exponentiate.
    double z = _curand_normal_icdf_double(curand(state));
    return exp(mean + (stddev * z));
}
|
| 696 |
+
|
| 697 |
+
#endif // !defined(CURAND_LOGNORMAL_H_)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
#ifndef CURAND_NORMAL_STATIC_H
|
| 49 |
+
#define CURAND_NORMAL_STATIC_H
|
| 50 |
+
|
| 51 |
+
#define QUALIFIERS_STATIC __host__ __device__ __forceinline__
|
| 52 |
+
|
| 53 |
+
/* Map a uniformly distributed 32-bit integer to a standard-normal variate via
 * the inverse CDF: s * erfcinv(2p), where s = +/- sqrt(2).
 * Host builds without erfcinvf return 0.0f (dead code path). */
QUALIFIERS_STATIC float _curand_normal_icdf(unsigned int x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
    float s = CURAND_SQRT2;
    // Mirror to avoid loss of precision: fold the upper half of the input
    // range onto the lower half and flip the sign of the result.
    if(x > 0x80000000UL) {
        x = 0xffffffffUL - x;
        s = -s;
    }
    // Map x into p, offset by half a step of 2^-32 so p is never exactly 0.
    float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
    // p is in (0, 0.5], 2p is in (0, 1]
    return s * erfcinvf(2.0f * p);
#else
    x++; //suppress warnings
    return 0.0f;
#endif
}
|
| 70 |
+
|
| 71 |
+
/* 64-bit-input overload of the single-precision inverse-CDF transform:
 * only the most significant 32 bits are used. */
QUALIFIERS_STATIC float _curand_normal_icdf(unsigned long long x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
    // Keep the top 32 bits of the 64-bit input.
    unsigned int t = (unsigned int)(x >> 32);
    float s = CURAND_SQRT2;
    // Mirror to avoid loss of precision
    if(t > 0x80000000UL) {
        t = 0xffffffffUL - t;
        s = -s;
    }
    float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
    // p is in (0, 0.5], 2p is in (0, 1]
    return s * erfcinvf(2.0f * p);
#else
    x++;   // suppress unused-parameter warnings
    return 0.0f;
#endif
}
|
| 89 |
+
|
| 90 |
+
/* Double-precision inverse-CDF normal transform of a uniformly distributed
 * 32-bit integer: s * erfcinv(2p), with s = +/- sqrt(2). */
QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned int x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
    double s = CURAND_SQRT2_DOUBLE;
    // Mirror to avoid loss of precision: fold the upper half of the input
    // range onto the lower half and flip the sign of the result.
    if(x > 0x80000000UL) {
        x = 0xffffffffUL - x;
        s = -s;
    }
    // Offset by half a step of 2^-32 so p is never exactly 0.
    double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
    // p is in (0, 0.5], 2p is in (0, 1]
    return s * erfcinv(2.0 * p);
#else
    x++;   // suppress unused-parameter warnings
    return 0.0;
#endif
}
|
| 107 |
+
|
| 108 |
+
/* Double-precision inverse-CDF normal transform of a 64-bit input; the value
 * is first reduced to 53 significant bits (the double mantissa width). */
QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned long long x)
{
#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
    double s = CURAND_SQRT2_DOUBLE;
    x >>= 11;   // keep the top 53 bits
    // Mirror to avoid loss of precision (0x10000000000000 == 2^52, the
    // midpoint of the 53-bit range).
    if(x > 0x10000000000000UL) {
        x = 0x1fffffffffffffUL - x;
        s = -s;
    }
    double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
    // p is in (0, 0.5], 2p is in (0, 1]
    return s * erfcinv(2.0 * p);
#else
    x++;   // suppress unused-parameter warnings
    return 0.0;
#endif
}
|
| 126 |
+
#undef QUALIFIERS_STATIC
|
| 127 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
/*
|
| 49 |
+
Copyright 2010-2011, D. E. Shaw Research.
|
| 50 |
+
All rights reserved.
|
| 51 |
+
|
| 52 |
+
Redistribution and use in source and binary forms, with or without
|
| 53 |
+
modification, are permitted provided that the following conditions are
|
| 54 |
+
met:
|
| 55 |
+
|
| 56 |
+
* Redistributions of source code must retain the above copyright
|
| 57 |
+
notice, this list of conditions, and the following disclaimer.
|
| 58 |
+
|
| 59 |
+
* Redistributions in binary form must reproduce the above copyright
|
| 60 |
+
notice, this list of conditions, and the following disclaimer in the
|
| 61 |
+
documentation and/or other materials provided with the distribution.
|
| 62 |
+
|
| 63 |
+
* Neither the name of D. E. Shaw Research nor the names of its
|
| 64 |
+
contributors may be used to endorse or promote products derived from
|
| 65 |
+
this software without specific prior written permission.
|
| 66 |
+
|
| 67 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 68 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 69 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 70 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 71 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 72 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 73 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 74 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 75 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 76 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 77 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 78 |
+
*/
|
| 79 |
+
|
| 80 |
+
#ifndef CURAND_PHILOX4X32_X__H_
|
| 81 |
+
#define CURAND_PHILOX4X32_X__H_
|
| 82 |
+
|
| 83 |
+
#if !defined(QUALIFIERS)
|
| 84 |
+
#define QUALIFIERS static __forceinline__ __device__
|
| 85 |
+
#endif
|
| 86 |
+
|
| 87 |
+
#define PHILOX_W32_0 (0x9E3779B9)
|
| 88 |
+
#define PHILOX_W32_1 (0xBB67AE85)
|
| 89 |
+
#define PHILOX_M4x32_0 (0xD2511F53)
|
| 90 |
+
#define PHILOX_M4x32_1 (0xCD9E8D57)
|
| 91 |
+
|
| 92 |
+
struct curandStatePhilox4_32_10 {
|
| 93 |
+
uint4 ctr;
|
| 94 |
+
uint4 output;
|
| 95 |
+
uint2 key;
|
| 96 |
+
unsigned int STATE;
|
| 97 |
+
int boxmuller_flag;
|
| 98 |
+
int boxmuller_flag_double;
|
| 99 |
+
float boxmuller_extra;
|
| 100 |
+
double boxmuller_extra_double;
|
| 101 |
+
};
|
| 102 |
+
|
| 103 |
+
typedef struct curandStatePhilox4_32_10 curandStatePhilox4_32_10_t;
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s, unsigned long long n)
|
| 107 |
+
{
|
| 108 |
+
unsigned int nlo = (unsigned int)(n);
|
| 109 |
+
unsigned int nhi = (unsigned int)(n>>32);
|
| 110 |
+
|
| 111 |
+
s->ctr.x += nlo;
|
| 112 |
+
if( s->ctr.x < nlo )
|
| 113 |
+
nhi++;
|
| 114 |
+
|
| 115 |
+
s->ctr.y += nhi;
|
| 116 |
+
if(nhi <= s->ctr.y)
|
| 117 |
+
return;
|
| 118 |
+
if(++s->ctr.z) return;
|
| 119 |
+
++s->ctr.w;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_t* s, unsigned long long n)
|
| 123 |
+
{
|
| 124 |
+
unsigned int nlo = (unsigned int)(n);
|
| 125 |
+
unsigned int nhi = (unsigned int)(n>>32);
|
| 126 |
+
|
| 127 |
+
s->ctr.z += nlo;
|
| 128 |
+
if( s->ctr.z < nlo )
|
| 129 |
+
nhi++;
|
| 130 |
+
|
| 131 |
+
s->ctr.w += nhi;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s)
|
| 137 |
+
{
|
| 138 |
+
if(++s->ctr.x) return;
|
| 139 |
+
if(++s->ctr.y) return;
|
| 140 |
+
if(++s->ctr.z) return;
|
| 141 |
+
++s->ctr.w;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
QUALIFIERS unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* hip)
|
| 146 |
+
{
|
| 147 |
+
#ifndef __CUDA_ARCH__
|
| 148 |
+
// host code
|
| 149 |
+
unsigned long long product = ((unsigned long long)a) * ((unsigned long long)b);
|
| 150 |
+
*hip = product >> 32;
|
| 151 |
+
return (unsigned int)product;
|
| 152 |
+
#else
|
| 153 |
+
// device code
|
| 154 |
+
*hip = __umulhi(a,b);
|
| 155 |
+
return a*b;
|
| 156 |
+
#endif
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
QUALIFIERS uint4 _philox4x32round(uint4 ctr, uint2 key)
|
| 160 |
+
{
|
| 161 |
+
unsigned int hi0;
|
| 162 |
+
unsigned int hi1;
|
| 163 |
+
unsigned int lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);
|
| 164 |
+
unsigned int lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);
|
| 165 |
+
|
| 166 |
+
uint4 ret = {hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0};
|
| 167 |
+
return ret;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
QUALIFIERS uint4 curand_Philox4x32_10( uint4 c, uint2 k)
|
| 171 |
+
{
|
| 172 |
+
c = _philox4x32round(c, k); // 1
|
| 173 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 174 |
+
c = _philox4x32round(c, k); // 2
|
| 175 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 176 |
+
c = _philox4x32round(c, k); // 3
|
| 177 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 178 |
+
c = _philox4x32round(c, k); // 4
|
| 179 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 180 |
+
c = _philox4x32round(c, k); // 5
|
| 181 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 182 |
+
c = _philox4x32round(c, k); // 6
|
| 183 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 184 |
+
c = _philox4x32round(c, k); // 7
|
| 185 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 186 |
+
c = _philox4x32round(c, k); // 8
|
| 187 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 188 |
+
c = _philox4x32round(c, k); // 9
|
| 189 |
+
k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
|
| 190 |
+
return _philox4x32round(c, k); // 10
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO USER:
|
| 5 |
+
*
|
| 6 |
+
* This source code is subject to NVIDIA ownership rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* This software and the information contained herein is PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
|
| 11 |
+
* of a form of NVIDIA software license agreement.
|
| 12 |
+
*
|
| 13 |
+
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
| 14 |
+
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
| 15 |
+
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
| 16 |
+
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
| 17 |
+
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 18 |
+
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
| 19 |
+
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
| 20 |
+
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
| 21 |
+
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
| 22 |
+
* OR PERFORMANCE OF THIS SOURCE CODE.
|
| 23 |
+
*
|
| 24 |
+
* U.S. Government End Users. This source code is a "commercial item" as
|
| 25 |
+
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
| 26 |
+
* "commercial computer software" and "commercial computer software
|
| 27 |
+
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
| 28 |
+
* and is provided to the U.S. Government only as a commercial end item.
|
| 29 |
+
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
| 30 |
+
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
| 31 |
+
* source code with only those rights set forth herein.
|
| 32 |
+
*
|
| 33 |
+
* Any use of this source code in individual and commercial software must
|
| 34 |
+
* include, in the user documentation and internal comments to the code,
|
| 35 |
+
* the above Disclaimer and U.S. Government End Users Notice.
|
| 36 |
+
*/
|
| 37 |
+
|
| 38 |
+
#ifndef NVTOOLSEXT_CUDA_H_
|
| 39 |
+
#define NVTOOLSEXT_CUDA_H_
|
| 40 |
+
|
| 41 |
+
#include "cuda.h"
|
| 42 |
+
|
| 43 |
+
#include "nvToolsExt.h"
|
| 44 |
+
|
| 45 |
+
#ifdef __cplusplus
|
| 46 |
+
extern "C" {
|
| 47 |
+
#endif /* __cplusplus */
|
| 48 |
+
|
| 49 |
+
/* ========================================================================= */
|
| 50 |
+
/** \name Functions for CUDA Resource Naming
|
| 51 |
+
*/
|
| 52 |
+
/** \addtogroup RESOURCE_NAMING
|
| 53 |
+
* \section RESOURCE_NAMING_CUDA CUDA Resource Naming
|
| 54 |
+
*
|
| 55 |
+
* This section covers the API functions that allow to annotate CUDA resources
|
| 56 |
+
* with user-provided names.
|
| 57 |
+
*
|
| 58 |
+
* @{
|
| 59 |
+
*/
|
| 60 |
+
|
| 61 |
+
/* ------------------------------------------------------------------------- */
|
| 62 |
+
/* \cond SHOW_HIDDEN
|
| 63 |
+
* \brief Used to build a non-colliding value for resource types separated class
|
| 64 |
+
* \version \NVTX_VERSION_2
|
| 65 |
+
*/
|
| 66 |
+
#define NVTX_RESOURCE_CLASS_CUDA 4
|
| 67 |
+
/** \endcond */
|
| 68 |
+
|
| 69 |
+
/* ------------------------------------------------------------------------- */
|
| 70 |
+
/** \brief Resource types for CUDA
|
| 71 |
+
*/
|
| 72 |
+
typedef enum nvtxResourceCUDAType_t
|
| 73 |
+
{
|
| 74 |
+
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
|
| 75 |
+
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
|
| 76 |
+
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
|
| 77 |
+
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4) /* CUevent */
|
| 78 |
+
} nvtxResourceCUDAType_t;
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
/* ------------------------------------------------------------------------- */
|
| 82 |
+
/** \brief Annotates a CUDA device.
|
| 83 |
+
*
|
| 84 |
+
* Allows the user to associate a CUDA device with a user-provided name.
|
| 85 |
+
*
|
| 86 |
+
* \param device - The handle of the CUDA device to name.
|
| 87 |
+
* \param name - The name of the CUDA device.
|
| 88 |
+
*
|
| 89 |
+
* \version \NVTX_VERSION_1
|
| 90 |
+
* @{ */
|
| 91 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
|
| 92 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
|
| 93 |
+
/** @} */
|
| 94 |
+
|
| 95 |
+
/* ------------------------------------------------------------------------- */
|
| 96 |
+
/** \brief Annotates a CUDA context.
|
| 97 |
+
*
|
| 98 |
+
* Allows the user to associate a CUDA context with a user-provided name.
|
| 99 |
+
*
|
| 100 |
+
* \param context - The handle of the CUDA context to name.
|
| 101 |
+
* \param name - The name of the CUDA context.
|
| 102 |
+
*
|
| 103 |
+
* \par Example:
|
| 104 |
+
* \code
|
| 105 |
+
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
|
| 106 |
+
* if ( CUDA_SUCCESS != status )
|
| 107 |
+
* goto Error;
|
| 108 |
+
* nvtxNameCuContext(cuContext, "CTX_NAME");
|
| 109 |
+
* \endcode
|
| 110 |
+
*
|
| 111 |
+
* \version \NVTX_VERSION_1
|
| 112 |
+
* @{ */
|
| 113 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
|
| 114 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
|
| 115 |
+
/** @} */
|
| 116 |
+
|
| 117 |
+
/* ------------------------------------------------------------------------- */
|
| 118 |
+
/** \brief Annotates a CUDA stream.
|
| 119 |
+
*
|
| 120 |
+
* Allows the user to associate a CUDA stream with a user-provided name.
|
| 121 |
+
*
|
| 122 |
+
* \param stream - The handle of the CUDA stream to name.
|
| 123 |
+
* \param name - The name of the CUDA stream.
|
| 124 |
+
*
|
| 125 |
+
* \version \NVTX_VERSION_1
|
| 126 |
+
* @{ */
|
| 127 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
|
| 128 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
|
| 129 |
+
/** @} */
|
| 130 |
+
|
| 131 |
+
/* ------------------------------------------------------------------------- */
|
| 132 |
+
/** \brief Annotates a CUDA event.
|
| 133 |
+
*
|
| 134 |
+
* Allows the user to associate a CUDA event with a user-provided name.
|
| 135 |
+
*
|
| 136 |
+
* \param event - The handle of the CUDA event to name.
|
| 137 |
+
* \param name - The name of the CUDA event.
|
| 138 |
+
*
|
| 139 |
+
* \version \NVTX_VERSION_1
|
| 140 |
+
* @{ */
|
| 141 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
|
| 142 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
|
| 143 |
+
/** @} */
|
| 144 |
+
|
| 145 |
+
/** @} */ /* END RESOURCE_NAMING */
|
| 146 |
+
|
| 147 |
+
/* ========================================================================= */
|
| 148 |
+
#ifdef UNICODE
|
| 149 |
+
#define nvtxNameCuDevice nvtxNameCuDeviceW
|
| 150 |
+
#define nvtxNameCuContext nvtxNameCuContextW
|
| 151 |
+
#define nvtxNameCuStream nvtxNameCuStreamW
|
| 152 |
+
#define nvtxNameCuEvent nvtxNameCuEventW
|
| 153 |
+
#else
|
| 154 |
+
#define nvtxNameCuDevice nvtxNameCuDeviceA
|
| 155 |
+
#define nvtxNameCuContext nvtxNameCuContextA
|
| 156 |
+
#define nvtxNameCuStream nvtxNameCuStreamA
|
| 157 |
+
#define nvtxNameCuEvent nvtxNameCuEventA
|
| 158 |
+
#endif
|
| 159 |
+
|
| 160 |
+
#ifdef __cplusplus
|
| 161 |
+
}
|
| 162 |
+
#endif /* __cplusplus */
|
| 163 |
+
|
| 164 |
+
#endif /* NVTOOLSEXT_CUDA_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO USER:
|
| 5 |
+
*
|
| 6 |
+
* This source code is subject to NVIDIA ownership rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* This software and the information contained herein is PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
|
| 11 |
+
* of a form of NVIDIA software license agreement.
|
| 12 |
+
*
|
| 13 |
+
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
| 14 |
+
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
| 15 |
+
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
| 16 |
+
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
| 17 |
+
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 18 |
+
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
| 19 |
+
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
| 20 |
+
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
| 21 |
+
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
| 22 |
+
* OR PERFORMANCE OF THIS SOURCE CODE.
|
| 23 |
+
*
|
| 24 |
+
* U.S. Government End Users. This source code is a "commercial item" as
|
| 25 |
+
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
| 26 |
+
* "commercial computer software" and "commercial computer software
|
| 27 |
+
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
| 28 |
+
* and is provided to the U.S. Government only as a commercial end item.
|
| 29 |
+
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
| 30 |
+
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
| 31 |
+
* source code with only those rights set forth herein.
|
| 32 |
+
*
|
| 33 |
+
* Any use of this source code in individual and commercial software must
|
| 34 |
+
* include, in the user documentation and internal comments to the code,
|
| 35 |
+
* the above Disclaimer and U.S. Government End Users Notice.
|
| 36 |
+
*/
|
| 37 |
+
|
| 38 |
+
#ifndef NVTOOLSEXT_OPENCL_H_
|
| 39 |
+
#define NVTOOLSEXT_OPENCL_H_
|
| 40 |
+
|
| 41 |
+
#include <CL/cl.h>
|
| 42 |
+
|
| 43 |
+
#include "nvToolsExt.h"
|
| 44 |
+
|
| 45 |
+
#ifdef __cplusplus
|
| 46 |
+
extern "C" {
|
| 47 |
+
#endif /* __cplusplus */
|
| 48 |
+
|
| 49 |
+
/* ========================================================================= */
|
| 50 |
+
/** \name Functions for OpenCL Resource Naming
|
| 51 |
+
*/
|
| 52 |
+
/** \addtogroup RESOURCE_NAMING
|
| 53 |
+
* \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
|
| 54 |
+
*
|
| 55 |
+
* This section covers the API functions that allow to annotate OpenCL resources
|
| 56 |
+
* with user-provided names.
|
| 57 |
+
*
|
| 58 |
+
* @{
|
| 59 |
+
*/
|
| 60 |
+
|
| 61 |
+
/* ------------------------------------------------------------------------- */
|
| 62 |
+
/* \cond SHOW_HIDDEN
|
| 63 |
+
* \brief Used to build a non-colliding value for resource types separated class
|
| 64 |
+
* \version \NVTX_VERSION_2
|
| 65 |
+
*/
|
| 66 |
+
#define NVTX_RESOURCE_CLASS_OPENCL 6
|
| 67 |
+
/** \endcond */
|
| 68 |
+
|
| 69 |
+
/* ------------------------------------------------------------------------- */
|
| 70 |
+
/** \brief Resource types for OpenCL
|
| 71 |
+
*/
|
| 72 |
+
typedef enum nvtxResourceOpenCLType_t
|
| 73 |
+
{
|
| 74 |
+
NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
|
| 75 |
+
NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
|
| 76 |
+
NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
|
| 77 |
+
NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
|
| 78 |
+
NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
|
| 79 |
+
NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
|
| 80 |
+
NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7)
|
| 81 |
+
} nvtxResourceOpenCLType_t;
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
/* ------------------------------------------------------------------------- */
|
| 85 |
+
/** \brief Annotates an OpenCL device.
|
| 86 |
+
*
|
| 87 |
+
* Allows to associate an OpenCL device with a user-provided name.
|
| 88 |
+
*
|
| 89 |
+
* \param device - The handle of the OpenCL device to name.
|
| 90 |
+
* \param name - The name of the OpenCL device.
|
| 91 |
+
*
|
| 92 |
+
* \version \NVTX_VERSION_1
|
| 93 |
+
* @{ */
|
| 94 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
|
| 95 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
|
| 96 |
+
/** @} */
|
| 97 |
+
|
| 98 |
+
/* ------------------------------------------------------------------------- */
|
| 99 |
+
/** \brief Annotates an OpenCL context.
|
| 100 |
+
*
|
| 101 |
+
* Allows to associate an OpenCL context with a user-provided name.
|
| 102 |
+
*
|
| 103 |
+
* \param context - The handle of the OpenCL context to name.
|
| 104 |
+
* \param name - The name of the OpenCL context.
|
| 105 |
+
*
|
| 106 |
+
* \version \NVTX_VERSION_1
|
| 107 |
+
* @{ */
|
| 108 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
|
| 109 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
|
| 110 |
+
/** @} */
|
| 111 |
+
|
| 112 |
+
/* ------------------------------------------------------------------------- */
|
| 113 |
+
/** \brief Annotates an OpenCL command queue.
|
| 114 |
+
*
|
| 115 |
+
* Allows to associate an OpenCL command queue with a user-provided name.
|
| 116 |
+
*
|
| 117 |
+
* \param command_queue - The handle of the OpenCL command queue to name.
|
| 118 |
+
* \param name - The name of the OpenCL command queue.
|
| 119 |
+
*
|
| 120 |
+
* \version \NVTX_VERSION_1
|
| 121 |
+
* @{ */
|
| 122 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
|
| 123 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
|
| 124 |
+
/** @} */
|
| 125 |
+
|
| 126 |
+
/* ------------------------------------------------------------------------- */
|
| 127 |
+
/** \brief Annotates an OpenCL memory object.
|
| 128 |
+
*
|
| 129 |
+
* Allows to associate an OpenCL memory object with a user-provided name.
|
| 130 |
+
*
|
| 131 |
+
* \param memobj - The handle of the OpenCL memory object to name.
|
| 132 |
+
* \param name - The name of the OpenCL memory object.
|
| 133 |
+
*
|
| 134 |
+
* \version \NVTX_VERSION_1
|
| 135 |
+
* @{ */
|
| 136 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
|
| 137 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
|
| 138 |
+
/** @} */
|
| 139 |
+
|
| 140 |
+
/* ------------------------------------------------------------------------- */
|
| 141 |
+
/** \brief Annotates an OpenCL sampler.
|
| 142 |
+
*
|
| 143 |
+
* Allows to associate an OpenCL sampler with a user-provided name.
|
| 144 |
+
*
|
| 145 |
+
* \param sampler - The handle of the OpenCL sampler to name.
|
| 146 |
+
* \param name - The name of the OpenCL sampler.
|
| 147 |
+
*
|
| 148 |
+
* \version \NVTX_VERSION_1
|
| 149 |
+
* @{ */
|
| 150 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
|
| 151 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
|
| 152 |
+
/** @} */
|
| 153 |
+
|
| 154 |
+
/* ------------------------------------------------------------------------- */
|
| 155 |
+
/** \brief Annotates an OpenCL program.
|
| 156 |
+
*
|
| 157 |
+
* Allows to associate an OpenCL program with a user-provided name.
|
| 158 |
+
*
|
| 159 |
+
* \param program - The handle of the OpenCL program to name.
|
| 160 |
+
* \param name - The name of the OpenCL program.
|
| 161 |
+
*
|
| 162 |
+
* \code
|
| 163 |
+
* cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
|
| 164 |
+
* (const char **) &cSourceCL, &program_length, &ciErrNum);
|
| 165 |
+
* shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
| 166 |
+
* nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
|
| 167 |
+
* \endcode
|
| 168 |
+
*
|
| 169 |
+
* \version \NVTX_VERSION_1
|
| 170 |
+
* @{ */
|
| 171 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
|
| 172 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
|
| 173 |
+
/** @} */
|
| 174 |
+
|
| 175 |
+
/* ------------------------------------------------------------------------- */
|
| 176 |
+
/** \brief Annotates an OpenCL event.
|
| 177 |
+
*
|
| 178 |
+
* Allows to associate an OpenCL event with a user-provided name.
|
| 179 |
+
*
|
| 180 |
+
* \param evnt - The handle of the OpenCL event to name.
|
| 181 |
+
* \param name - The name of the OpenCL event.
|
| 182 |
+
*
|
| 183 |
+
* \version \NVTX_VERSION_1
|
| 184 |
+
* @{ */
|
| 185 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
|
| 186 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
|
| 187 |
+
/** @} */
|
| 188 |
+
|
| 189 |
+
/** @} */ /* END RESOURCE_NAMING */
|
| 190 |
+
|
| 191 |
+
/* ========================================================================= */
|
| 192 |
+
#ifdef UNICODE
|
| 193 |
+
#define nvtxNameClDevice nvtxNameClDeviceW
|
| 194 |
+
#define nvtxNameClContext nvtxNameClContextW
|
| 195 |
+
#define nvtxNameClCommandQueue nvtxNameClCommandQueueW
|
| 196 |
+
#define nvtxNameClMemObject nvtxNameClMemObjectW
|
| 197 |
+
#define nvtxNameClSampler nvtxNameClSamplerW
|
| 198 |
+
#define nvtxNameClProgram nvtxNameClProgramW
|
| 199 |
+
#define nvtxNameClEvent nvtxNameClEventW
|
| 200 |
+
#else
|
| 201 |
+
#define nvtxNameClDevice nvtxNameClDeviceA
|
| 202 |
+
#define nvtxNameClContext nvtxNameClContextA
|
| 203 |
+
#define nvtxNameClCommandQueue nvtxNameClCommandQueueA
|
| 204 |
+
#define nvtxNameClMemObject nvtxNameClMemObjectA
|
| 205 |
+
#define nvtxNameClSampler nvtxNameClSamplerA
|
| 206 |
+
#define nvtxNameClProgram nvtxNameClProgramA
|
| 207 |
+
#define nvtxNameClEvent nvtxNameClEventA
|
| 208 |
+
#endif
|
| 209 |
+
|
| 210 |
+
#ifdef __cplusplus
|
| 211 |
+
}
|
| 212 |
+
#endif /* __cplusplus */
|
| 213 |
+
|
| 214 |
+
#endif /* NVTOOLSEXT_OPENCL_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO USER:
|
| 5 |
+
*
|
| 6 |
+
* This source code is subject to NVIDIA ownership rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* This software and the information contained herein is PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
|
| 11 |
+
* of a form of NVIDIA software license agreement.
|
| 12 |
+
*
|
| 13 |
+
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
| 14 |
+
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
| 15 |
+
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
| 16 |
+
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
| 17 |
+
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 18 |
+
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
| 19 |
+
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
| 20 |
+
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
| 21 |
+
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
| 22 |
+
* OR PERFORMANCE OF THIS SOURCE CODE.
|
| 23 |
+
*
|
| 24 |
+
* U.S. Government End Users. This source code is a "commercial item" as
|
| 25 |
+
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
| 26 |
+
* "commercial computer software" and "commercial computer software
|
| 27 |
+
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
| 28 |
+
* and is provided to the U.S. Government only as a commercial end item.
|
| 29 |
+
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
| 30 |
+
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
| 31 |
+
* source code with only those rights set forth herein.
|
| 32 |
+
*
|
| 33 |
+
* Any use of this source code in individual and commercial software must
|
| 34 |
+
* include, in the user documentation and internal comments to the code,
|
| 35 |
+
* the above Disclaimer and U.S. Government End Users Notice.
|
| 36 |
+
*/
|
| 37 |
+
|
| 38 |
+
#include "nvToolsExt.h"
|
| 39 |
+
|
| 40 |
+
#include <CL/cl.h>
|
| 41 |
+
|
| 42 |
+
#ifndef NVTOOLSEXT_OPENCL_V3
|
| 43 |
+
#define NVTOOLSEXT_OPENCL_V3
|
| 44 |
+
|
| 45 |
+
#ifdef __cplusplus
|
| 46 |
+
extern "C" {
|
| 47 |
+
#endif /* __cplusplus */
|
| 48 |
+
|
| 49 |
+
/* ========================================================================= */
|
| 50 |
+
/** \name Functions for OpenCL Resource Naming
|
| 51 |
+
*/
|
| 52 |
+
/** \addtogroup RESOURCE_NAMING
|
| 53 |
+
* \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
|
| 54 |
+
*
|
| 55 |
+
* This section covers the API functions that allow to annotate OpenCL resources
|
| 56 |
+
* with user-provided names.
|
| 57 |
+
*
|
| 58 |
+
* @{
|
| 59 |
+
*/
|
| 60 |
+
|
| 61 |
+
/* ------------------------------------------------------------------------- */
|
| 62 |
+
/* \cond SHOW_HIDDEN
|
| 63 |
+
* \brief Used to build a non-colliding value for resource types separated class
|
| 64 |
+
* \version \NVTX_VERSION_2
|
| 65 |
+
*/
|
| 66 |
+
#define NVTX_RESOURCE_CLASS_OPENCL 6
|
| 67 |
+
/** \endcond */
|
| 68 |
+
|
| 69 |
+
/* ------------------------------------------------------------------------- */
|
| 70 |
+
/** \brief Resource types for OpenCL
|
| 71 |
+
*/
|
| 72 |
+
/* Resource type identifiers for OpenCL object handles passed to the
 * nvtxNameCl* functions.  Each value is built with NVTX_RESOURCE_MAKE_TYPE
 * using the OPENCL class (NVTX_RESOURCE_CLASS_OPENCL) so the identifiers
 * cannot collide with resource types of other NVTX classes. */
typedef enum nvtxResourceOpenCLType_t
{
    NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
    NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
    NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
    NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
    NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
    NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
    NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
} nvtxResourceOpenCLType_t;
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
/* ------------------------------------------------------------------------- */
|
| 85 |
+
/** \brief Annotates an OpenCL device.
|
| 86 |
+
*
|
| 87 |
+
* Allows to associate an OpenCL device with a user-provided name.
|
| 88 |
+
*
|
| 89 |
+
* \param device - The handle of the OpenCL device to name.
|
| 90 |
+
* \param name - The name of the OpenCL device.
|
| 91 |
+
*
|
| 92 |
+
* \version \NVTX_VERSION_1
|
| 93 |
+
* @{ */
|
| 94 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
|
| 95 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
|
| 96 |
+
/** @} */
|
| 97 |
+
|
| 98 |
+
/* ------------------------------------------------------------------------- */
|
| 99 |
+
/** \brief Annotates an OpenCL context.
|
| 100 |
+
*
|
| 101 |
+
* Allows to associate an OpenCL context with a user-provided name.
|
| 102 |
+
*
|
| 103 |
+
* \param context - The handle of the OpenCL context to name.
|
| 104 |
+
* \param name - The name of the OpenCL context.
|
| 105 |
+
*
|
| 106 |
+
* \version \NVTX_VERSION_1
|
| 107 |
+
* @{ */
|
| 108 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
|
| 109 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
|
| 110 |
+
/** @} */
|
| 111 |
+
|
| 112 |
+
/* ------------------------------------------------------------------------- */
|
| 113 |
+
/** \brief Annotates an OpenCL command queue.
|
| 114 |
+
*
|
| 115 |
+
* Allows to associate an OpenCL command queue with a user-provided name.
|
| 116 |
+
*
|
| 117 |
+
* \param command_queue - The handle of the OpenCL command queue to name.
|
| 118 |
+
* \param name - The name of the OpenCL command queue.
|
| 119 |
+
*
|
| 120 |
+
* \version \NVTX_VERSION_1
|
| 121 |
+
* @{ */
|
| 122 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
|
| 123 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
|
| 124 |
+
/** @} */
|
| 125 |
+
|
| 126 |
+
/* ------------------------------------------------------------------------- */
|
| 127 |
+
/** \brief Annotates an OpenCL memory object.
|
| 128 |
+
*
|
| 129 |
+
* Allows to associate an OpenCL memory object with a user-provided name.
|
| 130 |
+
*
|
| 131 |
+
* \param memobj - The handle of the OpenCL memory object to name.
|
| 132 |
+
* \param name - The name of the OpenCL memory object.
|
| 133 |
+
*
|
| 134 |
+
* \version \NVTX_VERSION_1
|
| 135 |
+
* @{ */
|
| 136 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
|
| 137 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
|
| 138 |
+
/** @} */
|
| 139 |
+
|
| 140 |
+
/* ------------------------------------------------------------------------- */
|
| 141 |
+
/** \brief Annotates an OpenCL sampler.
|
| 142 |
+
*
|
| 143 |
+
* Allows to associate an OpenCL sampler with a user-provided name.
|
| 144 |
+
*
|
| 145 |
+
* \param sampler - The handle of the OpenCL sampler to name.
|
| 146 |
+
* \param name - The name of the OpenCL sampler.
|
| 147 |
+
*
|
| 148 |
+
* \version \NVTX_VERSION_1
|
| 149 |
+
* @{ */
|
| 150 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
|
| 151 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
|
| 152 |
+
/** @} */
|
| 153 |
+
|
| 154 |
+
/* ------------------------------------------------------------------------- */
|
| 155 |
+
/** \brief Annotates an OpenCL program.
|
| 156 |
+
*
|
| 157 |
+
* Allows to associate an OpenCL program with a user-provided name.
|
| 158 |
+
*
|
| 159 |
+
* \param program - The handle of the OpenCL program to name.
|
| 160 |
+
* \param name - The name of the OpenCL program.
|
| 161 |
+
*
|
| 162 |
+
* \code
|
| 163 |
+
* cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
|
| 164 |
+
* (const char **) &cSourceCL, &program_length, &ciErrNum);
|
| 165 |
+
* shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
| 166 |
+
* nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
|
| 167 |
+
* \endcode
|
| 168 |
+
*
|
| 169 |
+
* \version \NVTX_VERSION_1
|
| 170 |
+
* @{ */
|
| 171 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
|
| 172 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
|
| 173 |
+
/** @} */
|
| 174 |
+
|
| 175 |
+
/* ------------------------------------------------------------------------- */
|
| 176 |
+
/** \brief Annotates an OpenCL event.
|
| 177 |
+
*
|
| 178 |
+
* Allows to associate an OpenCL event with a user-provided name.
|
| 179 |
+
*
|
| 180 |
+
* \param evnt - The handle of the OpenCL event to name.
|
| 181 |
+
* \param name - The name of the OpenCL event.
|
| 182 |
+
*
|
| 183 |
+
* \version \NVTX_VERSION_1
|
| 184 |
+
* @{ */
|
| 185 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
|
| 186 |
+
NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
|
| 187 |
+
/** @} */
|
| 188 |
+
|
| 189 |
+
/** @} */ /* END RESOURCE_NAMING */
|
| 190 |
+
|
| 191 |
+
/* ========================================================================= */
|
| 192 |
+
#ifdef UNICODE
|
| 193 |
+
#define nvtxNameClDevice nvtxNameClDeviceW
|
| 194 |
+
#define nvtxNameClContext nvtxNameClContextW
|
| 195 |
+
#define nvtxNameClCommandQueue nvtxNameClCommandQueueW
|
| 196 |
+
#define nvtxNameClMemObject nvtxNameClMemObjectW
|
| 197 |
+
#define nvtxNameClSampler nvtxNameClSamplerW
|
| 198 |
+
#define nvtxNameClProgram nvtxNameClProgramW
|
| 199 |
+
#define nvtxNameClEvent nvtxNameClEventW
|
| 200 |
+
#else
|
| 201 |
+
#define nvtxNameClDevice nvtxNameClDeviceA
|
| 202 |
+
#define nvtxNameClContext nvtxNameClContextA
|
| 203 |
+
#define nvtxNameClCommandQueue nvtxNameClCommandQueueA
|
| 204 |
+
#define nvtxNameClMemObject nvtxNameClMemObjectA
|
| 205 |
+
#define nvtxNameClSampler nvtxNameClSamplerA
|
| 206 |
+
#define nvtxNameClProgram nvtxNameClProgramA
|
| 207 |
+
#define nvtxNameClEvent nvtxNameClEventA
|
| 208 |
+
#endif
|
| 209 |
+
|
| 210 |
+
#ifdef __cplusplus
|
| 211 |
+
}
|
| 212 |
+
#endif /* __cplusplus */
|
| 213 |
+
|
| 214 |
+
#ifndef NVTX_NO_IMPL
|
| 215 |
+
#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
|
| 216 |
+
#include "nvtxDetail/nvtxImplOpenCL_v3.h"
|
| 217 |
+
#undef NVTX_IMPL_GUARD_OPENCL
|
| 218 |
+
#endif /*NVTX_NO_IMPL*/
|
| 219 |
+
|
| 220 |
+
#endif /* NVTOOLSEXT_OPENCL_V3 */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/WHEEL
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Wheel-Version: 1.0
|
| 2 |
+
Generator: setuptools (74.1.2)
|
| 3 |
+
Root-Is-Purelib: true
|
| 4 |
+
Tag: py3-none-any
|
| 5 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pybind11
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .pyximport import *
|
| 2 |
+
|
| 3 |
+
# replicate docstring
|
| 4 |
+
from .pyximport import __doc__
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport2.cpython-311.pyc
ADDED
|
Binary file (28.6 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyxbuild.cpython-311.pyc
ADDED
|
Binary file (7.1 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/_pyximport3.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Import hooks; when installed with the install() function, these hooks
|
| 3 |
+
allow importing .pyx files as if they were Python modules.
|
| 4 |
+
|
| 5 |
+
If you want the hook installed every time you run Python
|
| 6 |
+
you can add it to your Python version by adding these lines to
|
| 7 |
+
sitecustomize.py (which you can create from scratch in site-packages
|
| 8 |
+
if it doesn't exist there or somewhere else on your python path)::
|
| 9 |
+
|
| 10 |
+
import pyximport
|
| 11 |
+
pyximport.install()
|
| 12 |
+
|
| 13 |
+
For instance on the Mac with a non-system Python 2.3, you could create
|
| 14 |
+
sitecustomize.py with only those two lines at
|
| 15 |
+
/usr/local/lib/python2.3/site-packages/sitecustomize.py .
|
| 16 |
+
|
| 17 |
+
A custom distutils.core.Extension instance and setup() args
|
| 18 |
+
(Distribution) for the build can be defined by a <modulename>.pyxbld
|
| 19 |
+
file like:
|
| 20 |
+
|
| 21 |
+
# examplemod.pyxbld
|
| 22 |
+
def make_ext(modname, pyxfilename):
|
| 23 |
+
from distutils.extension import Extension
|
| 24 |
+
return Extension(name = modname,
|
| 25 |
+
sources=[pyxfilename, 'hello.c'],
|
| 26 |
+
include_dirs=['/myinclude'] )
|
| 27 |
+
def make_setup_args():
|
| 28 |
+
return dict(script_args=["--compiler=mingw32"])
|
| 29 |
+
|
| 30 |
+
Extra dependencies can be defined by a <modulename>.pyxdep .
|
| 31 |
+
See README.
|
| 32 |
+
|
| 33 |
+
Since Cython 0.11, the :mod:`pyximport` module also has experimental
|
| 34 |
+
compilation support for normal Python modules. This allows you to
|
| 35 |
+
automatically run Cython on every .pyx and .py module that Python
|
| 36 |
+
imports, including parts of the standard library and installed
|
| 37 |
+
packages. Cython will still fail to compile a lot of Python modules,
|
| 38 |
+
in which case the import mechanism will fall back to loading the
|
| 39 |
+
Python source modules instead. The .py import mechanism is installed
|
| 40 |
+
like this::
|
| 41 |
+
|
| 42 |
+
pyximport.install(pyimport = True)
|
| 43 |
+
|
| 44 |
+
Running this module as a top-level script will run a test and then print
|
| 45 |
+
the documentation.
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
import glob
|
| 49 |
+
import importlib
|
| 50 |
+
import os
|
| 51 |
+
import sys
|
| 52 |
+
from importlib.abc import MetaPathFinder
|
| 53 |
+
from importlib.machinery import ExtensionFileLoader, SourceFileLoader
|
| 54 |
+
from importlib.util import spec_from_file_location
|
| 55 |
+
|
| 56 |
+
mod_name = "pyximport"
|
| 57 |
+
|
| 58 |
+
PY_EXT = ".py"
|
| 59 |
+
PYX_EXT = ".pyx"
|
| 60 |
+
PYXDEP_EXT = ".pyxdep"
|
| 61 |
+
PYXBLD_EXT = ".pyxbld"
|
| 62 |
+
|
| 63 |
+
DEBUG_IMPORT = False
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _print(message, args):
    """Print *message*, %-formatting it with *args* only when args are given."""
    formatted = message % args if args else message
    print(formatted)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _debug(message, *args):
    """Emit *message* through _print, but only when DEBUG_IMPORT is enabled."""
    if not DEBUG_IMPORT:
        return
    _print(message, args)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _info(message, *args):
    """Unconditionally emit *message* through _print with lazy %-args."""
    _print(message, args)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def load_source(file_path):
    """Load and execute a Python source file; return the module object.

    Replacement for the removed ``imp.load_source``: builds an import spec
    for *file_path*, creates a fresh module from it and runs the code.
    The module is NOT inserted into sys.modules.
    """
    import importlib.util
    from importlib.machinery import SourceFileLoader

    loader = SourceFileLoader("XXXX", file_path)
    spec = importlib.util.spec_from_file_location("XXXX", file_path, loader=loader)
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def get_distutils_extension(modname, pyxfilename, language_level=None):
    """Return an ``(Extension, setup_args)`` pair for building *pyxfilename*.

    A user-supplied .pyxbld file (consulted via handle_special_build) takes
    precedence; otherwise a plain Extension over the single source file is
    created.  *language_level* (2 or 3), when given, is stored as a Cython
    directive on the Extension.
    """
    extension_mod, setup_args = handle_special_build(modname, pyxfilename)

    if not extension_mod:
        if not isinstance(pyxfilename, str):
            # distutils historically required exactly 'str'; re-encode any
            # accidentally coerced unicode path back to bytes.
            pyxfilename = pyxfilename.encode(sys.getfilesystemencoding())
        from distutils.extension import Extension
        extension_mod = Extension(name=modname, sources=[pyxfilename])

    if language_level is not None:
        extension_mod.cython_directives = {'language_level': language_level}
    return extension_mod, setup_args
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def handle_special_build(modname, pyxfilename):
    """Load an optional ``<module>.pyxbld`` file next to *pyxfilename*.

    The .pyxbld file may define ``make_ext(modname, pyxfilename)`` returning
    a distutils Extension, and/or ``make_setup_args()`` returning a dict of
    ``setup()`` keyword arguments.  Returns ``(extension_or_None, setup_args)``.

    Raises AssertionError when the .pyxbld exists but defines neither hook,
    or when a hook returns a value of the wrong type.
    """
    special_build = os.path.splitext(pyxfilename)[0] + PYXBLD_EXT
    ext = None
    setup_args = {}
    if os.path.exists(special_build):
        mod = load_source(special_build)
        make_ext = getattr(mod, 'make_ext', None)
        if make_ext:
            ext = make_ext(modname, pyxfilename)
            assert ext and ext.sources, "make_ext in %s did not return Extension" % special_build
        make_setup_args = getattr(mod, 'make_setup_args', None)
        if make_setup_args:
            setup_args = make_setup_args()
            assert isinstance(setup_args, dict), ("make_setup_args in %s did not return a dict"
                                                  % special_build)
        assert ext or setup_args, ("neither make_ext nor make_setup_args %s"
                                   % special_build)
        if ext:
            # BUG FIX: only rewrite the source paths when make_ext produced
            # an Extension.  Previously a .pyxbld that defined only
            # make_setup_args crashed here with AttributeError on None.
            ext.sources = [os.path.join(os.path.dirname(special_build), source)
                           for source in ext.sources]
    return ext, setup_args
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def handle_dependencies(pyxfilename):
    """Force a rebuild of *pyxfilename* when a ``.pyxdep`` dependency is newer.

    distutils normally decides on its own whether a rebuild is needed (it
    knows the output file), but a ``<module>.pyxdep`` file can list extra
    dependencies (glob patterns, one per line, relative to the dep file).
    When any of them is newer than the .pyx file, the .pyx mtime is bumped
    so distutils is tricked into rebuilding it.
    """
    testing = '_test_files' in globals()
    dependfile = os.path.splitext(pyxfilename)[0] + PYXDEP_EXT

    if not os.path.exists(dependfile):
        return

    with open(dependfile) as fid:
        depends = [line.strip() for line in fid.readlines()]

    # Gather dependencies; the dependency file itself is also a dependency.
    files = [dependfile]
    for depend in depends:
        pattern = os.path.join(os.path.dirname(dependfile), depend)
        files.extend(glob.glob(pattern))

    if testing:
        # only for unit testing to see we did the right thing
        _test_files[:] = []  # $pycheck_no

    from distutils.dep_util import newer
    for file in files:
        if newer(file, pyxfilename):
            _debug("Rebuilding %s because of %s", pyxfilename, file)
            filetime = os.path.getmtime(file)
            os.utime(pyxfilename, (filetime, filetime))
            if testing:
                _test_files.append(file)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def build_module(name, pyxfilename, pyxbuild_dir=None, inplace=False, language_level=None):
    """Compile *pyxfilename* into an extension module named *name*.

    Returns the path of the resulting shared library.  Uses the module-global
    ``pyxargs`` (set by install()) for build options; a .pyxbld next to the
    source may override the Extension and setup() arguments.
    """
    assert os.path.exists(pyxfilename), "Path does not exist: %s" % pyxfilename
    handle_dependencies(pyxfilename)

    extension_mod, setup_args = get_distutils_extension(name, pyxfilename, language_level)
    build_in_temp = pyxargs.build_in_temp
    sargs = pyxargs.setup_args.copy()
    sargs.update(setup_args)
    # A .pyxbld's setup_args may override build_in_temp.
    build_in_temp = sargs.pop('build_in_temp', build_in_temp)

    from . import pyxbuild
    olddir = os.getcwd()
    common = ''
    if pyxbuild_dir and sys.platform == 'win32':
        # Windows concatenates the pyxbuild_dir to the pyxfilename when
        # compiling, and then complains that the filename is too long
        common = os.path.commonprefix([pyxbuild_dir, pyxfilename])
    if len(common) > 30:
        # Shorten both paths by chdir'ing into their common prefix.
        pyxfilename = os.path.relpath(pyxfilename, common)
        pyxbuild_dir = os.path.relpath(pyxbuild_dir, common)
        os.chdir(common)
    try:
        so_path = pyxbuild.pyx_to_dll(pyxfilename, extension_mod,
                                      build_in_temp=build_in_temp,
                                      pyxbuild_dir=pyxbuild_dir,
                                      setup_args=sargs,
                                      inplace=inplace,
                                      reload_support=pyxargs.reload_support)
    finally:
        # Always restore the working directory, even when the build fails.
        os.chdir(olddir)
    so_path = os.path.join(common, so_path)
    assert os.path.exists(so_path), "Cannot find: %s" % so_path

    # Remove stale build artifacts (e.g. old reload copies) next to the
    # freshly built library.
    junkpath = os.path.join(os.path.dirname(so_path), name+"_*") #very dangerous with --inplace ? yes, indeed, trying to eat my files ;)
    junkstuff = glob.glob(junkpath)
    for path in junkstuff:
        if path != so_path:
            try:
                os.remove(path)
            except IOError:
                # Best-effort cleanup; a locked file is not fatal.
                _info("Couldn't remove %s", path)

    return so_path
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# import hooks
|
| 221 |
+
|
| 222 |
+
class PyxImportMetaFinder(MetaPathFinder):
    """sys.meta_path finder that locates ``.pyx`` sources for cythonized import."""

    def __init__(self, extension=PYX_EXT, pyxbuild_dir=None, inplace=False, language_level=None):
        self.pyxbuild_dir = pyxbuild_dir
        self.inplace = inplace
        self.language_level = language_level
        self.extension = extension

    def find_spec(self, fullname, path, target=None):
        search_paths = path if path else [os.getcwd()]  # top level import --
        leaf = fullname.rsplit(".", 1)[-1]
        for entry in search_paths:
            pkg_dir = os.path.join(entry, leaf)
            if os.path.isdir(pkg_dir):
                # This module has child modules: look for a package __init__.
                candidate = os.path.join(pkg_dir, "__init__" + self.extension)
                sub_locations = [pkg_dir]
            else:
                candidate = os.path.join(entry, leaf + self.extension)
                sub_locations = None
            if not os.path.exists(candidate):
                continue

            loader = PyxImportLoader(candidate, self.pyxbuild_dir,
                                     self.inplace, self.language_level)
            return spec_from_file_location(
                fullname, candidate,
                loader=loader,
                submodule_search_locations=sub_locations)

        return None  # we don't know how to import this
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
class PyImportMetaFinder(MetaPathFinder):
    """Experimental sys.meta_path finder that cythonizes plain ``.py`` modules."""

    def __init__(self, extension=PY_EXT, pyxbuild_dir=None, inplace=False, language_level=None):
        self.pyxbuild_dir = pyxbuild_dir
        self.inplace = inplace
        self.language_level = language_level
        self.extension = extension
        self.uncompilable_modules = {}
        # Modules/packages that must never be cythonized; compiling the
        # compiler's own imports would recurse forever.
        self.blocked_modules = ['Cython', 'pyxbuild', 'pyximport.pyxbuild',
                                'distutils', 'cython']
        self.blocked_packages = ['Cython.', 'distutils.']

    def find_spec(self, fullname, path, target=None):
        if fullname in sys.modules:
            return None
        if any(fullname.startswith(pkg) for pkg in self.blocked_packages):
            return None
        if fullname in self.blocked_modules:
            # prevent infinite recursion
            return None

        # Block this name for the duration of the search, then unblock.
        self.blocked_modules.append(fullname)
        search_paths = path if path else [os.getcwd()]  # top level import --
        try:
            for entry in search_paths:
                pkg_dir = os.path.join(entry, fullname)
                if os.path.isdir(pkg_dir):
                    # This module has child modules: look for a package __init__.
                    candidate = os.path.join(pkg_dir, "__init__" + self.extension)
                    sub_locations = [pkg_dir]
                else:
                    candidate = os.path.join(entry, fullname + self.extension)
                    sub_locations = None
                if not os.path.exists(candidate):
                    continue

                return spec_from_file_location(
                    fullname, candidate,
                    loader=PyxImportLoader(candidate, self.pyxbuild_dir,
                                           self.inplace, self.language_level),
                    submodule_search_locations=sub_locations)
        finally:
            self.blocked_modules.pop()

        return None  # we don't know how to import this
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
class PyxImportLoader(ExtensionFileLoader):
    """Loader that builds a .pyx/.py source into an extension module on demand,
    then loads the compiled library."""

    def __init__(self, filename, pyxbuild_dir, inplace, language_level):
        module_name = os.path.splitext(os.path.basename(filename))[0]
        super().__init__(module_name, filename)
        self._pyxbuild_dir = pyxbuild_dir
        self._inplace = inplace
        self._language_level = language_level

    def create_module(self, spec):
        try:
            # Compile first, then point the loader/spec at the built library.
            so_path = build_module(spec.name, pyxfilename=spec.origin,
                                   pyxbuild_dir=self._pyxbuild_dir,
                                   inplace=self._inplace,
                                   language_level=self._language_level)
            self.path = so_path
            spec.origin = so_path
            return super().create_module(spec)
        except Exception as failure_exc:
            _debug("Failed to load extension module: %r" % failure_exc)
            if pyxargs.load_py_module_on_import_failure and spec.origin.endswith(PY_EXT):
                # Fall back to importing the plain Python source instead.
                spec = importlib.util.spec_from_file_location(
                    spec.name, spec.origin,
                    loader=SourceFileLoader(spec.name, spec.origin))
                mod = importlib.util.module_from_spec(spec)
                assert mod.__file__ in (spec.origin, spec.origin + 'c', spec.origin + 'o'), (mod.__file__, spec.origin)
                return mod
            else:
                # Re-raise as ImportError, keeping the original traceback.
                tb = sys.exc_info()[2]
                import traceback
                exc = ImportError("Building module %s failed: %s" % (
                    spec.name, traceback.format_exception_only(*sys.exc_info()[:2])))
                raise exc.with_traceback(tb)

    def exec_module(self, module):
        try:
            return super().exec_module(module)
        except Exception as failure_exc:
            import traceback
            _debug("Failed to load extension module: %r" % failure_exc)
            raise ImportError("Executing module %s failed %s" % (
                module.__file__, traceback.format_exception_only(*sys.exc_info()[:2])))
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
#install args
class PyxArgs(object):
    """Holder for the options given to install(); a singleton instance is
    stored in the module-global ``pyxargs`` name and read by build_module()
    and PyxImportLoader."""
    build_dir=True
    build_in_temp=True
    setup_args={} #None
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def _have_importers():
    """Report which of our import hooks are already on sys.meta_path.

    Returns ``(has_py_importer, has_pyx_importer)``.

    BUG FIX: the previous code nested ``isinstance(importer,
    PyImportMetaFinder)`` inside ``isinstance(importer,
    PyxImportMetaFinder)``, but in this module PyImportMetaFinder is NOT a
    subclass of PyxImportMetaFinder (both derive from MetaPathFinder), so
    has_py_importer could never become True and install(pyimport=True)
    would stack duplicate hooks.  Check each class independently.
    """
    has_py_importer = False
    has_pyx_importer = False
    for importer in sys.meta_path:
        if isinstance(importer, PyImportMetaFinder):
            has_py_importer = True
        elif isinstance(importer, PyxImportMetaFinder):
            has_pyx_importer = True

    return has_py_importer, has_pyx_importer
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def install(pyximport=True, pyimport=False, build_dir=None, build_in_temp=True,
            setup_args=None, reload_support=False,
            load_py_module_on_import_failure=False, inplace=False,
            language_level=None):
    """ Main entry point for pyxinstall.

    Call this to install the ``.pyx`` import hook in
    your meta-path for a single Python process.  If you want it to be
    installed whenever you use Python, add it to your ``sitecustomize``
    (as described above).

    :param pyximport: If set to False, does not try to import ``.pyx`` files.

    :param pyimport: You can pass ``pyimport=True`` to also
        install the ``.py`` import hook
        in your meta-path.  Note, however, that it is rather experimental,
        will not work at all for some ``.py`` files and packages, and will
        heavily slow down your imports due to search and compilation.
        Use at your own risk.

    :param build_dir: By default, compiled modules will end up in a ``.pyxbld``
        directory in the user's home directory.  Passing a different path
        as ``build_dir`` will override this.

    :param build_in_temp: If ``False``, will produce the C files locally. Working
        with complex dependencies and debugging becomes more easy. This
        can principally interfere with existing files of the same name.

    :param setup_args: Dict of arguments for Distribution.
        See ``distutils.core.setup()``.

    :param reload_support: Enables support for dynamic
        ``reload(my_module)``, e.g. after a change in the Cython code.
        Additional files ``<so_path>.reloadNN`` may arise on that account, when
        the previously loaded module file cannot be overwritten.

    :param load_py_module_on_import_failure: If the compilation of a ``.py``
        file succeeds, but the subsequent import fails for some reason,
        retry the import with the normal ``.py`` module instead of the
        compiled module.  Note that this may lead to unpredictable results
        for modules that change the system state during their import, as
        the second import will rerun these modifications in whatever state
        the system was left after the import of the compiled module
        failed.

    :param inplace: Install the compiled module
        (``.so`` for Linux and Mac / ``.pyd`` for Windows)
        next to the source file.

    :param language_level: The source language level to use: 2 or 3.
        The default is to use the language level of the current Python
        runtime for .py files and Py2 for ``.pyx`` files.
    """
    if setup_args is None:
        setup_args = {}
    if not build_dir:
        # Default build location: ~/.pyxbld
        build_dir = os.path.join(os.path.expanduser('~'), '.pyxbld')

    # Publish the options through the module-global pyxargs singleton,
    # read later by build_module() and PyxImportLoader.
    global pyxargs
    pyxargs = PyxArgs()  #$pycheck_no
    pyxargs.build_dir = build_dir
    pyxargs.build_in_temp = build_in_temp
    pyxargs.setup_args = (setup_args or {}).copy()
    pyxargs.reload_support = reload_support
    pyxargs.load_py_module_on_import_failure = load_py_module_on_import_failure

    # Avoid installing the same hook twice.
    has_py_importer, has_pyx_importer = _have_importers()
    py_importer, pyx_importer = None, None

    if pyimport and not has_py_importer:
        py_importer = PyImportMetaFinder(pyxbuild_dir=build_dir, inplace=inplace,
                                         language_level=language_level)
        # make sure we import Cython before we install the import hook
        import Cython.Compiler.Main, Cython.Compiler.Pipeline, Cython.Compiler.Optimize
        sys.meta_path.insert(0, py_importer)

    if pyximport and not has_pyx_importer:
        pyx_importer = PyxImportMetaFinder(pyxbuild_dir=build_dir, inplace=inplace,
                                           language_level=language_level)
        sys.meta_path.append(pyx_importer)

    return py_importer, pyx_importer
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
def uninstall(py_importer, pyx_importer):
    """
    Uninstall an import hook.

    Hooks that are not (or no longer) on sys.meta_path are ignored.
    """
    for hook in (py_importer, pyx_importer):
        try:
            sys.meta_path.remove(hook)
        except ValueError:
            pass
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
# MAIN
|
| 464 |
+
|
| 465 |
+
def show_docs():
    """Display this module's pydoc help as if it were imported as *mod_name*.

    Temporarily rebrands ``__main__`` (and every rebindable attribute on it)
    with the package's module name so ``help()`` renders sensible headings.
    """
    import __main__
    __main__.__name__ = mod_name
    for attr_name in dir(__main__):
        attr = getattr(__main__, attr_name)
        try:
            attr.__module__ = mod_name
        except (AttributeError, TypeError):
            # Builtins and immutable objects reject __module__ assignment.
            pass
    help(__main__)
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
# Running this file directly just prints the module documentation.
if __name__ == '__main__':
    show_docs()
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This makes the functions in torch._C._VariableFunctions available as
|
| 3 |
+
torch._VF.<funcname>
|
| 4 |
+
without mypy being able to find them.
|
| 5 |
+
|
| 6 |
+
A subset of those functions are mapped to ATen functions in
|
| 7 |
+
torch/jit/_builtins.py
|
| 8 |
+
|
| 9 |
+
See https://github.com/pytorch/pytorch/issues/21478 for the reason for
|
| 10 |
+
introducing torch._VF
|
| 11 |
+
|
| 12 |
+
"""
|
| 13 |
+
import sys
|
| 14 |
+
import types
|
| 15 |
+
|
| 16 |
+
import torch
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class VFModule(types.ModuleType):
    """Module proxy that forwards attribute lookups to the C extension.

    Any name not found on the module object itself is resolved against
    ``torch._C._VariableFunctions``, making its functions available as
    ``torch._VF.<funcname>``.
    """

    # The wrapped C module holding the variable functions.
    vf: types.ModuleType

    def __init__(self, name):
        super().__init__(name)
        self.vf = torch._C._VariableFunctions

    def __getattr__(self, attr):
        # Only invoked for names missing from the module dict; delegate.
        return getattr(self.vf, attr)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Replace this module object in sys.modules with the proxy so that
# `torch._VF.<name>` resolves dynamically through VFModule.__getattr__.
sys.modules[__name__] = VFModule(__name__)
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_classes.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import types
|
| 2 |
+
|
| 3 |
+
import torch._C
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class _ClassNamespace(types.ModuleType):
|
| 7 |
+
def __init__(self, name):
|
| 8 |
+
super().__init__("torch.classes" + name)
|
| 9 |
+
self.name = name
|
| 10 |
+
|
| 11 |
+
def __getattr__(self, attr):
|
| 12 |
+
proxy = torch._C._get_custom_class_python_wrapper(self.name, attr)
|
| 13 |
+
if proxy is None:
|
| 14 |
+
raise RuntimeError(f"Class {self.name}.{attr} not registered!")
|
| 15 |
+
return proxy
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class _Classes(types.ModuleType):
    """The ``torch.classes`` pseudo-module.

    Accessing an attribute lazily creates a ``_ClassNamespace`` for that
    name and caches it on the instance, so subsequent lookups bypass
    ``__getattr__`` entirely.
    """

    __file__ = "_classes.py"

    def __init__(self):
        super().__init__("torch.classes")

    def __getattr__(self, name):
        # First access for this namespace: build, cache, return.
        namespace = _ClassNamespace(name)
        setattr(self, name, namespace)
        return namespace

    @property
    def loaded_libraries(self):
        # The ops registry tracks every shared object loaded so far.
        return torch.ops.loaded_libraries

    def load_library(self, path):
        """Load a shared library into the current process.

        The library may run static registration code for custom classes
        with the PyTorch JIT runtime; compile the class plus its
        registration code into a shared object and pass its path here
        (``torch.classes.load_library('path/to/libcustom.so')``).

        Once loaded, the path is recorded in
        ``torch.classes.loaded_libraries``, a set that can be inspected
        for every library loaded through this function.

        Args:
            path (str): A path to a shared library to load.
        """
        torch.ops.load_library(path)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# The classes "namespace"
# Module-level singleton exposed as `torch.classes`; namespaces are
# materialized lazily via _Classes.__getattr__.
classes = _Classes()
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_deploy.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch.package import Importer, OrderedImporter, PackageImporter, sys_importer
|
| 5 |
+
from torch.package._package_pickler import create_pickler
|
| 6 |
+
from torch.package._package_unpickler import PackageUnpickler
|
| 7 |
+
from torch.serialization import _maybe_decode_ascii
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _save_storages(importer, obj):
    """Pickle `obj` for torch::deploy, externalizing storages and reduces.

    Returns a 4-tuple of (pickle bytes, storages referenced by the pickle,
    their dtypes, and the zip reader of the source package or None).
    """
    # Out-of-band tables referenced by index from the pickle stream.
    serialized_storages = []
    serialized_dtypes = []

    # Only a PackageImporter contributes package context; anything else
    # degrades to the plain system importer.
    importer = importer if isinstance(importer, torch.package.PackageImporter) else None
    importers: Importer
    if importer is not None:
        importers = OrderedImporter(importer, sys_importer)
    else:
        importers = sys_importer

    def persistent_id(obj):
        # Pickler hook: return a marker tuple for objects serialized
        # out of band, or None to pickle the object normally.
        if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage):
            if isinstance(obj, torch.storage.TypedStorage):
                # TODO: Once we decide to break serialization FC, we can
                # remove this case
                storage = obj._untyped_storage
                dtype = obj.dtype
            else:
                storage = obj
                dtype = torch.uint8

            # NOTE(review): `storage` is computed but the original object is
            # appended; _load_storages later calls .untyped() on the entry.
            serialized_storages.append(obj)
            serialized_dtypes.append(dtype)
            return ("storage", len(serialized_storages) - 1)

        if hasattr(obj, "__reduce_deploy__"):
            # Memoize per object identity so repeated references share
            # one reduce record (cache is module-global).
            if _serialized_reduces.get(id(obj)) is None:
                _serialized_reduces[id(obj)] = (
                    "reduce_deploy",
                    id(obj),
                    *obj.__reduce_deploy__(importers),
                )
            return _serialized_reduces[id(obj)]

        return None

    # Write the pickle data for `obj`
    data_buf = io.BytesIO()
    pickler = create_pickler(data_buf, importers)
    pickler.persistent_id = persistent_id
    pickler.dump(obj)
    data_value = data_buf.getvalue()
    return (
        data_value,
        serialized_storages,
        serialized_dtypes,
        importer.zip_reader if importer else None,
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _load_storages(id, zip_reader, obj_bytes, serialized_storages, serialized_dtypes):
    """Unpickle bytes produced by _save_storages, rehydrating storages.

    The result is cached in the module-global `_deploy_objects` under `id`
    (note: parameter shadows the builtin; kept for interface compatibility).
    """
    def persistent_load(saved_id):
        # Unpickler hook: resolve the marker tuples emitted by
        # _save_storages.persistent_id.
        assert isinstance(saved_id, tuple)
        typename = _maybe_decode_ascii(saved_id[0])
        data = saved_id[1:]

        if typename == "storage":
            # TODO: Once we decide to break serialization FC, we can
            # stop wrapping with TypedStorage
            storage = serialized_storages[data[0]]
            dtype = serialized_dtypes[data[0]]
            return torch.storage.TypedStorage(
                wrap_storage=storage.untyped(), dtype=dtype
            )

        if typename == "reduce_deploy":
            # Re-run the reduce callable at most once per reduce_id;
            # results are memoized in the module-global cache.
            reduce_id, func, args = data
            if reduce_id not in _loaded_reduces:
                _loaded_reduces[reduce_id] = func(_raw_packages[zip_reader], *args)
            return _loaded_reduces[reduce_id]

        return None

    # With a zip reader, prefer the package's importer, falling back to
    # the system importer; otherwise use the system importer alone.
    importer: Importer
    if zip_reader is not None:
        importer = OrderedImporter(_get_package(zip_reader), sys_importer)
    else:
        importer = sys_importer

    unpickler = PackageUnpickler(importer, io.BytesIO(obj_bytes))
    unpickler.persistent_load = persistent_load  # type: ignore[method-assign]
    result = _deploy_objects[id] = unpickler.load()
    return result
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _get_package(zip_reader):
    """Return the cached PackageImporter for `zip_reader`, creating it once."""
    try:
        return _raw_packages[zip_reader]
    except KeyError:
        # First sighting of this reader: open the package and cache it.
        package = _raw_packages[zip_reader] = PackageImporter(zip_reader)
        return package
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# PackageImporter instances keyed by zip reader (see _get_package).
_raw_packages: dict = {}
# Unpickled results keyed by deserialization id (see _load_storages).
_deploy_objects: dict = {}
# Memoized __reduce_deploy__ records keyed by id(obj) (see _save_storages).
_serialized_reduces: dict = {}
# Memoized outputs of applied "reduce_deploy" records (see _load_storages).
_loaded_reduces: dict = {}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_linalg_utils.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Various linear algebra utility methods for internal use.
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from torch import Tensor
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def is_sparse(A):
    """Check if tensor A is a sparse tensor"""
    if not isinstance(A, torch.Tensor):
        msg = "expected Tensor"
        # TorchScript cannot format arbitrary types, so the detail is
        # appended only in eager mode.
        if not torch.jit.is_scripting():
            msg += f" but got {type(A)}"
        raise TypeError(msg)
    return A.layout == torch.sparse_coo
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def get_floating_dtype(A):
    """Return the floating point dtype of tensor A.

    Integer types map to float32.
    """
    floating = (torch.float16, torch.float32, torch.float64)
    dtype = A.dtype
    return dtype if dtype in floating else torch.float32
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def matmul(A: Optional[Tensor], B: Tensor) -> Tensor:
    """Multiply two matrices.

    If A is None, return B. A can be sparse or dense. B is always
    dense.
    """
    if A is None:
        return B
    # Sparse operands need the dedicated sparse kernel.
    mm = torch.sparse.mm if is_sparse(A) else torch.matmul
    return mm(A, B)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def conjugate(A):
    """Return conjugate of tensor A.

    .. note:: If A's dtype is not complex, A is returned.
    """
    return A.conj() if A.is_complex() else A
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def transpose(A):
    """Return transpose of a matrix or batches of matrices."""
    # Swap the last two dimensions, leaving batch dimensions intact.
    rank = len(A.shape)
    return A.transpose(rank - 1, rank - 2)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def transjugate(A):
    """Return transpose conjugate of a matrix or batches of matrices."""
    flipped = transpose(A)
    return conjugate(flipped)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def bform(X: Tensor, A: Optional[Tensor], Y: Tensor) -> Tensor:
    """Return bilinear form of matrices: :math:`X^T A Y`."""
    # A may be None (treated as identity by matmul) or sparse.
    AY = matmul(A, Y)
    return matmul(transpose(X), AY)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def qform(A: Optional[Tensor], S: Tensor):
    """Return quadratic form :math:`S^T A S` (bilinear form with X = Y = S)."""
    return bform(S, A, S)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def basis(A):
    """Return orthogonal basis of A columns (the Q factor of its QR)."""
    Q, _ = torch.linalg.qr(A)
    return Q
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def symeig(A: Tensor, largest: Optional[bool] = False) -> Tuple[Tensor, Tensor]:
    """Return eigenpairs of A with specified ordering.

    ``torch.linalg.eigh`` yields eigenvalues in ascending order; when
    ``largest`` is truthy both outputs are reversed along the last axis.
    """
    if largest is None:
        largest = False
    eigenvalues, eigenvectors = torch.linalg.eigh(A, UPLO="U")
    # assuming that the eigenvalues come back ordered (ascending)
    if largest:
        eigenvalues = eigenvalues.flip(-1)
        eigenvectors = eigenvectors.flip(-1)
    return eigenvalues, eigenvectors
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# These functions were deprecated and removed
|
| 95 |
+
# This nice error message can be removed in version 1.13+
|
| 96 |
+
def matrix_rank(input, tol=None, symmetric=False, *, out=None) -> Tensor:
    """Removed API stub; always raises with migration guidance."""
    message = (
        "This function was deprecated since version 1.9 and is now removed.\n"
        "Please use the `torch.linalg.matrix_rank` function instead. "
        "The parameter 'symmetric' was renamed in `torch.linalg.matrix_rank()` to 'hermitian'."
    )
    raise RuntimeError(message)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def solve(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]:
    """Removed API stub; always raises with migration guidance."""
    message = (
        "This function was deprecated since version 1.9 and is now removed. "
        "`torch.solve` is deprecated in favor of `torch.linalg.solve`. "
        "`torch.linalg.solve` has its arguments reversed and does not return the LU factorization.\n\n"
        "To get the LU factorization see `torch.lu`, which can be used with `torch.lu_solve` or `torch.lu_unpack`.\n"
        "X = torch.solve(B, A).solution "
        "should be replaced with:\n"
        "X = torch.linalg.solve(A, B)"
    )
    raise RuntimeError(message)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def lstsq(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]:
    """Removed API stub; always raises with migration guidance."""
    message = (
        "This function was deprecated since version 1.9 and is now removed. "
        "`torch.lstsq` is deprecated in favor of `torch.linalg.lstsq`.\n"
        "`torch.linalg.lstsq` has reversed arguments and does not return the QR decomposition in "
        "the returned tuple (although it returns other information about the problem).\n\n"
        "To get the QR decomposition consider using `torch.linalg.qr`.\n\n"
        "The returned solution in `torch.lstsq` stored the residuals of the solution in the "
        "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, "
        "the residuals are in the field 'residuals' of the returned named tuple.\n\n"
        "The unpacking of the solution, as in\n"
        "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n"
        "should be replaced with:\n"
        "X = torch.linalg.lstsq(A, B).solution"
    )
    raise RuntimeError(message)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _symeig(
|
| 134 |
+
input, eigenvectors=False, upper=True, *, out=None
|
| 135 |
+
) -> Tuple[Tensor, Tensor]:
|
| 136 |
+
raise RuntimeError(
|
| 137 |
+
"This function was deprecated since version 1.9 and is now removed. "
|
| 138 |
+
"The default behavior has changed from using the upper triangular portion of the matrix by default "
|
| 139 |
+
"to using the lower triangular portion.\n\n"
|
| 140 |
+
"L, _ = torch.symeig(A, upper=upper) "
|
| 141 |
+
"should be replaced with:\n"
|
| 142 |
+
"L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n\n"
|
| 143 |
+
"and\n\n"
|
| 144 |
+
"L, V = torch.symeig(A, eigenvectors=True) "
|
| 145 |
+
"should be replaced with:\n"
|
| 146 |
+
"L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')"
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def eig(
    self: Tensor, eigenvectors: bool = False, *, e=None, v=None
) -> Tuple[Tensor, Tensor]:
    """Removed API stub; always raises with migration guidance."""
    message = (
        "This function was deprecated since version 1.9 and is now removed. "
        "`torch.linalg.eig` returns complex tensors of dtype `cfloat` or `cdouble` rather than real tensors "
        "mimicking complex tensors.\n\n"
        "L, _ = torch.eig(A) "
        "should be replaced with:\n"
        "L_complex = torch.linalg.eigvals(A)\n\n"
        "and\n\n"
        "L, V = torch.eig(A, eigenvectors=True) "
        "should be replaced with:\n"
        "L_complex, V_complex = torch.linalg.eig(A)"
    )
    raise RuntimeError(message)
|