koichi12 committed on Feb 12, 2025

Commit

eda6db7

verified ·

1 Parent(s): b891f5b

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__init__.py +51 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_util.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_windows.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_unix.py +65 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_util.py +47 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_windows.py +65 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc +3 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h +100 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h +1083 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h +112 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h +690 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h +825 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h +71 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h +65 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h +659 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h +642 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp +1546 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h +373 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h +148 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h +81 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h +65 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h +60 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.h +123 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h +281 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.h +175 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h +93 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h +697 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h +127 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h +194 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h +164 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h +214 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h +220 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/WHEEL +5 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/top_level.txt +1 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__init__.py +4 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport2.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyxbuild.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/_pyximport3.py +478 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.py +30 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_classes.py +55 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_deploy.py +105 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_linalg_utils.py +164 -0

.gitattributes CHANGED Viewed

@@ -62,3 +62,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__py
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text

 tuning-competition-baseline/.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/__pycache__/Nodes.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/torchgen/__pycache__/model.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text

tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__init__.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""
+A platform independent file lock that supports the with-statement.
+.. autodata:: filelock.__version__
+   :no-value:
+"""
+from __future__ import annotations
+import sys
+import warnings
+from typing import TYPE_CHECKING
+from ._api import AcquireReturnProxy, BaseFileLock
+from ._error import Timeout
+from ._soft import SoftFileLock
+from ._unix import UnixFileLock, has_fcntl
+from ._windows import WindowsFileLock
+from .version import version
+#: version of the project as a string
+__version__: str = version
+# Choose the lock class for the running platform: WindowsFileLock on win32,
+# UnixFileLock elsewhere when has_fcntl is true, otherwise SoftFileLock
+# (emitting a warning when the warnings module is available).
+if sys.platform == "win32":  # pragma: win32 cover
+    _FileLock: type[BaseFileLock] = WindowsFileLock
+else:  # pragma: win32 no cover # noqa: PLR5501
+    if has_fcntl:
+        _FileLock: type[BaseFileLock] = UnixFileLock
+    else:
+        _FileLock = SoftFileLock
+        if warnings is not None:
+            warnings.warn("only soft file lock is available", stacklevel=2)
+if TYPE_CHECKING:
+    # Under static type checking FileLock is bound to SoftFileLock.
+    FileLock = SoftFileLock
+else:
+    #: Alias for the lock, which should be used for the current platform.
+    FileLock = _FileLock
+__all__ = [
+    "__version__",
+    "FileLock",
+    "SoftFileLock",
+    "Timeout",
+    "UnixFileLock",
+    "WindowsFileLock",
+    "BaseFileLock",
+    "AcquireReturnProxy",
+]

tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_util.cpython-311.pyc ADDED Viewed

Binary file (2.2 kB). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/__pycache__/_windows.cpython-311.pyc ADDED Viewed

Binary file (3.68 kB). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_unix.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from __future__ import annotations
+import os
+import sys
+from contextlib import suppress
+from errno import ENOSYS
+from typing import cast
+from ._api import BaseFileLock
+from ._util import ensure_directory_exists
+#: a flag to indicate if the fcntl API is available
+has_fcntl = False
+if sys.platform == "win32":  # pragma: win32 cover
+    # win32 variant: the class is defined, but neither hook is implemented.
+    class UnixFileLock(BaseFileLock):
+        """Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
+        def _acquire(self) -> None:
+            raise NotImplementedError
+        def _release(self) -> None:
+            raise NotImplementedError
+else:  # pragma: win32 no cover
+    try:
+        import fcntl
+    except ImportError:
+        pass
+    else:
+        has_fcntl = True
+    class UnixFileLock(BaseFileLock):
+        """Uses the :func:`fcntl.flock` to hard lock the lock file on unix systems."""
+        def _acquire(self) -> None:
+            """
+            Make one non-blocking attempt to take an exclusive ``flock`` on the lock file.
+            On success the open descriptor is stored in ``self._context.lock_file_fd``.
+            If ``flock`` fails the descriptor is closed and nothing is stored; only the
+            ``ENOSYS`` case is turned into an exception, other ``OSError`` values are
+            not re-raised.
+            :raises NotImplementedError: if ``flock`` fails with ``ENOSYS``.
+            """
+            ensure_directory_exists(self.lock_file)
+            open_flags = os.O_RDWR | os.O_CREAT | os.O_TRUNC
+            fd = os.open(self.lock_file, open_flags, self._context.mode)
+            with suppress(PermissionError):  # This lock is not owned by this UID
+                os.fchmod(fd, self._context.mode)
+            try:
+                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except OSError as exception:
+                os.close(fd)  # do not keep the descriptor when the lock was not obtained
+                if exception.errno == ENOSYS:  # NotImplemented error
+                    msg = "FileSystem does not appear to support flock; use SoftFileLock instead"
+                    raise NotImplementedError(msg) from exception
+            else:
+                self._context.lock_file_fd = fd
+        def _release(self) -> None:
+            """Unlock and close the stored descriptor; the lock file itself is left on disk."""
+            # Do not remove the lockfile:
+            #   https://github.com/tox-dev/py-filelock/issues/31
+            #   https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
+            fd = cast(int, self._context.lock_file_fd)
+            self._context.lock_file_fd = None  # cleared before the unlock/close calls
+            fcntl.flock(fd, fcntl.LOCK_UN)
+            os.close(fd)
+__all__ = [
+    "has_fcntl",
+    "UnixFileLock",
+]

tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_util.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from __future__ import annotations
+import os
+import stat
+import sys
+from errno import EACCES, EISDIR
+from pathlib import Path
+def raise_on_not_writable_file(filename: str) -> None:
+    """
+    Raise an exception if attempting to open the file for writing would fail.
+    This is done so files that will never be writable can be separated from
+    files that are writable but currently locked.
+    Nothing is raised when ``os.stat`` itself fails or reports a zero
+    modification time.
+    :param filename: file to check
+    :raises OSError: as if the file was opened for writing.
+    """
+    try:  # use stat to do exists + can write to check without race condition
+        file_stat = os.stat(filename)  # noqa: PTH116
+    except OSError:
+        return  # swallow does not exist or other errors
+    if file_stat.st_mtime != 0:  # if os.stat returns but modification is zero that's an invalid os.stat - ignore it
+        # The owner-write bit is checked first, so a directory without it
+        # reports PermissionError rather than IsADirectoryError.
+        if not (file_stat.st_mode & stat.S_IWUSR):
+            raise PermissionError(EACCES, "Permission denied", filename)
+        if stat.S_ISDIR(file_stat.st_mode):
+            if sys.platform == "win32":  # pragma: win32 cover
+                # On Windows, this is PermissionError
+                raise PermissionError(EACCES, "Permission denied", filename)
+            else:  # pragma: win32 no cover # noqa: RET506
+                # On linux / macOS, this is IsADirectoryError
+                raise IsADirectoryError(EISDIR, "Is a directory", filename)
+def ensure_directory_exists(filename: Path | str) -> None:
+    """
+    Ensure the directory containing the file exists (create it if necessary).
+    Missing parent directories are created as well, and an already existing
+    directory is not treated as an error.
+    :param filename: file.
+    """
+    Path(filename).parent.mkdir(parents=True, exist_ok=True)
+__all__ = [
+    "raise_on_not_writable_file",
+    "ensure_directory_exists",
+]

tuning-competition-baseline/.venv/lib/python3.11/site-packages/filelock/_windows.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from __future__ import annotations
+import os
+import sys
+from contextlib import suppress
+from errno import EACCES
+from pathlib import Path
+from typing import cast
+from ._api import BaseFileLock
+from ._util import ensure_directory_exists, raise_on_not_writable_file
+if sys.platform == "win32":  # pragma: win32 cover
+    import msvcrt
+    class WindowsFileLock(BaseFileLock):
+        """Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
+        def _acquire(self) -> None:
+            """
+            Make one attempt to lock a single byte of the lock file with ``LK_NBLCK``.
+            ``raise_on_not_writable_file`` runs first, so its errors propagate.
+            On success the open descriptor is stored in ``self._context.lock_file_fd``.
+            An ``EACCES`` error from either ``os.open`` or ``msvcrt.locking`` is
+            swallowed and nothing is stored; any other ``OSError`` is re-raised.
+            """
+            raise_on_not_writable_file(self.lock_file)
+            ensure_directory_exists(self.lock_file)
+            flags = (
+                os.O_RDWR  # open for read and write
+                | os.O_CREAT  # create file if not exists
+                | os.O_TRUNC  # truncate file if not empty
+            )
+            try:
+                fd = os.open(self.lock_file, flags, self._context.mode)
+            except OSError as exception:
+                if exception.errno != EACCES:  # has no access to this lock
+                    raise
+            else:
+                try:
+                    msvcrt.locking(fd, msvcrt.LK_NBLCK, 1)
+                except OSError as exception:
+                    os.close(fd)  # close file first
+                    if exception.errno != EACCES:  # file is already locked
+                        raise
+                else:
+                    self._context.lock_file_fd = fd
+        def _release(self) -> None:
+            """Unlock and close the stored descriptor, then try to delete the lock file."""
+            fd = cast(int, self._context.lock_file_fd)
+            self._context.lock_file_fd = None
+            msvcrt.locking(fd, msvcrt.LK_UNLCK, 1)
+            os.close(fd)
+            with suppress(OSError):  # Probably another instance of the application that acquired the file lock.
+                Path(self.lock_file).unlink()
+else:  # pragma: win32 no cover
+    # Non-win32 variant: the class is defined, but neither hook is implemented.
+    class WindowsFileLock(BaseFileLock):
+        """Uses the :func:`msvcrt.locking` function to hard lock the lock file on Windows systems."""
+        def _acquire(self) -> None:
+            raise NotImplementedError
+        def _release(self) -> None:
+            raise NotImplementedError
+__all__ = [
+    "WindowsFileLock",
+]

tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath/tests/__pycache__/test_fp.cpython-311.pyc ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fac5cd5bfbd06bb4a9b6ca2c30c684bea761aa5b6dbe0c019ed92f1f4a7d8143
+size 142559

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (218 Bytes). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h ADDED Viewed

	@@ -0,0 +1,100 @@

+/*
+ * Copyright 2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#include <cuda_stdint.h>
+#include "Openmp/omp-tools.h"
+#if !defined(_CUPTI_OPENMP_H_)
+#define _CUPTI_OPENMP_H_
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+/**
+ * \brief Initialize OpenMP support (deprecated, used before OpenMP 5.0)
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
+/**
+ * \brief Initialize OpenMP support
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+#if defined(__cplusplus)
+}
+#endif
+#endif /*_CUPTI_OPENMP_H_*/

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h ADDED Viewed

	@@ -0,0 +1,1083 @@

+/*
+ * include/50/omp-tools.h.var
+ */
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef __OMPT__
+#define __OMPT__
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+#include <stdint.h>
+#include <stddef.h>
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+#define FOREACH_OMPT_INQUIRY_FN(macro)      \
+    macro (ompt_enumerate_states)           \
+    macro (ompt_enumerate_mutex_impls)      \
+                                            \
+    macro (ompt_set_callback)               \
+    macro (ompt_get_callback)               \
+                                            \
+    macro (ompt_get_state)                  \
+                                            \
+    macro (ompt_get_parallel_info)          \
+    macro (ompt_get_task_info)              \
+    macro (ompt_get_task_memory)            \
+    macro (ompt_get_thread_data)            \
+    macro (ompt_get_unique_id)              \
+    macro (ompt_finalize_tool)              \
+                                            \
+    macro(ompt_get_num_procs)               \
+    macro(ompt_get_num_places)              \
+    macro(ompt_get_place_proc_ids)          \
+    macro(ompt_get_place_num)               \
+    macro(ompt_get_partition_place_nums)    \
+    macro(ompt_get_proc_id)                 \
+                                            \
+    macro(ompt_get_target_info)             \
+    macro(ompt_get_num_devices)
+#define FOREACH_OMPT_STATE(macro)                                                                \
+                                                                                                \
+    /* first available state */                                                                 \
+    macro (ompt_state_undefined, 0x102)      /* undefined thread state */                        \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x000)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x001)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x002) /* performing a reduction */                        \
+                                                                                                \
+    /* barrier wait states (16..31) */                                                          \
+    macro (ompt_state_wait_barrier, 0x010)   /* waiting at a barrier */                          \
+    macro (ompt_state_wait_barrier_implicit_parallel, 0x011)                                     \
+                                            /* implicit barrier at the end of parallel region */\
+    macro (ompt_state_wait_barrier_implicit_workshare, 0x012)                                    \
+                                            /* implicit barrier at the end of worksharing */    \
+    macro (ompt_state_wait_barrier_implicit, 0x013)  /* implicit barrier */                      \
+    macro (ompt_state_wait_barrier_explicit, 0x014)  /* explicit barrier */                      \
+                                                                                                \
+    /* task wait states (32..63) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x020)  /* waiting at a taskwait */                         \
+    macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */                        \
+                                                                                                \
+    /* mutex wait states (64..127) */                                                           \
+    macro (ompt_state_wait_mutex, 0x040)                                                         \
+    macro (ompt_state_wait_lock, 0x041)      /* waiting for lock */                              \
+    macro (ompt_state_wait_critical, 0x042)  /* waiting for critical */                          \
+    macro (ompt_state_wait_atomic, 0x043)    /* waiting for atomic */                            \
+    macro (ompt_state_wait_ordered, 0x044)   /* waiting for ordered */                           \
+                                                                                                \
+    /* target wait states (128..255) */                                                         \
+    macro (ompt_state_wait_target, 0x080)        /* waiting for target region */                 \
+    macro (ompt_state_wait_target_map, 0x081)    /* waiting for target data mapping operation */ \
+    macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */       \
+                                                                                                \
+    /* misc (256..511) */                                                                       \
+    macro (ompt_state_idle, 0x100)           /* waiting for work */                              \
+    macro (ompt_state_overhead, 0x101)       /* overhead excluding wait states */                \
+                                                                                                \
+    /* implementation-specific states (512..) */
+#define FOREACH_KMP_MUTEX_IMPL(macro)                                                \
+    macro (kmp_mutex_impl_none, 0)         /* unknown implementation */              \
+    macro (kmp_mutex_impl_spin, 1)         /* based on spin */                       \
+    macro (kmp_mutex_impl_queuing, 2)      /* based on some fair policy */           \
+    macro (kmp_mutex_impl_speculative, 3)  /* based on HW-supported speculation */
+#define FOREACH_OMPT_EVENT(macro)                                                                                        \
+                                                                                                                         \
+    /*--- Mandatory Events ---*/                                                                                         \
+    macro (ompt_callback_thread_begin,      ompt_callback_thread_begin_t,       1) /* thread begin                    */ \
+    macro (ompt_callback_thread_end,        ompt_callback_thread_end_t,         2) /* thread end                      */ \
+                                                                                                                         \
+    macro (ompt_callback_parallel_begin,    ompt_callback_parallel_begin_t,     3) /* parallel begin                  */ \
+    macro (ompt_callback_parallel_end,      ompt_callback_parallel_end_t,       4) /* parallel end                    */ \
+                                                                                                                         \
+    macro (ompt_callback_task_create,       ompt_callback_task_create_t,        5) /* task begin                      */ \
+    macro (ompt_callback_task_schedule,     ompt_callback_task_schedule_t,      6) /* task schedule                   */ \
+    macro (ompt_callback_implicit_task,     ompt_callback_implicit_task_t,      7) /* implicit task                   */ \
+                                                                                                                         \
+    macro (ompt_callback_target,            ompt_callback_target_t,             8) /* target                          */ \
+    macro (ompt_callback_target_data_op,    ompt_callback_target_data_op_t,     9) /* target data op                  */ \
+    macro (ompt_callback_target_submit,     ompt_callback_target_submit_t,     10) /* target  submit                  */ \
+                                                                                                                         \
+    macro (ompt_callback_control_tool,      ompt_callback_control_tool_t,      11) /* control tool                    */ \
+                                                                                                                         \
+    macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize               */ \
+    macro (ompt_callback_device_finalize,   ompt_callback_device_finalize_t,   13) /* device finalize                 */ \
+                                                                                                                         \
+    macro (ompt_callback_device_load,       ompt_callback_device_load_t,       14) /* device load                     */ \
+    macro (ompt_callback_device_unload,     ompt_callback_device_unload_t,     15) /* device unload                   */ \
+                                                                                                                         \
+    /* Optional Events */                                                                                                \
+    macro (ompt_callback_sync_region_wait,  ompt_callback_sync_region_t,       16) /* sync region wait begin or end   */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_released,    ompt_callback_mutex_t,             17) /* mutex released                  */ \
+                                                                                                                         \
+    macro (ompt_callback_dependences,       ompt_callback_dependences_t,       18) /* report task dependences         */ \
+    macro (ompt_callback_task_dependence,   ompt_callback_task_dependence_t,   19) /* report task dependence          */ \
+                                                                                                                         \
+    macro (ompt_callback_work,              ompt_callback_work_t,              20) /* task at work begin or end       */ \
+                                                                                                                         \
+    macro (ompt_callback_master,            ompt_callback_master_t,            21) /* task at master begin or end     */ \
+                                                                                                                         \
+    macro (ompt_callback_target_map,        ompt_callback_target_map_t,        22) /* target map                      */ \
+                                                                                                                         \
+    macro (ompt_callback_sync_region,       ompt_callback_sync_region_t,       23) /* sync region begin or end        */ \
+                                                                                                                         \
+    macro (ompt_callback_lock_init,         ompt_callback_mutex_acquire_t,     24) /* lock init                       */ \
+    macro (ompt_callback_lock_destroy,      ompt_callback_mutex_t,             25) /* lock destroy                    */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_acquire,     ompt_callback_mutex_acquire_t,     26) /* mutex acquire                   */ \
+    macro (ompt_callback_mutex_acquired,    ompt_callback_mutex_t,             27) /* mutex acquired                  */ \
+                                                                                                                         \
+    macro (ompt_callback_nest_lock,         ompt_callback_nest_lock_t,         28) /* nest lock                       */ \
+                                                                                                                         \
+    macro (ompt_callback_flush,             ompt_callback_flush_t,             29) /* after executing flush           */ \
+                                                                                                                         \
+    macro (ompt_callback_cancel,            ompt_callback_cancel_t,            30) /* cancel innermost binding region */ \
+                                                                                                                         \
+    macro (ompt_callback_reduction,         ompt_callback_sync_region_t,       31) /* reduction                       */ \
+                                                                                                                         \
+    macro (ompt_callback_dispatch,          ompt_callback_dispatch_t,          32) /* dispatch of work                */
+/*****************************************************************************
+ * implementation specific types
+ *****************************************************************************/
+typedef enum kmp_mutex_impl_t { /* enumerators come from expanding FOREACH_KMP_MUTEX_IMPL, which is defined outside this excerpt */
+#define kmp_mutex_impl_macro(impl, code) impl = code, /* X-macro callback: emits one "impl = code," enumerator per list entry */
+    FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
+#undef kmp_mutex_impl_macro /* helper is only needed while the enum body is generated */
+} kmp_mutex_impl_t;
+/*****************************************************************************
+ * definitions generated from spec
+ *****************************************************************************/
+typedef enum ompt_callbacks_t { /* event ids: the `event` argument of ompt_set_callback_t / ompt_get_callback_t and the `type` tag of ompt_record_ompt_t */
+  ompt_callback_thread_begin             = 1,
+  ompt_callback_thread_end               = 2,
+  ompt_callback_parallel_begin           = 3,
+  ompt_callback_parallel_end             = 4,
+  ompt_callback_task_create              = 5,
+  ompt_callback_task_schedule            = 6,
+  ompt_callback_implicit_task            = 7,
+  ompt_callback_target                   = 8,
+  ompt_callback_target_data_op           = 9,
+  ompt_callback_target_submit            = 10,
+  ompt_callback_control_tool             = 11,
+  ompt_callback_device_initialize        = 12,
+  ompt_callback_device_finalize          = 13,
+  ompt_callback_device_load              = 14,
+  ompt_callback_device_unload            = 15,
+  ompt_callback_sync_region_wait         = 16,
+  ompt_callback_mutex_released           = 17,
+  ompt_callback_dependences              = 18,
+  ompt_callback_task_dependence          = 19,
+  ompt_callback_work                     = 20, /* codes 20..32 repeat the values given in the event macro list directly above */
+  ompt_callback_master                   = 21,
+  ompt_callback_target_map               = 22,
+  ompt_callback_sync_region              = 23,
+  ompt_callback_lock_init                = 24, /* handler type in the macro list: ompt_callback_mutex_acquire_t */
+  ompt_callback_lock_destroy             = 25, /* handler type in the macro list: ompt_callback_mutex_t */
+  ompt_callback_mutex_acquire            = 26,
+  ompt_callback_mutex_acquired           = 27, /* handler type in the macro list: ompt_callback_mutex_t */
+  ompt_callback_nest_lock                = 28,
+  ompt_callback_flush                    = 29,
+  ompt_callback_cancel                   = 30,
+  ompt_callback_reduction                = 31, /* handler type in the macro list: ompt_callback_sync_region_t */
+  ompt_callback_dispatch                 = 32
+} ompt_callbacks_t;
+typedef enum ompt_record_t { /* record-format tag; this is the return type of ompt_get_record_type_t */
+  ompt_record_ompt               = 1,
+  ompt_record_native             = 2,
+  ompt_record_invalid            = 3
+} ompt_record_t;
+typedef enum ompt_record_native_t { /* class of a native record; stored in ompt_record_abstract_t.rclass */
+  ompt_record_native_info  = 1,
+  ompt_record_native_event = 2
+} ompt_record_native_t;
+typedef enum ompt_set_result_t { /* return type of ompt_set_callback_t, ompt_set_trace_ompt_t and ompt_set_trace_native_t */
+  ompt_set_error            = 0, /* the only enumerator equal to 0 */
+  ompt_set_never            = 1,
+  ompt_set_impossible       = 2,
+  ompt_set_sometimes        = 3,
+  ompt_set_sometimes_paired = 4,
+  ompt_set_always           = 5
+} ompt_set_result_t;
+typedef uint64_t ompt_id_t; /* 64-bit id; used below for task, parallel, thread, target and host-op ids in record structs */
+typedef uint64_t ompt_device_time_t; /* 64-bit device timestamp; produced by ompt_get_device_time_t, consumed by ompt_translate_time_t */
+typedef uint64_t ompt_buffer_cursor_t; /* 64-bit position inside a trace buffer; see ompt_advance_buffer_cursor_t */
+typedef enum ompt_thread_t { /* thread kind; the `thread_type` argument of ompt_callback_thread_begin_t */
+  ompt_thread_initial                 = 1,
+  ompt_thread_worker                  = 2,
+  ompt_thread_other                   = 3,
+  ompt_thread_unknown                 = 4
+} ompt_thread_t;
+typedef enum ompt_scope_endpoint_t { /* begin/end marker; the `endpoint` argument of the work, implicit_task, master, sync_region, nest_lock and target handlers */
+  ompt_scope_begin                    = 1,
+  ompt_scope_end                      = 2
+} ompt_scope_endpoint_t;
+typedef enum ompt_dispatch_t { /* the `kind` argument of ompt_callback_dispatch_t */
+  ompt_dispatch_iteration             = 1,
+  ompt_dispatch_section               = 2
+} ompt_dispatch_t;
+typedef enum ompt_sync_region_t { /* the `kind` argument of ompt_callback_sync_region_t */
+  ompt_sync_region_barrier                = 1,
+  ompt_sync_region_barrier_implicit       = 2,
+  ompt_sync_region_barrier_explicit       = 3,
+  ompt_sync_region_barrier_implementation = 4,
+  ompt_sync_region_taskwait               = 5,
+  ompt_sync_region_taskgroup              = 6,
+  ompt_sync_region_reduction              = 7
+} ompt_sync_region_t;
+typedef enum ompt_target_data_op_t { /* the `optype` argument of ompt_callback_target_data_op_t and field of its record struct */
+  ompt_target_data_alloc                = 1,
+  ompt_target_data_transfer_to_device   = 2,
+  ompt_target_data_transfer_from_device = 3,
+  ompt_target_data_delete               = 4,
+  ompt_target_data_associate            = 5,
+  ompt_target_data_disassociate         = 6
+} ompt_target_data_op_t;
+typedef enum ompt_work_t { /* the `wstype` argument of ompt_callback_work_t */
+  ompt_work_loop               = 1,
+  ompt_work_sections           = 2,
+  ompt_work_single_executor    = 3,
+  ompt_work_single_other       = 4,
+  ompt_work_workshare          = 5,
+  ompt_work_distribute         = 6,
+  ompt_work_taskloop           = 7
+} ompt_work_t;
+typedef enum ompt_mutex_t { /* the `kind` argument of ompt_callback_mutex_acquire_t and ompt_callback_mutex_t */
+  ompt_mutex_lock                     = 1,
+  ompt_mutex_test_lock                = 2,
+  ompt_mutex_nest_lock                = 3,
+  ompt_mutex_test_nest_lock           = 4,
+  ompt_mutex_critical                 = 5,
+  ompt_mutex_atomic                   = 6,
+  ompt_mutex_ordered                  = 7
+} ompt_mutex_t;
+typedef enum ompt_native_mon_flag_t { /* eight distinct single-bit values (0x01..0x80); presumably OR-ed into the `flags` of ompt_set_trace_native_t - TODO confirm */
+  ompt_native_data_motion_explicit    = 0x01,
+  ompt_native_data_motion_implicit    = 0x02,
+  ompt_native_kernel_invocation       = 0x04,
+  ompt_native_kernel_execution        = 0x08,
+  ompt_native_driver                  = 0x10,
+  ompt_native_runtime                 = 0x20,
+  ompt_native_overhead                = 0x40,
+  ompt_native_idleness                = 0x80
+} ompt_native_mon_flag_t;
+typedef enum ompt_task_flag_t { /* single-bit flags; presumably carried in the int `flags` of ompt_callback_task_create_t / ompt_get_task_info_t - TODO confirm */
+  ompt_task_initial                   = 0x00000001, /* bits 0..3: task kind */
+  ompt_task_implicit                  = 0x00000002,
+  ompt_task_explicit                  = 0x00000004,
+  ompt_task_target                    = 0x00000008,
+  ompt_task_undeferred                = 0x08000000, /* bits 27..31: task attributes */
+  ompt_task_untied                    = 0x10000000,
+  ompt_task_final                     = 0x20000000,
+  ompt_task_mergeable                 = 0x40000000,
+  ompt_task_merged                    = 0x80000000 /* NOTE(review): exceeds INT_MAX where int is 32 bits, while the flags parameters in this header are plain int */
+} ompt_task_flag_t;
+typedef enum ompt_task_status_t { /* the `prior_task_status` argument of ompt_callback_task_schedule_t */
+  ompt_task_complete      = 1,
+  ompt_task_yield         = 2,
+  ompt_task_cancel        = 3,
+  ompt_task_detach        = 4,
+  ompt_task_early_fulfill = 5,
+  ompt_task_late_fulfill  = 6,
+  ompt_task_switch        = 7
+} ompt_task_status_t;
+typedef enum ompt_target_t { /* the `kind` argument of ompt_callback_target_t */
+  ompt_target                         = 1, /* enumerator named exactly `ompt_target` (no suffix) */
+  ompt_target_enter_data              = 2,
+  ompt_target_exit_data               = 3,
+  ompt_target_update                  = 4
+} ompt_target_t;
+typedef enum ompt_parallel_flag_t { /* single-bit flags; presumably carried in the int `flags` of ompt_callback_parallel_begin_t / _end_t - TODO confirm */
+  ompt_parallel_invoker_program = 0x00000001,
+  ompt_parallel_invoker_runtime = 0x00000002,
+  ompt_parallel_league          = 0x40000000,
+  ompt_parallel_team            = 0x80000000 /* NOTE(review): exceeds INT_MAX where int is 32 bits, while the flags parameters are plain int */
+} ompt_parallel_flag_t;
+typedef enum ompt_target_map_flag_t { /* single-bit flags; presumably the element values of `mapping_flags` in ompt_callback_target_map_t - TODO confirm */
+  ompt_target_map_flag_to             = 0x01,
+  ompt_target_map_flag_from           = 0x02,
+  ompt_target_map_flag_alloc          = 0x04,
+  ompt_target_map_flag_release        = 0x08,
+  ompt_target_map_flag_delete         = 0x10,
+  ompt_target_map_flag_implicit       = 0x20
+} ompt_target_map_flag_t;
+typedef enum ompt_dependence_type_t { /* stored in ompt_dependence_t.dependence_type */
+  ompt_dependence_type_in              = 1,
+  ompt_dependence_type_out             = 2,
+  ompt_dependence_type_inout           = 3,
+  ompt_dependence_type_mutexinoutset   = 4,
+  ompt_dependence_type_source          = 5,
+  ompt_dependence_type_sink            = 6
+} ompt_dependence_type_t;
+typedef enum ompt_cancel_flag_t { /* single-bit flags; presumably carried in the int `flags` of ompt_callback_cancel_t - TODO confirm */
+  ompt_cancel_parallel       = 0x01,
+  ompt_cancel_sections       = 0x02,
+  ompt_cancel_loop           = 0x04,
+  ompt_cancel_taskgroup      = 0x08,
+  ompt_cancel_activated      = 0x10,
+  ompt_cancel_detected       = 0x20,
+  ompt_cancel_discarded_task = 0x40
+} ompt_cancel_flag_t;
+typedef uint64_t ompt_hwid_t; /* 64-bit id; type of ompt_record_abstract_t.hwid */
+typedef uint64_t ompt_wait_id_t; /* 64-bit id; the `wait_id` of the mutex and nest_lock handlers and of ompt_get_state_t */
+typedef enum ompt_frame_flag_t { /* presumably the values held in ompt_frame_t.exit_frame_flags / enter_frame_flags - TODO confirm */
+  ompt_frame_runtime        = 0x00, /* 0x00 / 0x01 differ only in the low nibble */
+  ompt_frame_application    = 0x01,
+  ompt_frame_cfa            = 0x10, /* 0x10 / 0x20 / 0x30 differ only in the high nibble */
+  ompt_frame_framepointer   = 0x20,
+  ompt_frame_stackaddress   = 0x30  /* numerically 0x10 | 0x20, so this is not an independent single bit */
+} ompt_frame_flag_t;
+typedef enum ompt_state_t { /* state codes; presumably the int values exchanged by ompt_get_state_t / ompt_enumerate_states_t - TODO confirm */
+  ompt_state_work_serial                      = 0x000, /* 0x00x: work states */
+  ompt_state_work_parallel                    = 0x001,
+  ompt_state_work_reduction                   = 0x002,
+  ompt_state_wait_barrier                     = 0x010, /* 0x01x: barrier waits */
+  ompt_state_wait_barrier_implicit_parallel   = 0x011,
+  ompt_state_wait_barrier_implicit_workshare  = 0x012,
+  ompt_state_wait_barrier_implicit            = 0x013,
+  ompt_state_wait_barrier_explicit            = 0x014,
+  ompt_state_wait_taskwait                    = 0x020, /* 0x02x: task waits */
+  ompt_state_wait_taskgroup                   = 0x021,
+  ompt_state_wait_mutex                       = 0x040, /* 0x04x: mutual-exclusion waits */
+  ompt_state_wait_lock                        = 0x041,
+  ompt_state_wait_critical                    = 0x042,
+  ompt_state_wait_atomic                      = 0x043,
+  ompt_state_wait_ordered                     = 0x044,
+  ompt_state_wait_target                      = 0x080, /* 0x08x: target waits */
+  ompt_state_wait_target_map                  = 0x081,
+  ompt_state_wait_target_update               = 0x082,
+  ompt_state_idle                             = 0x100, /* 0x10x: idle, overhead, undefined */
+  ompt_state_overhead                         = 0x101,
+  ompt_state_undefined                        = 0x102
+} ompt_state_t;
+typedef uint64_t (*ompt_get_unique_id_t) (void); /* entry point: takes no arguments, returns a 64-bit value */
+typedef uint64_t ompd_size_t; /* OMPD scalar types follow: all unsigned 64-bit except ompd_word_t */
+typedef uint64_t ompd_wait_id_t;
+typedef uint64_t ompd_addr_t; /* paired with ompd_seg_t inside ompd_address_t */
+typedef int64_t  ompd_word_t; /* the only signed type in this group; used for ompd_frame_info_t.frame_flag */
+typedef uint64_t ompd_seg_t;
+typedef uint64_t ompd_device_t;
+typedef uint64_t ompd_thread_id_t;
+typedef enum ompd_scope_t { /* OMPD scope kinds, numbered 1..6; the consumers of this type are outside this excerpt */
+  ompd_scope_global = 1,
+  ompd_scope_address_space = 2,
+  ompd_scope_thread = 3,
+  ompd_scope_parallel = 4,
+  ompd_scope_implicit_task = 5,
+  ompd_scope_task = 6
+} ompd_scope_t;
+typedef uint64_t ompd_icv_id_t; /* 64-bit identifier; not referenced elsewhere in this excerpt */
+typedef enum ompd_rc_t { /* OMPD return codes, numbered 0..10; ompd_rc_ok is the only zero value */
+  ompd_rc_ok = 0,
+  ompd_rc_unavailable = 1,
+  ompd_rc_stale_handle = 2,
+  ompd_rc_bad_input = 3,
+  ompd_rc_error = 4,
+  ompd_rc_unsupported = 5,
+  ompd_rc_needs_state_tracking = 6,
+  ompd_rc_incompatible = 7,
+  ompd_rc_device_read_error = 8,
+  ompd_rc_device_write_error = 9,
+  ompd_rc_nomem = 10 /* last enumerator: no trailing comma, matching every other enum in this header and keeping C90 / C++98 pedantic builds clean */
+} ompd_rc_t;
+typedef void (*ompt_interface_fn_t) (void); /* generic function-pointer type; presumably cast to the concrete entry-point type by the caller - TODO confirm */
+typedef ompt_interface_fn_t (*ompt_function_lookup_t) ( /* looks an entry point up by name and returns it as the generic pointer type */
+  const char *interface_function_name
+);
+typedef union ompt_data_t { /* one storage slot readable either as a 64-bit integer or as a pointer (union: the members overlap) */
+  uint64_t value;
+  void *ptr;
+} ompt_data_t;
+typedef struct ompt_frame_t { /* two frame references, each with its own int flags word; passed by const pointer to the parallel_begin and task_create handlers */
+  ompt_data_t exit_frame;
+  ompt_data_t enter_frame;
+  int exit_frame_flags; /* presumably ompt_frame_flag_t values - TODO confirm */
+  int enter_frame_flags;
+} ompt_frame_t;
+typedef void (*ompt_callback_t) (void); /* generic handler pointer type accepted by ompt_set_callback_t and written by ompt_get_callback_t */
+typedef void ompt_device_t; /* alias of void: only ever used through ompt_device_t* in this header */
+typedef void ompt_buffer_t; /* alias of void: only ever used through ompt_buffer_t* / ompt_buffer_t** in this header */
+typedef void (*ompt_callback_buffer_request_t) ( /* `request` handler given to ompt_start_trace_t; buffer and bytes are pointer (output-capable) parameters */
+  int device_num,
+  ompt_buffer_t **buffer,
+  size_t *bytes
+);
+typedef void (*ompt_callback_buffer_complete_t) ( /* `complete` handler given to ompt_start_trace_t; all arguments are passed by value */
+  int device_num,
+  ompt_buffer_t *buffer,
+  size_t bytes,
+  ompt_buffer_cursor_t begin,
+  int buffer_owned /* int used as a flag; exact meaning is not shown in this header */
+);
+typedef void (*ompt_finalize_t) ( /* type of ompt_start_tool_result_t.finalize */
+  ompt_data_t *tool_data
+);
+typedef int (*ompt_initialize_t) ( /* type of ompt_start_tool_result_t.initialize; receives the lookup function used to obtain entry points */
+  ompt_function_lookup_t lookup,
+  int initial_device_num,
+  ompt_data_t *tool_data
+);
+typedef struct ompt_start_tool_result_t { /* bundles the two tool hooks with one ompt_data_t held by value */
+  ompt_initialize_t initialize;
+  ompt_finalize_t finalize;
+  ompt_data_t tool_data; /* both hooks take an ompt_data_t* named tool_data; presumably it points at this member - TODO confirm */
+} ompt_start_tool_result_t;
+typedef struct ompt_record_abstract_t { /* record view returned (by pointer) from ompt_get_record_abstract_t for a native record */
+  ompt_record_native_t rclass;
+  const char *type;
+  ompt_device_time_t start_time;
+  ompt_device_time_t end_time;
+  ompt_hwid_t hwid;
+} ompt_record_abstract_t;
+typedef struct ompt_dependence_t { /* one dependence: a data slot plus its type; passed as an array (deps, ndeps) to ompt_callback_dependences_t */
+  ompt_data_t variable;
+  ompt_dependence_type_t dependence_type;
+} ompt_dependence_t;
+typedef int (*ompt_enumerate_states_t) ( /* step-wise enumeration: takes the current value, writes the next value and its name through the pointers */
+  int current_state,
+  int *next_state,
+  const char **next_state_name
+);
+typedef int (*ompt_enumerate_mutex_impls_t) ( /* same shape as ompt_enumerate_states_t, for mutex implementations */
+  int current_impl,
+  int *next_impl,
+  const char **next_impl_name
+);
+typedef ompt_set_result_t (*ompt_set_callback_t) ( /* takes an event id and a generic handler pointer; reports the outcome as ompt_set_result_t */
+  ompt_callbacks_t event,
+  ompt_callback_t callback
+);
+typedef int (*ompt_get_callback_t) ( /* writes the handler for `event` through `callback`; the meaning of the int result is not shown here */
+  ompt_callbacks_t event,
+  ompt_callback_t *callback
+);
+typedef ompt_data_t *(*ompt_get_thread_data_t) (void); /* no arguments; returns a pointer to an ompt_data_t */
+typedef int (*ompt_get_num_procs_t) (void);
+typedef int (*ompt_get_num_places_t) (void);
+typedef int (*ompt_get_place_proc_ids_t) ( /* caller supplies an int array `ids` together with its capacity `ids_size` */
+  int place_num,
+  int ids_size,
+  int *ids
+);
+typedef int (*ompt_get_place_num_t) (void);
+typedef int (*ompt_get_partition_place_nums_t) ( /* caller supplies an int array `place_nums` together with its capacity */
+  int place_nums_size,
+  int *place_nums
+);
+typedef int (*ompt_get_proc_id_t) (void);
+typedef int (*ompt_get_state_t) ( /* returns int (presumably an ompt_state_t value - TODO confirm) and takes a wait-id pointer */
+  ompt_wait_id_t *wait_id
+);
+typedef int (*ompt_get_parallel_info_t) ( /* parallel_data and team_size are pointer (output-capable) parameters */
+  int ancestor_level,
+  ompt_data_t **parallel_data,
+  int *team_size
+);
+typedef int (*ompt_get_task_info_t) ( /* every parameter after ancestor_level is a pointer (output-capable) parameter */
+  int ancestor_level,
+  int *flags,
+  ompt_data_t **task_data,
+  ompt_frame_t **task_frame,
+  ompt_data_t **parallel_data,
+  int *thread_num
+);
+typedef int (*ompt_get_task_memory_t)( /* addr and size are pointer parameters; `block` is passed by value */
+  void **addr,
+  size_t *size,
+  int block
+);
+typedef int (*ompt_get_target_info_t) ( /* all three parameters are pointer (output-capable) parameters */
+  uint64_t *device_num,
+  ompt_id_t *target_id,
+  ompt_id_t *host_op_id
+);
+typedef int (*ompt_get_num_devices_t) (void);
+typedef void (*ompt_finalize_tool_t) (void);
+typedef int (*ompt_get_device_num_procs_t) ( /* from here on, the entry points take the opaque ompt_device_t* as first argument unless noted */
+  ompt_device_t *device
+);
+typedef ompt_device_time_t (*ompt_get_device_time_t) (
+  ompt_device_t *device
+);
+typedef double (*ompt_translate_time_t) ( /* maps a device timestamp to a double; the unit is not shown in this header */
+  ompt_device_t *device,
+  ompt_device_time_t time
+);
+typedef ompt_set_result_t (*ompt_set_trace_ompt_t) ( /* NOTE: enable/etype are unsigned int here, whereas ompt_set_trace_native_t uses plain int */
+  ompt_device_t *device,
+  unsigned int enable,
+  unsigned int etype
+);
+typedef ompt_set_result_t (*ompt_set_trace_native_t) (
+  ompt_device_t *device,
+  int enable,
+  int flags /* presumably ompt_native_mon_flag_t bits - TODO confirm */
+);
+typedef int (*ompt_start_trace_t) ( /* takes the two buffer handlers (request, complete) for the device */
+  ompt_device_t *device,
+  ompt_callback_buffer_request_t request,
+  ompt_callback_buffer_complete_t complete
+);
+typedef int (*ompt_pause_trace_t) (
+  ompt_device_t *device,
+  int begin_pause
+);
+typedef int (*ompt_flush_trace_t) (
+  ompt_device_t *device
+);
+typedef int (*ompt_stop_trace_t) (
+  ompt_device_t *device
+);
+typedef int (*ompt_advance_buffer_cursor_t) ( /* takes the current cursor by value and writes the following one through `next` */
+  ompt_device_t *device,
+  ompt_buffer_t *buffer,
+  size_t size,
+  ompt_buffer_cursor_t current,
+  ompt_buffer_cursor_t *next
+);
+typedef ompt_record_t (*ompt_get_record_type_t) ( /* no device argument: identifies the record by buffer and cursor only */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+typedef void *(*ompt_get_record_native_t) ( /* returns an untyped record pointer; host_op_id is a pointer (output-capable) parameter */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current,
+  ompt_id_t *host_op_id
+);
+typedef ompt_record_abstract_t *
+(*ompt_get_record_abstract_t) ( /* takes an untyped native record pointer and returns an ompt_record_abstract_t pointer */
+  void *native_record
+);
+typedef void (*ompt_callback_thread_begin_t) ( /* handler signature; presumably for the ompt_callback_thread_begin event */
+  ompt_thread_t thread_type,
+  ompt_data_t *thread_data
+);
+typedef struct ompt_record_thread_begin_t { /* payload stored as ompt_record_ompt_t.record.thread_begin; keeps only the thread type */
+  ompt_thread_t thread_type;
+} ompt_record_thread_begin_t;
+typedef void (*ompt_callback_thread_end_t) ( /* handler signature; no matching record struct appears in this header excerpt */
+  ompt_data_t *thread_data
+);
+typedef void (*ompt_callback_parallel_begin_t) ( /* handler signature; presumably for the ompt_callback_parallel_begin event */
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *parallel_data,
+  unsigned int requested_parallelism,
+  int flags, /* presumably ompt_parallel_flag_t bits - TODO confirm */
+  const void *codeptr_ra
+);
+typedef struct ompt_record_parallel_begin_t { /* record form of the handler above: data pointers become ompt_id_t ids, the frame argument is omitted */
+  ompt_id_t encountering_task_id;
+  ompt_id_t parallel_id;
+  unsigned int requested_parallelism;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_begin_t;
+typedef void (*ompt_callback_parallel_end_t) ( /* note the order: parallel_data comes first here, unlike parallel_begin */
+  ompt_data_t *parallel_data,
+  ompt_data_t *encountering_task_data,
+  int flags,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_parallel_end_t { /* record form of the handler above, with ids instead of data pointers */
+  ompt_id_t parallel_id;
+  ompt_id_t encountering_task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_end_t;
+typedef void (*ompt_callback_work_t) ( /* handler type listed for ompt_callback_work (code 20) in the event macro list */
+  ompt_work_t wstype,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  uint64_t count,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_work_t { /* record form of ompt_callback_work_t: same fields, with ids instead of data pointers */
+  ompt_work_t wstype;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  uint64_t count;
+  const void *codeptr_ra;
+} ompt_record_work_t;
+typedef void (*ompt_callback_dispatch_t) ( /* handler type listed for ompt_callback_dispatch (code 32); `instance` is passed by value */
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  ompt_dispatch_t kind,
+  ompt_data_t instance
+);
+typedef struct ompt_record_dispatch_t { /* record form of ompt_callback_dispatch_t, with ids instead of data pointers */
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  ompt_dispatch_t kind;
+  ompt_data_t instance;
+} ompt_record_dispatch_t;
+typedef void (*ompt_callback_task_create_t) ( /* handler signature; presumably for the ompt_callback_task_create event */
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *new_task_data,
+  int flags, /* presumably ompt_task_flag_t bits - TODO confirm */
+  int has_dependences,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_task_create_t { /* record form of the handler above: ids instead of data pointers, frame argument omitted */
+  ompt_id_t encountering_task_id;
+  ompt_id_t new_task_id;
+  int flags;
+  int has_dependences;
+  const void *codeptr_ra;
+} ompt_record_task_create_t;
+typedef void (*ompt_callback_dependences_t) ( /* receives an array of dependences as (deps, ndeps) */
+  ompt_data_t *task_data,
+  const ompt_dependence_t *deps,
+  int ndeps
+);
+typedef struct ompt_record_dependences_t { /* NOTE: holds a single ompt_dependence_t by value, whereas the handler receives an array */
+  ompt_id_t task_id;
+  ompt_dependence_t dep;
+  int ndeps;
+} ompt_record_dependences_t;
+typedef void (*ompt_callback_task_dependence_t) ( /* relates two tasks: a source and a sink */
+  ompt_data_t *src_task_data,
+  ompt_data_t *sink_task_data
+);
+typedef struct ompt_record_task_dependence_t { /* record form of the handler above, with ids instead of data pointers */
+  ompt_id_t src_task_id;
+  ompt_id_t sink_task_id;
+} ompt_record_task_dependence_t;
+typedef void (*ompt_callback_task_schedule_t) ( /* reports the prior task, its status, and the next task */
+  ompt_data_t *prior_task_data,
+  ompt_task_status_t prior_task_status,
+  ompt_data_t *next_task_data
+);
+typedef struct ompt_record_task_schedule_t { /* record form of the handler above, with ids instead of data pointers */
+  ompt_id_t prior_task_id;
+  ompt_task_status_t prior_task_status;
+  ompt_id_t next_task_id;
+} ompt_record_task_schedule_t;
+typedef void (*ompt_callback_implicit_task_t) ( /* unlike most handlers here, this one has no codeptr_ra parameter */
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  unsigned int actual_parallelism,
+  unsigned int index,
+  int flags
+);
+typedef struct ompt_record_implicit_task_t { /* record form of the handler above, with ids instead of data pointers */
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  unsigned int actual_parallelism;
+  unsigned int index;
+  int flags;
+} ompt_record_implicit_task_t;
+typedef void (*ompt_callback_master_t) ( /* handler type listed for ompt_callback_master (code 21) in the event macro list */
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_master_t { /* record form of ompt_callback_master_t, with ids instead of data pointers */
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_master_t;
+typedef void (*ompt_callback_sync_region_t) ( /* listed in the event macro list for both ompt_callback_sync_region (23) and ompt_callback_reduction (31) */
+  ompt_sync_region_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_sync_region_t { /* record form of ompt_callback_sync_region_t, with ids instead of data pointers */
+  ompt_sync_region_t kind;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_sync_region_t;
+typedef void (*ompt_callback_mutex_acquire_t) ( /* listed in the event macro list for both ompt_callback_lock_init (24) and ompt_callback_mutex_acquire (26) */
+  ompt_mutex_t kind,
+  unsigned int hint,
+  unsigned int impl, /* presumably a kmp_mutex_impl_t value - TODO confirm */
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_mutex_acquire_t { /* record form of the handler above: identical field list */
+  ompt_mutex_t kind;
+  unsigned int hint;
+  unsigned int impl;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_acquire_t;
+typedef void (*ompt_callback_mutex_t) ( /* listed in the event macro list for ompt_callback_lock_destroy (25) and ompt_callback_mutex_acquired (27) */
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_mutex_t { /* record form of ompt_callback_mutex_t: identical field list */
+  ompt_mutex_t kind;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_t;
+typedef void (*ompt_callback_nest_lock_t) ( /* handler type listed for ompt_callback_nest_lock (code 28); takes an endpoint instead of a mutex kind */
+  ompt_scope_endpoint_t endpoint,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_nest_lock_t { /* record form of ompt_callback_nest_lock_t: identical field list */
+  ompt_scope_endpoint_t endpoint;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_nest_lock_t;
+typedef void (*ompt_callback_flush_t) ( /* handler type listed for ompt_callback_flush (code 29) in the event macro list */
+  ompt_data_t *thread_data,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_flush_t { /* record form keeps only codeptr_ra; the enclosing ompt_record_ompt_t has its own thread_id field */
+  const void *codeptr_ra;
+} ompt_record_flush_t;
+typedef void (*ompt_callback_cancel_t) ( /* handler type listed for ompt_callback_cancel (code 30) in the event macro list */
+  ompt_data_t *task_data,
+  int flags, /* presumably ompt_cancel_flag_t bits - TODO confirm */
+  const void *codeptr_ra
+);
+typedef struct ompt_record_cancel_t { /* record form of ompt_callback_cancel_t, with a task id instead of the data pointer */
+  ompt_id_t task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_cancel_t;
+typedef void (*ompt_callback_device_initialize_t) ( /* receives the opaque device pointer plus a lookup function, mirroring ompt_initialize_t's lookup argument */
+  int device_num,
+  const char *type,
+  ompt_device_t *device,
+  ompt_function_lookup_t lookup,
+  const char *documentation
+);
+typedef void (*ompt_callback_device_finalize_t) ( /* identified by device number only */
+  int device_num
+);
+typedef void (*ompt_callback_device_load_t) ( /* describes a loaded module by file location, size, host/device addresses and a 64-bit module id */
+  int device_num,
+  const char *filename,
+  int64_t offset_in_file,
+  void *vma_in_file,
+  size_t bytes,
+  void *host_addr,
+  void *device_addr,
+  uint64_t module_id
+);
+typedef void (*ompt_callback_device_unload_t) ( /* takes a module_id of the same type that ompt_callback_device_load_t receives */
+  int device_num,
+  uint64_t module_id
+);
+typedef void (*ompt_callback_target_data_op_t) ( /* describes one data operation: kind, source and destination (address + device number), and size */
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  ompt_target_data_op_t optype,
+  void *src_addr,
+  int src_device_num,
+  void *dest_addr,
+  int dest_device_num,
+  size_t bytes,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_target_data_op_t { /* record form: no target_id field (ompt_record_ompt_t carries one) and an added end_time */
+  ompt_id_t host_op_id;
+  ompt_target_data_op_t optype;
+  void *src_addr;
+  int src_device_num;
+  void *dest_addr;
+  int dest_device_num;
+  size_t bytes;
+  ompt_device_time_t end_time;
+  const void *codeptr_ra;
+} ompt_record_target_data_op_t;
+typedef void (*ompt_callback_target_t) ( /* handler signature; presumably for the ompt_callback_target event */
+  ompt_target_t kind,
+  ompt_scope_endpoint_t endpoint,
+  int device_num,
+  ompt_data_t *task_data,
+  ompt_id_t target_id,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_target_t { /* record form of ompt_callback_target_t, with a task id instead of the data pointer */
+  ompt_target_t kind;
+  ompt_scope_endpoint_t endpoint;
+  int device_num;
+  ompt_id_t task_id;
+  ompt_id_t target_id;
+  const void *codeptr_ra;
+} ompt_record_target_t;
+typedef void (*ompt_callback_target_map_t) ( /* handler type listed for ompt_callback_target_map (code 22); presumably four parallel arrays of nitems entries - TODO confirm */
+  ompt_id_t target_id,
+  unsigned int nitems,
+  void **host_addr,
+  void **device_addr,
+  size_t *bytes,
+  unsigned int *mapping_flags,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_target_map_t { /* record form: identical field list; the arrays are held as pointers, not copied into the struct */
+  ompt_id_t target_id;
+  unsigned int nitems;
+  void **host_addr;
+  void **device_addr;
+  size_t *bytes;
+  unsigned int *mapping_flags;
+  const void *codeptr_ra;
+} ompt_record_target_map_t;
+typedef void (*ompt_callback_target_submit_t) ( /* handler signature; presumably for the ompt_callback_target_submit event */
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  unsigned int requested_num_teams
+);
+typedef struct ompt_record_target_kernel_t { /* stored as record.target_kernel; shares host_op_id / requested_num_teams with the submit handler and adds granted_num_teams, end_time */
+  ompt_id_t host_op_id;
+  unsigned int requested_num_teams;
+  unsigned int granted_num_teams;
+  ompt_device_time_t end_time;
+} ompt_record_target_kernel_t;
+typedef int (*ompt_callback_control_tool_t) ( /* unlike the handlers around it, this one returns int rather than void */
+  uint64_t command,
+  uint64_t modifier,
+  void *arg,
+  const void *codeptr_ra
+);
+typedef struct ompt_record_control_tool_t { /* record form of the handler above; the untyped `arg` pointer is not recorded */
+  uint64_t command;
+  uint64_t modifier;
+  const void *codeptr_ra;
+} ompt_record_control_tool_t;
+typedef struct ompd_address_t { /* OMPD address expressed as a (segment, address) pair of 64-bit values */
+  ompd_seg_t segment;
+  ompd_addr_t address;
+} ompd_address_t;
+typedef struct ompd_frame_info_t { /* an OMPD address together with a signed 64-bit flag word */
+  ompd_address_t frame_address;
+  ompd_word_t frame_flag;
+} ompd_frame_info_t;
+typedef struct _ompd_aspace_handle ompd_address_space_handle_t; /* the six typedefs below name struct tags whose definitions do not appear in this excerpt */
+typedef struct _ompd_thread_handle ompd_thread_handle_t;
+typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
+typedef struct _ompd_task_handle ompd_task_handle_t;
+typedef struct _ompd_aspace_cont ompd_address_space_context_t;
+typedef struct _ompd_thread_cont ompd_thread_context_t;
+typedef struct ompd_device_type_sizes_t { /* one byte per entry; field names indicate sizeof() of the basic C types on the device */
+  uint8_t sizeof_char;
+  uint8_t sizeof_short;
+  uint8_t sizeof_int;
+  uint8_t sizeof_long;
+  uint8_t sizeof_long_long;
+  uint8_t sizeof_pointer;
+} ompd_device_type_sizes_t;
+typedef struct ompt_record_ompt_t { /* one OMPT-format trace record: a common header followed by a per-event payload union */
+  ompt_callbacks_t type; /* event id; presumably selects the active member of `record` - TODO confirm */
+  ompt_device_time_t time;
+  ompt_id_t thread_id;
+  ompt_id_t target_id;
+  union { /* the members overlap; each is one of the ompt_record_*_t structs declared above */
+    ompt_record_thread_begin_t thread_begin;
+    ompt_record_parallel_begin_t parallel_begin;
+    ompt_record_parallel_end_t parallel_end;
+    ompt_record_work_t work;
+    ompt_record_dispatch_t dispatch;
+    ompt_record_task_create_t task_create;
+    ompt_record_dependences_t dependences;
+    ompt_record_task_dependence_t task_dependence;
+    ompt_record_task_schedule_t task_schedule;
+    ompt_record_implicit_task_t implicit_task;
+    ompt_record_master_t master;
+    ompt_record_sync_region_t sync_region;
+    ompt_record_mutex_acquire_t mutex_acquire;
+    ompt_record_mutex_t mutex;
+    ompt_record_nest_lock_t nest_lock;
+    ompt_record_flush_t flush;
+    ompt_record_cancel_t cancel;
+    ompt_record_target_t target;
+    ompt_record_target_data_op_t target_data_op;
+    ompt_record_target_map_t target_map;
+    ompt_record_target_kernel_t target_kernel;
+    ompt_record_control_tool_t control_tool;
+  } record;
+} ompt_record_ompt_t;
+typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) ( /* same parameters as ompt_get_record_type_t, but returns a pointer to an OMPT-format record */
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+#define ompt_id_none 0 /* the macros in this group are "none" values; each name indicates the type it is presumably meant for */
+#define ompt_data_none {0} /* brace initializer: usable only to initialize an object, not as an expression */
+#define ompt_time_none 0
+#define ompt_hwid_none 0
+#define ompt_addr_none ~0 /* expands to the int expression ~0 (all bits set), unlike the other values which are 0 */
+#define ompt_mutex_impl_none 0
+#define ompt_wait_id_none 0
+#define ompd_segment_none 0
+#endif /* __OMPT__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (226 Bytes). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h ADDED Viewed

	@@ -0,0 +1,112 @@

+/*
+ * Copyright 2009-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __cuda_stdint_h__
+#define __cuda_stdint_h__
+// Compiler-specific treatment for C99's stdint.h
+//
+// By default, this header will use the standard headers (so it
+// is your responsibility to make sure they are available), except
+// on MSVC before Visual Studio 2010, when they were not provided.
+// To support old MSVC, a few of the commonly-used definitions are
+// provided here.  If more definitions are needed, add them here,
+// or replace these definitions with a complete implementation,
+// such as the ones available from Google, Boost, or MSVC10.  You
+// can prevent the definition of any of these types (in order to
+// use your own) by #defining CU_STDINT_TYPES_ALREADY_DEFINED.
+#if !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+// With Visual Studio, including stdint.h forces a dependency on the C++
+// runtime, so CU_STDINT_VS_FORCE_NO_STDINT_H is provided as an opt-out for
+// users that care (notably static cudart).
+#if defined(_MSC_VER) && ((_MSC_VER < 1600) || defined(CU_STDINT_VS_FORCE_NO_STDINT_H))
+// These definitions can be used with MSVC 8 and 9,
+// which don't ship with stdint.h:
+typedef unsigned   char   uint8_t;
+typedef            short  int16_t;
+typedef unsigned   short uint16_t;
+// To stay consistent across all MSVC builds, define the remaining types
+// exactly the way the MSVC headers define them.
+#if defined(_MSC_VER) // NOTE(review): always true here, because the enclosing #if already requires _MSC_VER
+typedef signed     char    int8_t;
+typedef            int     int32_t;
+typedef unsigned   int     uint32_t;
+typedef long long          int64_t;
+typedef unsigned long long uint64_t;
+#else // NOTE(review): unreachable given the enclosing condition; left unchanged
+typedef            char    int8_t;
+typedef            long   int32_t;
+typedef unsigned   long  uint32_t;
+typedef          __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+#endif
+#elif defined(__DJGPP__)
+// These definitions can be used when compiling
+// C code with DJGPP, which only provides stdint.h
+// when compiling C++ code with TR1 enabled.
+typedef               char    int8_t; // NOTE(review): plain char, whose signedness is implementation-defined in C
+typedef unsigned      char   uint8_t;
+typedef               short  int16_t;
+typedef unsigned      short uint16_t;
+typedef               long   int32_t;
+typedef unsigned      long  uint32_t;
+typedef          long long   int64_t;
+typedef unsigned long long  uint64_t;
+#else
+// Use standard headers, as specified by C99 and C++ TR1.
+// Known to be provided by:
+// - gcc/glibc, supported by all versions of glibc
+// - djgpp, supported since 2001
+// - MSVC, supported by Visual Studio 2010 and later
+#include <stdint.h>
+#endif
+#endif // !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+#endif // file guard

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h ADDED Viewed

	@@ -0,0 +1,690 @@

+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+typedef enum CUpti_driver_api_trace_cbid_enum {
+    CUPTI_DRIVER_TRACE_CBID_INVALID                                                        = 0,
+    CUPTI_DRIVER_TRACE_CBID_cuInit                                                         = 1,
+    CUPTI_DRIVER_TRACE_CBID_cuDriverGetVersion                                             = 2,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGet                                                    = 3,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetCount                                               = 4,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetName                                                = 5,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceComputeCapability                                      = 6,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem                                               = 7,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetProperties                                          = 8,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetAttribute                                           = 9,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate                                                    = 10,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy                                                   = 11,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxAttach                                                    = 12,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDetach                                                    = 13,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent                                               = 14,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent                                                = 15,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevice                                                 = 16,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSynchronize                                               = 17,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoad                                                   = 18,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadData                                               = 19,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadDataEx                                             = 20,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadFatBinary                                          = 21,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleUnload                                                 = 22,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunction                                            = 23,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal                                              = 24,
+    CUPTI_DRIVER_TRACE_CBID_cu64ModuleGetGlobal                                            = 25,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetTexRef                                              = 26,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo                                                   = 27,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetInfo                                                 = 28,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc                                                     = 29,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAlloc                                                   = 30,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch                                                = 31,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAllocPitch                                              = 32,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree                                                      = 33,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemFree                                                    = 34,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange                                           = 35,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetAddressRange                                         = 36,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost                                                 = 37,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeHost                                                  = 38,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc                                                 = 39,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer                                      = 40,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostGetDevicePointer                                    = 41,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetFlags                                              = 42,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD                                                   = 43,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoD                                                 = 44,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH                                                   = 45,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoH                                                 = 46,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD                                                   = 47,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoD                                                 = 48,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA                                                   = 49,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoA                                                 = 50,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD                                                   = 51,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyAtoD                                                 = 52,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA                                                   = 53,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH                                                   = 54,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA                                                   = 55,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D                                                     = 56,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned                                            = 57,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D                                                     = 58,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3D                                                   = 59,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync                                              = 60,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoDAsync                                            = 61,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync                                              = 62,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoHAsync                                            = 63,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync                                              = 64,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoDAsync                                            = 65,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync                                              = 66,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync                                              = 67,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync                                                = 68,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync                                                = 69,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3DAsync                                              = 70,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8                                                     = 71,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8                                                   = 72,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16                                                    = 73,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16                                                  = 74,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32                                                    = 75,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32                                                  = 76,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8                                                   = 77,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8                                                 = 78,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16                                                  = 79,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16                                                = 80,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32                                                  = 81,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32                                                = 82,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetBlockShape                                            = 83,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedSize                                            = 84,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetAttribute                                             = 85,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetCacheConfig                                           = 86,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate                                                  = 87,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor                                           = 88,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayDestroy                                                 = 89,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate                                                = 90,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor                                         = 91,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefCreate                                                 = 92,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefDestroy                                                = 93,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetArray                                               = 94,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress                                             = 95,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress                                           = 96,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D                                           = 97,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress2D                                         = 98,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFormat                                              = 99,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddressMode                                         = 100,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFilterMode                                          = 101,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFlags                                               = 102,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress                                             = 103,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefGetAddress                                           = 104,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetArray                                               = 105,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddressMode                                         = 106,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFilterMode                                          = 107,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFormat                                              = 108,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFlags                                               = 109,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetSize                                                 = 110,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSeti                                                    = 111,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetf                                                    = 112,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetv                                                    = 113,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetTexRef                                               = 114,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunch                                                       = 115,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid                                                   = 116,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync                                              = 117,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreate                                                  = 118,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord                                                  = 119,
+    CUPTI_DRIVER_TRACE_CBID_cuEventQuery                                                   = 120,
+    CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize                                             = 121,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy                                                 = 122,
+    CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime                                             = 123,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreate                                                 = 124,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery                                                  = 125,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize                                            = 126,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy                                                = 127,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnregisterResource                                   = 128,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsSubResourceGetMappedArray                            = 129,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer                             = 130,
+    CUPTI_DRIVER_TRACE_CBID_cu64GraphicsResourceGetMappedPointer                           = 131,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags                                  = 132,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources                                         = 133,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources                                       = 134,
+    CUPTI_DRIVER_TRACE_CBID_cuGetExportTable                                               = 135,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetLimit                                                  = 136,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetLimit                                                  = 137,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevice                                               = 138,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate                                               = 139,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D10RegisterResource                                = 140,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10RegisterResource                                        = 141,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnregisterResource                                      = 142,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10MapResources                                            = 143,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnmapResources                                          = 144,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceSetMapFlags                                     = 145,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedArray                                  = 146,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer                                = 147,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize                                   = 148,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch                                  = 149,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions                            = 150,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevice                                               = 151,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate                                               = 152,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D11RegisterResource                                = 153,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevice                                                = 154,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate                                                = 155,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D9RegisterResource                                 = 156,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDirect3DDevice                                        = 157,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterResource                                         = 158,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterResource                                       = 159,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapResources                                             = 160,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapResources                                           = 161,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceSetMapFlags                                      = 162,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions                             = 163,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedArray                                   = 164,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer                                 = 165,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize                                    = 166,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch                                   = 167,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9Begin                                                    = 168,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9End                                                      = 169,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterVertexBuffer                                     = 170,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer                                          = 171,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapVertexBuffer                                        = 172,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterVertexBuffer                                   = 173,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate                                                  = 174,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterBuffer                                     = 175,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterImage                                      = 176,
+    CUPTI_DRIVER_TRACE_CBID_cuWGLGetDevice                                                 = 177,
+    CUPTI_DRIVER_TRACE_CBID_cuGLInit                                                       = 178,
+    CUPTI_DRIVER_TRACE_CBID_cuGLRegisterBufferObject                                       = 179,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject                                            = 180,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObject                                          = 181,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnregisterBufferObject                                     = 182,
+    CUPTI_DRIVER_TRACE_CBID_cuGLSetBufferObjectMapFlags                                    = 183,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync                                       = 184,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObjectAsync                                     = 185,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUGetDevice                                               = 186,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate                                               = 187,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterVideoSurface                            = 188,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterOutputSurface                           = 189,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetSurfRef                                             = 190,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefCreate                                                = 191,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefDestroy                                               = 192,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetFormat                                             = 193,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetArray                                              = 194,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetFormat                                             = 195,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetArray                                              = 196,
+    CUPTI_DRIVER_TRACE_CBID_cu64DeviceTotalMem                                             = 197,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPointer                              = 198,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedSize                                 = 199,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPitch                                = 200,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetSurfaceDimensions                          = 201,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetSurfaceDimensions                           = 202,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPointer                               = 203,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedSize                                  = 204,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPitch                                 = 205,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9MapVertexBuffer                                        = 206,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObject                                          = 207,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObjectAsync                                     = 208,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevices                                              = 209,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreateOnDevice                                       = 210,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevices                                              = 211,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreateOnDevice                                       = 212,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevices                                               = 213,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreateOnDevice                                        = 214,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostAlloc                                               = 215,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async                                                = 216,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8Async                                              = 217,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async                                               = 218,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16Async                                             = 219,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async                                               = 220,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32Async                                             = 221,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async                                              = 222,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8Async                                            = 223,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async                                             = 224,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16Async                                           = 225,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async                                             = 226,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32Async                                           = 227,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayCreate                                                = 228,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayGetDescriptor                                         = 229,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DCreate                                              = 230,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DGetDescriptor                                       = 231,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2D                                                   = 232,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DUnaligned                                          = 233,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DAsync                                              = 234,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v2                                                 = 235,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate_v2                                            = 236,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate_v2                                            = 237,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate_v2                                             = 238,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate_v2                                               = 239,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate_v2                                            = 240,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal_v2                                           = 241,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo_v2                                                = 242,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2                                                  = 243,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2                                             = 244,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2                                                   = 245,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange_v2                                        = 246,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer_v2                                   = 247,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_v2                                                    = 248,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2                                                  = 249,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2                                                 = 250,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2                                                 = 251,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2                                                = 252,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2                                               = 253,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2                                               = 254,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress_v2                                          = 255,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v2                                        = 256,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress_v2                                          = 257,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer_v2                          = 258,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem_v2                                            = 259,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer_v2                             = 260,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize_v2                                = 261,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch_v2                               = 262,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions_v2                         = 263,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions_v2                          = 264,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer_v2                              = 265,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize_v2                                 = 266,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch_v2                                = 267,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer_v2                                       = 268,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2                                         = 269,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2                                    = 270,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc_v2                                              = 271,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate_v2                                               = 272,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor_v2                                        = 273,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate_v2                                             = 274,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor_v2                                      = 275,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2                                                = 276,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2                                           = 277,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2                                                = 278,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2                                           = 279,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2                                                = 280,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2                                           = 281,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2                                                = 282,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2                                           = 283,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2                                                = 284,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2                                                = 285,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2                                                = 286,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2                                                  = 287,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2                                         = 288,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2                                             = 289,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2                                                  = 290,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2                                             = 291,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2                                                = 292,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2                                           = 293,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost_v2                                              = 294,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent                                              = 295,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetApiVersion                                             = 296,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDirect3DDevice                                       = 297,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDirect3DDevice                                       = 298,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCacheConfig                                            = 299,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCacheConfig                                            = 300,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister                                              = 301,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostUnregister                                            = 302,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent                                                = 303,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCurrent                                                = 304,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy                                                       = 305,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync                                                  = 306,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel                                                 = 307,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStart                                                = 308,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStop                                                 = 309,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttribute                                          = 310,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerInitialize                                           = 311,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceCanAccessPeer                                          = 312,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxEnablePeerAccess                                          = 313,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDisablePeerAccess                                         = 314,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerRegister                                              = 315,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerUnregister                                            = 316,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerGetDevicePointer                                      = 317,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer                                                   = 318,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync                                              = 319,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer                                                 = 320,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync                                            = 321,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2                                                = 322,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent_v2                                            = 323,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent_v2                                             = 324,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2                                              = 325,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy_v2                                             = 326,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v3                                        = 327,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetMemHandle                                              = 328,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle                                             = 329,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcCloseMemHandle                                            = 330,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetByPCIBusId                                          = 331,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetPCIBusId                                            = 332,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices                                                 = 333,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetEventHandle                                            = 334,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenEventHandle                                           = 335,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetSharedMemConfig                                        = 336,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetSharedMemConfig                                        = 337,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedMemConfig                                       = 338,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectCreate                                              = 339,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectDestroy                                             = 340,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceDesc                                     = 341,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetTextureDesc                                      = 342,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectCreate                                             = 343,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectDestroy                                            = 344,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectGetResourceDesc                                    = 345,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback                                            = 346,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayCreate                                         = 347,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetLevel                                       = 348,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayDestroy                                        = 349,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmappedArray                                      = 350,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapFilterMode                                    = 351,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelBias                                     = 352,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelClamp                                    = 353,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMaxAnisotropy                                       = 354,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmappedArray                                      = 355,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapFilterMode                                    = 356,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelBias                                     = 357,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelClamp                                    = 358,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMaxAnisotropy                                       = 359,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedMipmappedArray                      = 360,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceViewDesc                                 = 361,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate                                                   = 362,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData                                                  = 363,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile                                                  = 364,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkComplete                                                 = 365,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkDestroy                                                  = 366,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreateWithPriority                                     = 367,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority                                            = 368,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags                                               = 369,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetStreamPriorityRange                                    = 370,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocManaged                                              = 371,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorString                                               = 372,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorName                                                 = 373,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessor                    = 374,
+    CUPTI_DRIVER_TRACE_CBID_cuCompilePtx                                                   = 375,
+    CUPTI_DRIVER_TRACE_CBID_cuBinaryFree                                                   = 376,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync                                         = 377,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerSetAttribute                                          = 378,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister_v2                                           = 379,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags_v2                               = 380,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate_v2                                                = 381,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData_v2                                               = 382,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile_v2                                               = 383,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSize                               = 384,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices_v2                                              = 385,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRetain                                       = 386,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease                                      = 387,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags                                     = 388,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset                                        = 389,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsEGLRegisterImage                                     = 390,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetFlags                                                  = 391,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxGetState                                     = 392,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnect                                     = 393,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerDisconnect                                  = 394,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerAcquireFrame                                = 395,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerReleaseFrame                                = 396,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2_ptds                                           = 397,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2_ptds                                           = 398,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2_ptds                                           = 399,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2_ptds                                           = 400,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2_ptds                                           = 401,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2_ptds                                           = 402,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2_ptds                                           = 403,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2_ptds                                           = 404,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2_ptds                                             = 405,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2_ptds                                    = 406,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2_ptds                                             = 407,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_ptds                                                  = 408,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer_ptds                                              = 409,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer_ptds                                            = 410,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2_ptds                                             = 411,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2_ptds                                            = 412,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2_ptds                                            = 413,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2_ptds                                           = 414,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2_ptds                                          = 415,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2_ptds                                          = 416,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2_ptds                                    = 417,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync_ptsz                                             = 418,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2_ptsz                                      = 419,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2_ptsz                                      = 420,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2_ptsz                                      = 421,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2_ptsz                                      = 422,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2_ptsz                                      = 423,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2_ptsz                                        = 424,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2_ptsz                                        = 425,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync_ptsz                                         = 426,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync_ptsz                                       = 427,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async_ptsz                                           = 428,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async_ptsz                                          = 429,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async_ptsz                                          = 430,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async_ptsz                                         = 431,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async_ptsz                                        = 432,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async_ptsz                                        = 433,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority_ptsz                                       = 434,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags_ptsz                                          = 435,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent_ptsz                                         = 436,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback_ptsz                                       = 437,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync_ptsz                                    = 438,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery_ptsz                                             = 439,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize_ptsz                                       = 440,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord_ptsz                                             = 441,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz                                            = 442,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources_ptsz                                    = 443,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources_ptsz                                  = 444,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2_ptsz                               = 445,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerConnect                                     = 446,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerDisconnect                                  = 447,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerPresentFrame                                = 448,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedEglFrame                            = 449,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttributes                                         = 450,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags           = 451,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSizeWithFlags                      = 452,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerReturnFrame                                 = 453,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetP2PAttribute                                        = 454,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetBorderColor                                         = 455,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetBorderColor                                         = 456,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAdvise                                                    = 457,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32                                            = 458,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_ptsz                                       = 459,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32                                           = 460,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_ptsz                                      = 461,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp                                             = 462,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_ptsz                                        = 463,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNbufferGetPointer                                          = 464,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNtextureGetArray                                           = 465,
+    CUPTI_DRIVER_TRACE_CBID_cuNNSetAllocator                                               = 466,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync                                             = 467,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_ptsz                                        = 468,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromNVNSync                                       = 469,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnectWithFlags                            = 470,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttribute                                         = 471,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttributes                                        = 472,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64                                            = 473,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_ptsz                                       = 474,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64                                           = 475,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_ptsz                                      = 476,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel                                      = 477,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz                                 = 478,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromEGLSync                                       = 479,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice                           = 480,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetAttribute                                             = 481,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid                                                = 482,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx                                                 = 483,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_ptsz                                            = 484,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalMemory                                         = 485,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedBuffer                                = 486,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedMipmappedArray                        = 487,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalMemory                                        = 488,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalSemaphore                                      = 489,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync                                = 490,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync_ptsz                           = 491,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync                                  = 492,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync_ptsz                             = 493,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalSemaphore                                     = 494,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture                                           = 495,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_ptsz                                      = 496,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture                                             = 497,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture_ptsz                                        = 498,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing                                            = 499,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing_ptsz                                       = 500,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphCreate                                                  = 501,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode                                           = 502,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams                                     = 503,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemcpyNode                                           = 504,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeGetParams                                     = 505,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemsetNode                                           = 506,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeGetParams                                     = 507,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeSetParams                                     = 508,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetType                                             = 509,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetRootNodes                                            = 510,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies                                     = 511,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes                                   = 512,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate                                             = 513,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch                                                  = 514,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz                                             = 515,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecDestroy                                             = 516,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroy                                                 = 517,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies                                         = 518,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies                                      = 519,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeSetParams                                     = 520,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams                                     = 521,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroyNode                                             = 522,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphClone                                                   = 523,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeFindInClone                                         = 524,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddChildGraphNode                                       = 525,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEmptyNode                                            = 526,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc                                               = 527,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc_ptsz                                          = 528,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphChildGraphNodeGetGraph                                  = 529,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddHostNode                                             = 530,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeGetParams                                       = 531,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetLuid                                                = 532,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeSetParams                                       = 533,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetNodes                                                = 534,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges                                                = 535,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo                                         = 536,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_ptsz                                    = 537,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams                                 = 538,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2                                        = 539,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2_ptsz                                   = 540,
+    CUPTI_DRIVER_TRACE_CBID_cuThreadExchangeStreamCaptureMode                              = 541,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetNvSciSyncAttributes                                 = 542,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyAvailableDynamicSMemPerBlock                        = 543,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease_v2                                   = 544,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset_v2                                     = 545,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags_v2                                  = 546,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressReserve                                            = 547,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressFree                                               = 548,
+    CUPTI_DRIVER_TRACE_CBID_cuMemCreate                                                    = 549,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRelease                                                   = 550,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMap                                                       = 551,
+    CUPTI_DRIVER_TRACE_CBID_cuMemUnmap                                                     = 552,
+    CUPTI_DRIVER_TRACE_CBID_cuMemSetAccess                                                 = 553,
+    CUPTI_DRIVER_TRACE_CBID_cuMemExportToShareableHandle                                   = 554,
+    CUPTI_DRIVER_TRACE_CBID_cuMemImportFromShareableHandle                                 = 555,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationGranularity                                  = 556,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationPropertiesFromHandle                         = 557,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAccess                                                 = 558,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags                                               = 559,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags_ptsz                                          = 560,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate                                              = 561,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemcpyNodeSetParams                                 = 562,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemsetNodeSetParams                                 = 563,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecHostNodeSetParams                                   = 564,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRetainAllocationHandle                                    = 565,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetModule                                                = 566,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle_v2                                          = 567,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxResetPersistingL2Cache                                    = 568,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeCopyAttributes                                = 569,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetAttribute                                  = 570,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetAttribute                                  = 571,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes                                         = 572,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes_ptsz                                    = 573,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute                                           = 574,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute_ptsz                                      = 575,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute                                           = 576,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute_ptsz                                      = 577,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate_v2                                          = 578,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetTexture1DLinearMaxWidth                             = 579,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload                                                  = 580,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload_ptsz                                             = 581,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetSparseProperties                                     = 582,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetSparseProperties                            = 583,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync                                             = 584,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync_ptsz                                        = 585,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecChildGraphNodeSetParams                             = 586,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags                                         = 587,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags_ptsz                                    = 588,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventRecordNode                                      = 589,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventWaitNode                                        = 590,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeGetEvent                                 = 591,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeGetEvent                                   = 592,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeSetEvent                                 = 593,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeSetEvent                                   = 594,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventRecordNodeSetEvent                             = 595,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventWaitNodeSetEvent                               = 596,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetPlane                                                = 597,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync                                                = 598,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync_ptsz                                           = 599,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync                                                 = 600,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync_ptsz                                            = 601,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolTrimTo                                                = 602,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAttribute                                          = 603,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAttribute                                          = 604,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAccess                                             = 605,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDefaultMemPool                                      = 606,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolCreate                                                = 607,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolDestroy                                               = 608,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetMemPool                                             = 609,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetMemPool                                             = 610,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync                                        = 611,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync_ptsz                                   = 612,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportToShareableHandle                               = 613,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportFromShareableHandle                             = 614,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportPointer                                         = 615,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportPointer                                         = 616,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAccess                                             = 617,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresSignalNode                         = 618,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeGetParams                   = 619,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeSetParams                   = 620,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresWaitNode                           = 621,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeGetParams                     = 622,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeSetParams                     = 623,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresSignalNodeSetParams               = 624,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresWaitNodeSetParams                 = 625,
+    CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress                                               = 626,
+    CUPTI_DRIVER_TRACE_CBID_cuFlushGPUDirectRDMAWrites                                     = 627,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDebugDotPrint                                           = 628,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2                                      = 629,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2_ptsz                                 = 630,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies                              = 631,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_ptsz                         = 632,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectCreate                                             = 633,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRetain                                             = 634,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRelease                                            = 635,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRetainUserObject                                        = 636,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphReleaseUserObject                                       = 637,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemAllocNode                                         = 638,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemFreeNode                                          = 639,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGraphMemTrim                                           = 640,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetGraphMemAttribute                                   = 641,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetGraphMemAttribute                                   = 642,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithFlags                                    = 643,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetExecAffinitySupport                                 = 644,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v3                                                 = 645,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetExecAffinity                                           = 646,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid_v2                                             = 647,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemAllocNodeGetParams                                   = 648,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemFreeNodeGetParams                                    = 649,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetEnabled                                          = 650,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetEnabled                                          = 651,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx                                               = 652,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz                                          = 653,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetMemoryRequirements                                   = 654,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetMemoryRequirements                          = 655,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams                                   = 656,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams_ptsz                              = 657,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecGetFlags                                            = 658,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2                                         = 659,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2_ptsz                                    = 660,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2                                         = 661,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2_ptsz                                    = 662,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2                                        = 663,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2_ptsz                                   = 664,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2                                        = 665,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2_ptsz                                   = 666,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2                                          = 667,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2_ptsz                                     = 668,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddBatchMemOpNode                                       = 669,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeGetParams                                 = 670,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeSetParams                                 = 671,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecBatchMemOpNodeSetParams                             = 672,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetLoadingMode                                         = 673,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetHandleForAddressRange                                  = 674,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialClusterSize                             = 675,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveClusters                                   = 676,
+    CUPTI_DRIVER_TRACE_CBID_SIZE                                                           = 677,
+    CUPTI_DRIVER_TRACE_CBID_FORCE_INT                                                      = 0x7fffffff
+} CUpti_driver_api_trace_cbid;

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h ADDED Viewed

	@@ -0,0 +1,825 @@

+/*
+ * Copyright 2011-2020   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(_CUPTI_METRIC_H_)
+#define _CUPTI_METRIC_H_
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+/**
+ * \defgroup CUPTI_METRIC_API CUPTI Metric API
+ * Functions, types, and enums that implement the CUPTI Metric API.
+ *
+ * \note The CUPTI metric APIs declared in cupti_metrics.h are not supported on
+ * devices with compute capability 7.5 and higher (i.e. Turing and later GPU
+ * architectures). These APIs will be deprecated in a future CUDA release. They
+ * are replaced by the Profiling API in the header cupti_profiler_target.h and
+ * the Perfworks metrics API in the headers nvperf_host.h and nvperf_target.h,
+ * which are supported on devices with compute capability 7.0 and higher (i.e.
+ * Volta and later GPU architectures).
+ *
+ * @{
+ */
+/**
+ * \brief ID for a metric.
+ *
+ * A metric provides a measure of some aspect of the device. Metric
+ * IDs are 32-bit unsigned integers.
+ */
+typedef uint32_t CUpti_MetricID;
+/**
+ * \brief A metric category.
+ *
+ * Each metric is assigned to a category that represents the general
+ * type of the metric. A metric's category is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_CATEGORY
+ * attribute.
+ */
+typedef enum {
+  /**
+   * A memory related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MEMORY          = 0,
+  /**
+   * An instruction related metric.
+   */
+  CUPTI_METRIC_CATEGORY_INSTRUCTION     = 1,
+  /**
+   * A multiprocessor related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MULTIPROCESSOR  = 2,
+  /**
+   * A cache related metric.
+   */
+  CUPTI_METRIC_CATEGORY_CACHE           = 3,
+  /**
+   * A texture related metric.
+   */
+  CUPTI_METRIC_CATEGORY_TEXTURE         = 4,
+  /**
+   * An NVLink related metric.
+   */
+  CUPTI_METRIC_CATEGORY_NVLINK          = 5,
+  /**
+   * A PCIe related metric.
+   */
+  CUPTI_METRIC_CATEGORY_PCIE           = 6,
+  CUPTI_METRIC_CATEGORY_FORCE_INT                         = 0x7fffffff,
+} CUpti_MetricCategory;
+/**
+ * \brief A metric evaluation mode.
+ *
+ * A metric can be evaluated per hardware instance to know the load balancing
+ * across instances of a domain or the metric can be evaluated in aggregate mode
+ * when the events involved in metric evaluation are from different event
+ * domains. It might be possible to evaluate some metrics in both
+ * modes for convenience. A metric's evaluation mode is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_EVALUATION_MODE
+ * attribute.
+ *
+ * The enumerators are bit flags, so a metric that supports both modes has
+ * both bits set.
+ */
+typedef enum {
+  /**
+   * If this bit is set, the metric can be profiled for each instance of the
+   * domain. The event values passed to \ref cuptiMetricGetValue can contain
+   * values for one instance of the domain. And \ref cuptiMetricGetValue can
+   * be called for each instance.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE         = 1,
+  /**
+   * If this bit is set, the metric can be profiled over all instances. The
+   * event values passed to \ref cuptiMetricGetValue can be aggregated values
+   * of events for all instances of the domain.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_AGGREGATE            = 1 << 1,
+  CUPTI_METRIC_EVALUATION_MODE_FORCE_INT            = 0x7fffffff,
+} CUpti_MetricEvaluationMode;
+/**
+ * \brief Kinds of metric values.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef enum {
+  /**
+   * The metric value is a 64-bit double.
+   */
+  CUPTI_METRIC_VALUE_KIND_DOUBLE            = 0,
+  /**
+   * The metric value is a 64-bit unsigned integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_UINT64            = 1,
+  /**
+   * The metric value is a percentage represented by a 64-bit
+   * double. For example, 57.5% is represented by the value 57.5.
+   */
+  CUPTI_METRIC_VALUE_KIND_PERCENT           = 2,
+  /**
+   * The metric value is a throughput represented by a 64-bit
+   * unsigned integer. The unit for throughput values is bytes/second.
+   */
+  CUPTI_METRIC_VALUE_KIND_THROUGHPUT        = 3,
+  /**
+   * The metric value is a 64-bit signed integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_INT64             = 4,
+  /**
+   * The metric value is a utilization level, as represented by
+   * CUpti_MetricValueUtilizationLevel.
+   */
+  CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL = 5,
+  CUPTI_METRIC_VALUE_KIND_FORCE_INT  = 0x7fffffff
+} CUpti_MetricValueKind;
+/**
+ * \brief Enumeration of utilization levels for metric values of kind
+ * CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
+ *
+ * Utilization values can vary from IDLE (0) to MAX (10), but the
+ * enumeration only provides specific names for a few values; the
+ * levels in between the named ones (for example 1, 3 or 9) have no
+ * enumerator of their own.
+ */
+typedef enum {
+  CUPTI_METRIC_VALUE_UTILIZATION_IDLE      = 0,
+  CUPTI_METRIC_VALUE_UTILIZATION_LOW       = 2,
+  CUPTI_METRIC_VALUE_UTILIZATION_MID       = 5,
+  CUPTI_METRIC_VALUE_UTILIZATION_HIGH      = 8,
+  CUPTI_METRIC_VALUE_UTILIZATION_MAX       = 10,
+  CUPTI_METRIC_VALUE_UTILIZATION_FORCE_INT = 0x7fffffff
+} CUpti_MetricValueUtilizationLevel;
+/**
+ * \brief Metric attributes.
+ *
+ * Metric attributes describe properties of a metric. These attributes
+ * can be read using \ref cuptiMetricGetAttribute. The comment on each
+ * enumerator gives the type of the value returned for that attribute.
+ */
+typedef enum {
+  /**
+   * Metric name. Value is a null-terminated const C string.
+   */
+  CUPTI_METRIC_ATTR_NAME              = 0,
+  /**
+   * Short description of metric. Value is a null-terminated const C string.
+   */
+  CUPTI_METRIC_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of metric. Value is a null-terminated const C string.
+   */
+  CUPTI_METRIC_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of the metric. Value is of type CUpti_MetricCategory.
+   */
+  CUPTI_METRIC_ATTR_CATEGORY          = 3,
+  /**
+   * Value type of the metric. Value is of type CUpti_MetricValueKind.
+   */
+  CUPTI_METRIC_ATTR_VALUE_KIND          = 4,
+  /**
+   * Metric evaluation mode. Value is of type CUpti_MetricEvaluationMode.
+   */
+  CUPTI_METRIC_ATTR_EVALUATION_MODE     = 5,
+  CUPTI_METRIC_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_MetricAttribute;
+/**
+ * \brief A metric value.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind
+ * (see CUpti_MetricValueKind).
+ */
+typedef union {
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_DOUBLE.
+   */
+  double metricValueDouble;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UINT64.
+   */
+  uint64_t metricValueUint64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_INT64.
+   */
+  int64_t metricValueInt64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_PERCENT. For example, 57.5% is
+   * represented by the value 57.5.
+   */
+  double metricValuePercent;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_THROUGHPUT.  The unit for
+   * throughput values is bytes/second.
+   */
+  uint64_t metricValueThroughput;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
+   */
+  CUpti_MetricValueUtilizationLevel metricValueUtilizationLevel;
+} CUpti_MetricValue;
+/**
+ * \brief Device class.
+ *
+ * Enumeration of device classes for the metric property
+ * CUPTI_METRIC_PROPERTY_DEVICE_CLASS. Enumerator values are assigned
+ * explicitly, from 0 through 3.
+ */
+typedef enum {
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TESLA          = 0,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_QUADRO         = 1,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_GEFORCE        = 2,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TEGRA          = 3,
+} CUpti_MetricPropertyDeviceClass;
+/**
+ * \brief Metric device properties.
+ *
+ * Metric device properties describe device properties which are needed for a metric.
+ * Some of these properties can be collected using cuDeviceGetAttribute.
+ *
+ * Enumerator values are implicit: they start at 0 and follow declaration order.
+ */
+typedef enum {
+  /*
+   * Number of multiprocessors on a device.  This can be collected
+   * using value of \param CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_MULTIPROCESSOR_COUNT,
+  /*
+   * Maximum number of warps on a multiprocessor. This can be
+   * collected using ratio of value of \param
+   * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR and \param
+   * CU_DEVICE_ATTRIBUTE_WARP_SIZE of cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_WARPS_PER_MULTIPROCESSOR,
+  /*
+   * GPU Time for kernel in ns. This should be profiled using CUPTI
+   * Activity API.
+   */
+  CUPTI_METRIC_PROPERTY_KERNEL_GPU_TIME,
+  /*
+   * Clock rate for device in kHz.  This should be collected using
+   * value of \param CU_DEVICE_ATTRIBUTE_CLOCK_RATE of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_CLOCK_RATE,
+  /*
+   * Number of Frame buffer units for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FRAME_BUFFER_COUNT,
+  /*
+   * Global memory bandwidth in KBytes/sec. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH
+   * of cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GLOBAL_MEMORY_BANDWIDTH,
+  /*
+   * PCIE link rate in Mega bits/sec. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_RATE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_RATE,
+  /*
+   * PCIE link width for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_WIDTH,
+  /*
+   * PCIE generation for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_GEN of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_GEN,
+  /*
+   * The device class. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_DEVICE_CLASS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS,
+  /*
+   * Peak single precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_SP_PER_CYCLE,
+  /*
+   * Peak double precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_DP_PER_CYCLE,
+  /*
+   * Number of L2 units on a device. This can be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_L2_UNITS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_L2_UNITS,
+  /*
+   * Whether ECC support is enabled on the device. This can be
+   * collected using value of \param CU_DEVICE_ATTRIBUTE_ECC_ENABLED of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_ECC_ENABLED,
+  /*
+   * Peak half precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_HP_PER_CYCLE,
+  /*
+   * NVLINK Bandwidth for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GPU_CPU_NVLINK_BANDWIDTH,
+} CUpti_MetricPropertyID;
+/**
+ * \brief Get the total number of metrics available on any device.
+ *
+ * Returns the total number of metrics available on any CUDA-capable
+ * device.
+ *
+ * \param numMetrics Returns the number of metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+ *
+ * \see cuptiEnumMetrics
+ */
+CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics);
+/**
+ * \brief Get all the metrics available on any device.
+ *
+ * Returns the metric IDs in \p metricArray for all CUDA-capable
+ * devices.  The size of the \p metricArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p metricArray buffer must be at
+ * least \p numMetrics * sizeof(CUpti_MetricID); otherwise not all
+ * metric IDs will be returned. The value returned in \p
+ * *arraySizeBytes contains the number of bytes returned in \p
+ * metricArray.
+ *
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
+                                      CUpti_MetricID *metricArray);
+/**
+ * \brief Get the number of metrics for a device.
+ *
+ * Returns the number of metrics available for a device.
+ *
+ * \param device The CUDA device
+ * \param numMetrics Returns the number of metrics available for the
+ * device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+ *
+ * \see cuptiDeviceEnumMetrics
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
+                                              uint32_t *numMetrics);
+/**
+ * \brief Get the metrics for a device.
+ *
+ * Returns the metric IDs in \p metricArray for a device.  The size of
+ * the \p metricArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p metricArray buffer must be at least \p numMetrics *
+ * sizeof(CUpti_MetricID); otherwise not all metric IDs will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p metricArray.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
+                                            size_t *arraySizeBytes,
+                                            CUpti_MetricID *metricArray);
+/**
+ * \brief Get a metric attribute.
+ *
+ * Returns a metric attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a C string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ *
+ * \param metric ID of the metric
+ * \param attrib The metric attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a metric attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-C-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ *
+ * \see CUpti_MetricAttribute for the value type of each attribute
+ */
+CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
+                                             CUpti_MetricAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+/**
+ * \brief Find a metric by name.
+ *
+ * Find a metric by name and return the metric ID in \p *metric.
+ *
+ * \param device The CUDA device
+ * \param metricName The name of the metric to find
+ * \param metric Returns the ID of the found metric or undefined if
+ * unable to find the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if unable to find a metric
+ * with name \p metricName. In this case \p *metric is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricName or \p
+ * metric are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
+                                              const char *metricName,
+                                              CUpti_MetricID *metric);
+/**
+ * \brief Get the number of events required to calculate a metric.
+ *
+ * Returns the number of events in \p numEvents that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numEvents Returns the number of events required for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ *
+ * \see cuptiMetricEnumEvents
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
+                                             uint32_t *numEvents);
+/**
+ * \brief Get the events required to calculate a metric.
+ *
+ * Gets the event IDs in \p eventIdArray required to calculate a \p
+ * metric. The size of the \p eventIdArray buffer is given by \p
+ * *eventIdArraySizeBytes and must be at least \p numEvents *
+ * sizeof(CUpti_EventID); otherwise not all events will be returned.
+ * The value returned in \p *eventIdArraySizeBytes contains the number
+ * of bytes returned in \p eventIdArray.
+ *
+ * \param metric ID of the metric
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArraySizeBytes or \p
+ * eventIdArray are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
+                                           size_t *eventIdArraySizeBytes,
+                                           CUpti_EventID *eventIdArray);
+/**
+ * \brief Get the number of properties required to calculate a metric.
+ *
+ * Returns the number of properties in \p numProp that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numProp Returns the number of properties required for the
+ * metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numProp is NULL
+ *
+ * \see cuptiMetricEnumProperties
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
+                                                 uint32_t *numProp);
+/**
+ * \brief Get the properties required to calculate a metric.
+ *
+ * Gets the property IDs in \p propIdArray required to calculate a \p
+ * metric. The size of the \p propIdArray buffer is given by \p
+ * *propIdArraySizeBytes and must be at least \p numProp *
+ * sizeof(CUpti_MetricPropertyID); otherwise not all properties will
+ * be returned. The value returned in \p *propIdArraySizeBytes contains
+ * the number of bytes returned in \p propIdArray.
+ *
+ * \param metric ID of the metric
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes,
+ * and returns the number of bytes written to \p propIdArray
+ * \param propIdArray Returns the IDs of the properties required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p propIdArraySizeBytes or \p
+ * propIdArray are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
+                                               size_t *propIdArraySizeBytes,
+                                               CUpti_MetricPropertyID *propIdArray);
+/**
+ * \brief For a metric get the groups of events that must be collected
+ * in the same pass.
+ *
+ * For a metric get the groups of events that must be collected in the
+ * same pass to ensure that the metric is calculated correctly. If the
+ * events are not collected as specified then the metric value may be
+ * inaccurate.
+ *
+ * \p *eventGroupSets is set to NULL if a metric does not have any
+ * required event group. In this case the events needed for the metric
+ * can be grouped in any manner for collection.
+ *
+ * \param context The context for event collection
+ * \param metric The metric ID
+ * \param eventGroupSets Returns a CUpti_EventGroupSets object that
+ * indicates the events that must be collected in the same pass to
+ * ensure the metric is calculated correctly.  Returns NULL if no
+ * grouping is required for metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ */
+CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
+                                                          CUpti_MetricID metric,
+                                                          CUpti_EventGroupSets **eventGroupSets);
+/**
+ * \brief For a set of metrics, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events required for those metrics.
+ *
+ * For a set of metrics, get the grouping that indicates the number of
+ * passes and the event groups necessary to collect the events
+ * required for those metrics.
+ *
+ * \see cuptiEventGroupSetsCreate for details on event group set
+ * creation.
+ *
+ * \param context The context for event collection
+ * \param metricIdArraySizeBytes Size of \p metricIdArray in bytes
+ * \param metricIdArray Array of metric IDs
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
+                                                     size_t metricIdArraySizeBytes,
+                                                     CUpti_MetricID *metricIdArray,
+                                                     CUpti_EventGroupSets **eventGroupPasses);
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events collected for a metric to calculate the metric
+ * value. Metric value evaluation depends on the evaluation mode
+ * \ref CUpti_MetricEvaluationMode that the metric supports.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
+ * then it assumes that the input event value is for one domain instance.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE,
+ * it assumes that input event values are
+ * normalized to represent all domain instances on a device. For the
+ * most accurate metric collection, the events required for the metric
+ * should be collected for all profiled domain instances. For example,
+ * to collect all instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param device The CUDA device that the metric is being calculated for
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order
+ * of events in \p eventIdArray
+ * \param timeDuration The duration over which the events were
+ * collected, in ns
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_INVALID_METRIC_VALUE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
+                                         CUpti_MetricID metric,
+                                         size_t eventIdArraySizeBytes,
+                                         CUpti_EventID *eventIdArray,
+                                         size_t eventValueArraySizeBytes,
+                                         uint64_t *eventValueArray,
+                                         uint64_t timeDuration,
+                                         CUpti_MetricValue *metricValue);
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events and properties collected for a metric to calculate
+ * the metric value. Metric value evaluation depends on the evaluation
+ * mode \ref CUpti_MetricEvaluationMode that the metric supports.  If
+ * a metric has evaluation mode as
+ * CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, then it assumes that the
+ * input event value is for one domain instance.  If a metric has
+ * evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, it
+ * assumes that input event values are normalized to represent all
+ * domain instances on a device. For the most accurate metric
+ * collection, the events required for the metric should be collected
+ * for all profiled domain instances. For example, to collect all
+ * instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes
+ * \param propIdArray The metric property IDs required to calculate \p metric
+ * \param propValueArraySizeBytes The size of \p propValueArray in bytes
+ * \param propValueArray The metric property values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * metric properties in \p propIdArray
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for the metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
+                                          size_t eventIdArraySizeBytes,
+                                          CUpti_EventID *eventIdArray,
+                                          size_t eventValueArraySizeBytes,
+                                          uint64_t *eventValueArray,
+                                          size_t propIdArraySizeBytes,
+                                          CUpti_MetricPropertyID *propIdArray,
+                                          size_t propValueArraySizeBytes,
+                                          uint64_t *propValueArray,
+                                          CUpti_MetricValue *metricValue);
+/** @} */ /* END CUPTI_METRIC_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+#if defined(__cplusplus)
+}
+#endif
+#endif /*_CUPTI_METRIC_H_*/

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h ADDED Viewed

	@@ -0,0 +1,71 @@

+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_gl_interop.h"
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+// Currently used parameter trace structures
+typedef struct cudaGLGetDevices_v4010_params_st { // parameter trace structure for cudaGLGetDevices
+    unsigned int *pCudaDeviceCount;
+    int *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    enum cudaGLDeviceList deviceList;
+} cudaGLGetDevices_v4010_params;
+typedef struct cudaGraphicsGLRegisterImage_v3020_params_st { // parameter trace structure for cudaGraphicsGLRegisterImage
+    struct cudaGraphicsResource **resource;
+    GLuint image;
+    GLenum target;
+    unsigned int flags;
+} cudaGraphicsGLRegisterImage_v3020_params;
+typedef struct cudaGraphicsGLRegisterBuffer_v3020_params_st { // parameter trace structure for cudaGraphicsGLRegisterBuffer
+    struct cudaGraphicsResource **resource;
+    GLuint buffer;
+    unsigned int flags;
+} cudaGraphicsGLRegisterBuffer_v3020_params;
+typedef struct cudaGLSetGLDevice_v3020_params_st { // parameter trace structure for cudaGLSetGLDevice
+    int device;
+} cudaGLSetGLDevice_v3020_params;
+typedef struct cudaGLRegisterBufferObject_v3020_params_st { // parameter trace structure for cudaGLRegisterBufferObject
+    GLuint bufObj;
+} cudaGLRegisterBufferObject_v3020_params;
+typedef struct cudaGLMapBufferObject_v3020_params_st { // parameter trace structure for cudaGLMapBufferObject
+    void **devPtr;
+    GLuint bufObj;
+} cudaGLMapBufferObject_v3020_params;
+typedef struct cudaGLUnmapBufferObject_v3020_params_st { // parameter trace structure for cudaGLUnmapBufferObject
+    GLuint bufObj;
+} cudaGLUnmapBufferObject_v3020_params;
+typedef struct cudaGLUnregisterBufferObject_v3020_params_st { // parameter trace structure for cudaGLUnregisterBufferObject
+    GLuint bufObj;
+} cudaGLUnregisterBufferObject_v3020_params;
+typedef struct cudaGLSetBufferObjectMapFlags_v3020_params_st { // parameter trace structure for cudaGLSetBufferObjectMapFlags
+    GLuint bufObj;
+    unsigned int flags;
+} cudaGLSetBufferObjectMapFlags_v3020_params;
+typedef struct cudaGLMapBufferObjectAsync_v3020_params_st { // parameter trace structure for cudaGLMapBufferObjectAsync
+    void **devPtr;
+    GLuint bufObj;
+    cudaStream_t stream;
+} cudaGLMapBufferObjectAsync_v3020_params;
+typedef struct cudaGLUnmapBufferObjectAsync_v3020_params_st { // parameter trace structure for cudaGLUnmapBufferObjectAsync
+    GLuint bufObj;
+    cudaStream_t stream;
+} cudaGLUnmapBufferObjectAsync_v3020_params;
+// Parameter trace structures for removed functions
+// End of parameter trace structures

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py ADDED Viewed

File without changes

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (222 Bytes). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/common_functions.h ADDED Viewed

	@@ -0,0 +1,65 @@

+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard macro not set: warn, then set it only for the nested include below */
+#if defined(_MSC_VER) /* this branch uses #pragma message, the other uses #warning */
+#pragma message("common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* defined(_MSC_VER) */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__ /* marker: this wrapper defined the guard macro, so it must undefine it afterwards */
+#endif /* !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) */
+#include "crt/common_functions.h" /* forward to the crt/ header of the same name */
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__) /* only clean up what this wrapper itself defined */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
+#endif /* defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__) */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGL.h ADDED Viewed

	@@ -0,0 +1,659 @@

+/*
+ * Copyright 2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#ifndef CUDAEGL_H
+#define CUDAEGL_H
+#include "cuda.h"
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+  * \addtogroup CUDA_TYPES
+  * @{
+  */
+/**
+ * Maximum number of planes per frame; sizes the per-plane arrays
+ * (pArray / pPitch) in ::CUeglFrame.
+ */
+#define MAX_PLANES 3
+/**
+ * CUDA EglFrame type - array or pointer.
+ *
+ * Stored in the frameType field of ::CUeglFrame; presumably selects which
+ * member of the frame union (pArray or pPitch) is valid - confirm against
+ * the CUDA EGL interop documentation.
+ */
+typedef enum CUeglFrameType_enum {
+    CU_EGL_FRAME_TYPE_ARRAY = 0,  /**< Frame type CUDA array */
+    CU_EGL_FRAME_TYPE_PITCH = 1,  /**< Frame type pointer */
+} CUeglFrameType;
+/**
+ * Indicates that timeout for ::cuEGLStreamConsumerAcquireFrame is infinite.
+ */
+#define CUDA_EGL_INFINITE_TIMEOUT 0xFFFFFFFF
+/**
+ * Resource location flags - sysmem or vidmem
+ *
+ * For a CUDA context on iGPU, video and system memory are equivalent, so
+ * these flags will not have an effect on the execution.
+ *
+ * For a CUDA context on dGPU, applications can use the flag ::CUeglResourceLocationFlags
+ * to give a hint about the desired location.
+ *
+ * ::CU_EGL_RESOURCE_LOCATION_SYSMEM - the frame data is made resident on the system memory
+ * to be accessed by CUDA.
+ *
+ * ::CU_EGL_RESOURCE_LOCATION_VIDMEM - the frame data is made resident on the dedicated
+ * video memory to be accessed by CUDA.
+ *
+ * There may be additional latency due to new allocation and data migration
+ * if the frame is produced in a different memory.
+ */
+typedef enum CUeglResourceLocationFlags_enum {
+    CU_EGL_RESOURCE_LOCATION_SYSMEM   = 0x00,       /**< Resource location sysmem */
+    CU_EGL_RESOURCE_LOCATION_VIDMEM   = 0x01        /**< Resource location vidmem */
+} CUeglResourceLocationFlags;
+/**
+  * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
+  * Three channel formats are currently not supported for ::CU_EGL_FRAME_TYPE_ARRAY
+  */
+typedef enum CUeglColorFormat_enum {
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR              = 0x00,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR          = 0x01,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
+    CU_EGL_COLOR_FORMAT_YUV422_PLANAR              = 0x02,  /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR          = 0x03,  /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
+    CU_EGL_COLOR_FORMAT_RGB                        = 0x04,  /**< R/G/B three channels in one surface with BGR byte ordering. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_BGR                        = 0x05,  /**< R/G/B three channels in one surface with RGB byte ordering. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_ARGB                       = 0x06,  /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
+    CU_EGL_COLOR_FORMAT_RGBA                       = 0x07,  /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
+    CU_EGL_COLOR_FORMAT_L                          = 0x08,  /**< single luminance channel in one surface. */
+    CU_EGL_COLOR_FORMAT_R                          = 0x09,  /**< single color channel in one surface. */
+    CU_EGL_COLOR_FORMAT_YUV444_PLANAR              = 0x0A,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR          = 0x0B,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
+    CU_EGL_COLOR_FORMAT_YUYV_422                   = 0x0C,  /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_422                   = 0x0D,  /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
+    CU_EGL_COLOR_FORMAT_ABGR                       = 0x0E,  /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
+    CU_EGL_COLOR_FORMAT_BGRA                       = 0x0F,  /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
+    CU_EGL_COLOR_FORMAT_A                          = 0x10,  /**< Alpha color format - one channel in one surface. */
+    CU_EGL_COLOR_FORMAT_RG                         = 0x11,  /**< R/G color format - two channels in one surface with GR byte ordering */
+    CU_EGL_COLOR_FORMAT_AYUV                       = 0x12,  /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR          = 0x13,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR          = 0x14,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR          = 0x15,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR   = 0x16,  /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR   = 0x17,  /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR   = 0x18,  /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR   = 0x19,  /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_VYUY_ER                    = 0x1A,  /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_ER                    = 0x1B,  /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
+    CU_EGL_COLOR_FORMAT_YUYV_ER                    = 0x1C,  /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_YVYU_ER                    = 0x1D,  /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
+    CU_EGL_COLOR_FORMAT_YUV_ER                     = 0x1E,  /**< Extended Range Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_YUVA_ER                    = 0x1F,  /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    CU_EGL_COLOR_FORMAT_AYUV_ER                    = 0x20,  /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    CU_EGL_COLOR_FORMAT_YUV444_PLANAR_ER           = 0x21,  /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_PLANAR_ER           = 0x22,  /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_ER           = 0x23,  /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR_ER       = 0x24,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR_ER       = 0x25,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_ER       = 0x26,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU444_PLANAR_ER           = 0x27,  /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_PLANAR_ER           = 0x28,  /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_ER           = 0x29,  /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR_ER       = 0x2A,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR_ER       = 0x2B,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_ER       = 0x2C,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_BAYER_RGGB                 = 0x2D,  /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_BGGR                 = 0x2E,  /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_GRBG                 = 0x2F,  /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_GBRG                 = 0x30,  /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER10_RGGB               = 0x31,  /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_BGGR               = 0x32,  /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_GRBG               = 0x33,  /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_GBRG               = 0x34,  /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_RGGB               = 0x35,  /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_BGGR               = 0x36,  /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_GRBG               = 0x37,  /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_GBRG               = 0x38,  /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_RGGB               = 0x39,  /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_BGGR               = 0x3A,  /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_GRBG               = 0x3B,  /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_GBRG               = 0x3C,  /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_RGGB               = 0x3D,  /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_BGGR               = 0x3E,  /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_GRBG               = 0x3F,  /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_GBRG               = 0x40,  /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_YVU444_PLANAR              = 0x41,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_PLANAR              = 0x42,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR              = 0x43,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_RGGB             = 0x44,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_BGGR             = 0x45,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_GRBG             = 0x46,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_GBRG             = 0x47,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_BCCR                 = 0x48,  /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_RCCB                 = 0x49,  /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_CRBC                 = 0x4A,  /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_CBRC                 = 0x4B,  /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER10_CCCC               = 0x4C,  /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_BCCR               = 0x4D,  /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_RCCB               = 0x4E,  /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CRBC               = 0x4F,  /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CBRC               = 0x50,  /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CCCC               = 0x51,  /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_Y                          = 0x52, /**< Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020     = 0x53, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020     = 0x54, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_2020         = 0x55, /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_2020         = 0x56, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709      = 0x57, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709      = 0x58, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_709          = 0x59, /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_709          = 0x5A,  /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709  = 0x5B, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 = 0x5C, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_2020 = 0x5D, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height  = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR      = 0x5E, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height  = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_709  = 0x5F, /**< Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height  = Y height. */
+    CU_EGL_COLOR_FORMAT_Y_ER                          = 0x60, /**< Extended Range Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_Y_709_ER                      = 0x61, /**< Extended Range Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_Y10_ER                        = 0x62, /**< Extended Range Color format for single Y10 plane. */
+    CU_EGL_COLOR_FORMAT_Y10_709_ER                    = 0x63, /**< Extended Range Color format for single Y10 plane. */
+    CU_EGL_COLOR_FORMAT_Y12_ER                        = 0x64, /**< Extended Range Color format for single Y12 plane. */
+    CU_EGL_COLOR_FORMAT_Y12_709_ER                    = 0x65, /**< Extended Range Color format for single Y12 plane. */
+    CU_EGL_COLOR_FORMAT_YUVA                          = 0x66, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    CU_EGL_COLOR_FORMAT_YUV                           = 0x67, /**< Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_YVYU                          = 0x68, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
+    CU_EGL_COLOR_FORMAT_VYUY                          = 0x69, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER     = 0x6A, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709_ER = 0x6B, /**< Extended Range Y10, V10U10 in two surfaces(VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_ER     = 0x6C, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_709_ER = 0x6D, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface)  U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_ER     = 0x6E, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_709_ER = 0x6F, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_ER     = 0x70, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_709_ER = 0x71, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_MAX                             /**< One past the last color format value above (0x72). */
+} CUeglColorFormat;
+/**
+ * CUDA EGLFrame structure Descriptor - structure defining one frame of EGL.
+ *
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
+ */
+typedef struct CUeglFrame_st {
+    union {
+        CUarray pArray[MAX_PLANES];     /**< Array of CUarray handles, one entry per plane */
+        void*   pPitch[MAX_PLANES];     /**< Array of pointers, one entry per plane */
+    } frame;
+    unsigned int width;                 /**< Width of first plane */
+    unsigned int height;                /**< Height of first plane */
+    unsigned int depth;                 /**< Depth of first plane */
+    unsigned int pitch;                 /**< Pitch of first plane */
+    unsigned int planeCount;            /**< Number of planes */
+    unsigned int numChannels;           /**< Number of channels for the plane */
+    CUeglFrameType frameType;           /**< Frame type: array or pitch (see ::CUeglFrameType) */
+    CUeglColorFormat eglColorFormat;    /**< CUDA EGL color format (see ::CUeglColorFormat) */
+    CUarray_format cuFormat;            /**< CUDA array format */
+} CUeglFrame_v1;
+typedef CUeglFrame_v1 CUeglFrame;
+/**
+  * CUDA EGLStream Connection
+  */
+typedef struct CUeglStreamConnection_st* CUeglStreamConnection;
+/** @} */ /* END CUDA_TYPES */
+/**
+ * \file cudaEGL.h
+ * \brief Header file for the EGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+/**
+ * \defgroup CUDA_EGL EGL Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ EGL interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the EGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+/**
+ * \brief Registers an EGL image
+ *
+ * Registers the EGLImageKHR specified by \p image for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * Additional Mapping/Unmapping is not required for the registered resource and
+ * ::cuGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+ *
+ * The application will be responsible for synchronizing access to shared objects.
+ * The application must ensure that any pending operations which access the objects have completed
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
+ * The application will also be responsible for ensuring that any pending operation on the
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
+ * accessing the same memory objects.
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
+ *
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
+ * typedef void* EGLImageKHR
+ *
+ * \param pCudaResource   - Pointer to the returned object handle
+ * \param image           - An EGLImageKHR image which can be used to create target resource.
+ * \param flags           - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuGraphicsEGLRegisterImage, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources,
+ * ::cudaGraphicsEGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsEGLRegisterImage(CUgraphicsResource *pCudaResource, EGLImageKHR image, unsigned int flags);
+/**
+ * \brief Connect CUDA to EGLStream as a consumer.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p stream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn            - Pointer to the returned connection handle
+ * \param stream          - EGLStreamKHR handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerConnect
+ */
+CUresult CUDAAPI cuEGLStreamConsumerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream);
+/**
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p stream with specified \p flags defined by CUeglResourceLocationFlags.
+ *
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
+ * Default is ::CU_EGL_RESOURCE_LOCATION_VIDMEM.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param stream            - EGLStreamKHR handle
+ * \param flags             - Flags denote intended location - system or video.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerConnectWithFlags
+ */
+CUresult CUDAAPI cuEGLStreamConsumerConnectWithFlags(CUeglStreamConnection *conn, EGLStreamKHR stream, unsigned int flags);
+/**
+ * \brief Disconnect CUDA as a consumer to EGLStream.
+ *
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerDisconnect
+ */
+CUresult CUDAAPI cuEGLStreamConsumerDisconnect(CUeglStreamConnection *conn);
+/**
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+ *
+ * Acquire an image frame from EGLStreamKHR. This API can also acquire an old frame presented
+ * by the producer unless explicitly disabled by setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE
+ * during stream initialization. By default, EGLStream is created with this flag set to EGL_TRUE.
+ * ::cuGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ * ::CUeglFrame.
+ *
+ * \param conn            - Connection on which to acquire
+ * \param pCudaResource   - CUDA resource on which the stream frame will be mapped for use.
+ * \param pStream         - CUDA stream for synchronization and any data migrations
+ *                          implied by ::CUeglResourceLocationFlags.
+ * \param timeout         - Desired timeout in usec for a new frame to be acquired.
+ *                          If set as ::CUDA_EGL_INFINITE_TIMEOUT, acquire waits infinitely.
+ *                          After timeout occurs CUDA consumer tries to acquire an old frame
+ *                          if available and EGL_SUPPORT_REUSE_NV flag is set.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerAcquireFrame
+ */
+CUresult CUDAAPI cuEGLStreamConsumerAcquireFrame(CUeglStreamConnection *conn,
+                                                  CUgraphicsResource *pCudaResource, CUstream *pStream, unsigned int timeout);
+/**
+ * \brief Releases the last frame acquired from the EGLStream.
+ *
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ * If EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE at the time of EGL creation,
+ * this API doesn't release the last frame acquired on the EGLStream.
+ * By default, EGLStream is created with this flag set to EGL_TRUE.
+ *
+ * \param conn            - Connection on which to release
+ * \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ * \param pStream         - CUDA stream on which release will be done.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame
+ */
+CUresult CUDAAPI cuEGLStreamConsumerReleaseFrame(CUeglStreamConnection *conn,
+                                                  CUgraphicsResource pCudaResource, CUstream *pStream);
+/**
+ * \brief Connect CUDA to EGLStream as a producer.
+ *
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p stream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn   - Pointer to the returned connection handle
+ * \param stream - EGLStreamKHR handle
+ * \param width  - width of the image to be submitted to the stream
+ * \param height - height of the image to be submitted to the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerConnect
+ */
+CUresult CUDAAPI cuEGLStreamProducerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream,
+                                             EGLint width, EGLint height);
+/**
+ * \brief Disconnect CUDA as a producer to EGLStream.
+ *
+ * Disconnect CUDA as a producer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerDisconnect
+ */
+CUresult CUDAAPI cuEGLStreamProducerDisconnect(CUeglStreamConnection *conn);
+/**
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+ *
+ * When a frame is presented by the producer, it gets associated with the EGLStream
+ * and thus it is illegal to free the frame before the producer is disconnected.
+ * If a frame is freed and reused it may lead to undefined behavior.
+ *
+ * If producer and consumer are on different GPUs (iGPU and dGPU) then frametype
+ * ::CU_EGL_FRAME_TYPE_ARRAY is not supported. ::CU_EGL_FRAME_TYPE_PITCH can be used for
+ * such cross-device applications.
+ *
+ * The ::CUeglFrame is defined as:
+ * \code
+ * typedef struct CUeglFrame_st {
+ *     union {
+ *         CUarray pArray[MAX_PLANES];
+ *         void*   pPitch[MAX_PLANES];
+ *     } frame;
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int planeCount;
+ *     unsigned int numChannels;
+ *     CUeglFrameType frameType;
+ *     CUeglColorFormat eglColorFormat;
+ *     CUarray_format cuFormat;
+ * } CUeglFrame;
+ * \endcode
+ *
+ * For ::CUeglFrame of type ::CU_EGL_FRAME_TYPE_PITCH, the application may present sub-region of a memory
+ * allocation. In that case, the pitched pointer will specify the start address of the sub-region in
+ * the allocation and corresponding ::CUeglFrame fields will specify the dimensions of the sub-region.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to present the frame.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerReturnFrame,
+ * ::cudaEGLStreamProducerPresentFrame
+ */
+CUresult CUDAAPI cuEGLStreamProducerPresentFrame(CUeglStreamConnection *conn,
+                                                 CUeglFrame eglframe, CUstream *pStream);
+/**
+ * \brief Return the CUDA eglFrame to the EGLStream released by the consumer.
+ *
+ * This API can potentially return CUDA_ERROR_LAUNCH_TIMEOUT if the consumer has not
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
+ *
+ * \param conn            - Connection on which to return
+ * \param eglframe        - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to return the frame.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame
+ */
+CUresult CUDAAPI cuEGLStreamProducerReturnFrame(CUeglStreamConnection *conn,
+                                                CUeglFrame *eglframe, CUstream *pStream);
+/**
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
+ *
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
+ * \p resource may be accessed.
+ * This API can only be called for registered EGL graphics resources.
+ *
+ * The ::CUeglFrame is defined as:
+ * \code
+ * typedef struct CUeglFrame_st {
+ *     union {
+ *         CUarray pArray[MAX_PLANES];
+ *         void*   pPitch[MAX_PLANES];
+ *     } frame;
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int planeCount;
+ *     unsigned int numChannels;
+ *     CUeglFrameType frameType;
+ *     CUeglColorFormat eglColorFormat;
+ *     CUarray_format cuFormat;
+ * } CUeglFrame;
+ * \endcode
+ *
+ * If \p resource is not registered then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param eglFrame   - Returned eglFrame.
+ * \param resource   - Registered resource to access.
+ * \param index      - Index for cubemap surfaces.
+ * \param mipLevel   - Mipmap level for the subresource to access.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsResourceGetMappedEglFrame
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedEglFrame(CUeglFrame* eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
+/**
+ * \brief Creates an event from EGLSync object
+ *
+ * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
+ * via \p flags. Valid flags include:
+ * - ::CU_EVENT_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+ * synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+ * an event created with this flag will block until the event has actually
+ * been completed.
+ *
+ * Once the \p eglSync gets destroyed, ::cuEventDestroy is the only API
+ * that can be invoked on the event.
+ *
+ * ::cuEventRecord and TimingData are not supported for events created from EGLSync.
+ *
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
+ * typedef void* EGLSyncKHR
+ *
+ * \param phEvent - Returns newly created event
+ * \param eglSync - Opaque handle to EGLSync object
+ * \param flags   - Event creation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy
+ */
+CUresult CUDAAPI cuEventCreateFromEGLSync(CUevent *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+/** @} */ /* END CUDA_EGL */
+#ifdef __cplusplus
+};
+#endif
+#endif

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_egl_interop.h ADDED Viewed

	@@ -0,0 +1,642 @@

+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_EGL_INTEROP_H__)
+#define __CUDA_EGL_INTEROP_H__
+#include "cuda_runtime_api.h"
+#include "cuda_runtime.h"
+#include "cudart_platform.h"
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+/**
+ * \addtogroup CUDART_TYPES
+ * @{
+ */
+ /**
+ * Maximum number of planes per frame
+ */
+#define CUDA_EGL_MAX_PLANES 3
+/**
+ * CUDA EglFrame type - array or pointer
+ */
+typedef enum cudaEglFrameType_enum
+{
+    cudaEglFrameTypeArray = 0,  /**< Frame type: CUDA array */
+    cudaEglFrameTypePitch = 1,  /**< Frame type: CUDA pointer (pitch) */
+} cudaEglFrameType;
+/**
+ * Resource location flags - sysmem or vidmem
+ *
+ * For CUDA context on iGPU, since video and system memory are equivalent -
+ * these flags will not have an effect on the execution.
+ *
+ * For CUDA context on dGPU, applications can use the flag ::cudaEglResourceLocationFlags
+ * to give a hint about the desired location.
+ *
+ * ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory
+ * to be accessed by CUDA.
+ *
+ * ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated
+ * video memory to be accessed by CUDA.
+ *
+ * There may be an additional latency due to new allocation and data migration,
+ * if the frame is produced on a different memory.
+ */
+typedef enum cudaEglResourceLocationFlags_enum {
+    cudaEglResourceLocationSysmem   = 0x00,       /**< Frame data is made resident in system memory */
+    cudaEglResourceLocationVidmem   = 0x01,       /**< Frame data is made resident in dedicated video memory */
+} cudaEglResourceLocationFlags;
+/**
+ * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
+ */
+typedef enum cudaEglColorFormat_enum {
+    cudaEglColorFormatYUV420Planar            = 0,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar        = 1,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
+    cudaEglColorFormatYUV422Planar            = 2,  /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar        = 3,  /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
+    cudaEglColorFormatARGB                    = 6,  /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
+    cudaEglColorFormatRGBA                    = 7,  /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
+    cudaEglColorFormatL                       = 8,  /**< single luminance channel in one surface. */
+    cudaEglColorFormatR                       = 9,  /**< single color channel in one surface. */
+    cudaEglColorFormatYUV444Planar            = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV444SemiPlanar        = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
+    cudaEglColorFormatYUYV422                 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY422                 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatABGR                    = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
+    cudaEglColorFormatBGRA                    = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
+    cudaEglColorFormatA                       = 16, /**< Alpha color format - one channel in one surface. */
+    cudaEglColorFormatRG                      = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
+    cudaEglColorFormatAYUV                    = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYVU444SemiPlanar        = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar        = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar        = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatVYUY_ER                 = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatUYVY_ER                 = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatYUYV_ER                 = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatYVYU_ER                 = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatYUVA_ER                 = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatAYUV_ER                 = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYUV444Planar_ER         = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422Planar_ER         = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420Planar_ER         = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV444SemiPlanar_ER     = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar_ER     = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_ER     = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444Planar_ER         = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar_ER         = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar_ER         = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444SemiPlanar_ER     = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar_ER     = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_ER     = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerRGGB               = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
+    cudaEglColorFormatBayerBGGR               = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
+    cudaEglColorFormatBayerGRBG               = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
+    cudaEglColorFormatBayerGBRG               = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
+    cudaEglColorFormatBayer10RGGB             = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10BGGR             = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GRBG             = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GBRG             = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12RGGB             = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12BGGR             = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GRBG             = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GBRG             = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer14RGGB             = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14BGGR             = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GRBG             = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GBRG             = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer20RGGB             = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20BGGR             = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GRBG             = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GBRG             = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatYVU444Planar            = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar            = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar            = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerIspRGGB            = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspBGGR            = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGRBG            = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGBRG            = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerBCCR               = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
+    cudaEglColorFormatBayerRCCB               = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
+    cudaEglColorFormatBayerCRBC               = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
+    cudaEglColorFormatBayerCBRC               = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
+    cudaEglColorFormatBayer10CCCC             = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12BCCR             = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12RCCB             = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CRBC             = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CBRC             = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CCCC             = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatY                       = 82, /**< Color format for single Y plane. */
+    cudaEglColorFormatYUV420SemiPlanar_2020   = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_2020   = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_2020       = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_2020       = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_709    = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_709    = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_709        = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_709        = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709  = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar      = 94, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_709  = 95, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY_ER                         = 96, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY_709_ER                     = 97, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY10_ER                       = 98, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY10_709_ER                   = 99, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY12_ER                       = 100, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatY12_709_ER                   = 101, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatYUVA                         = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatYVYU                         = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatVYUY                         = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_ER     = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar_ER     = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_ER     = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_ER     = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+} cudaEglColorFormat;
+/**
+ * CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
+ *
+ * ::cudaEglFrame stores one descriptor per plane in its planeDesc array.
+ * The trailing reserved words are placeholders for future use.
+ */
+typedef struct cudaEglPlaneDesc_st {
+    unsigned int width;                         /**< Width of plane */
+    unsigned int height;                        /**< Height of plane */
+    unsigned int depth;                         /**< Depth of plane */
+    unsigned int pitch;                         /**< Pitch of plane */
+    unsigned int numChannels;                   /**< Number of channels for the plane */
+    struct cudaChannelFormatDesc channelDesc;   /**< Channel Format Descriptor */
+    unsigned int reserved[4];                   /**< Reserved for future use */
+} cudaEglPlaneDesc;
+/**
+ * CUDA EGLFrame Descriptor - structure defining one frame of EGL.
+ *
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
+ * Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
+ * \code
+ * typedef struct cudaEglPlaneDesc_st {
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int numChannels;
+ *     struct cudaChannelFormatDesc channelDesc;
+ *     unsigned int reserved[4];
+ * } cudaEglPlaneDesc;
+ * \endcode
+ *
+ * The plane storage is a union: the same memory is viewed either as CUDA
+ * arrays (pArray) or as pitched pointers (pPitch), so only one of the two
+ * members is meaningful for a given frame. NOTE(review): frameType
+ * ("Array or Pitch") presumably tells which member is valid - confirm against
+ * the ::cudaEglFrameType definition.
+ *
+ * pArray, pPitch and planeDesc all have CUDA_EGL_MAX_PLANES entries, indexed
+ * per plane.
+*/
+typedef struct cudaEglFrame_st {
+   union {
+       cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];     /**< Array of CUDA arrays corresponding to each plane*/
+       struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];     /**< Array of Pointers corresponding to each plane*/
+   } frame;
+   cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];     /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
+   unsigned int planeCount;                             /**< Number of planes */
+   cudaEglFrameType frameType;                          /**< Array or Pitch */
+   cudaEglColorFormat eglColorFormat;                   /**< CUDA EGL Color Format*/
+} cudaEglFrame;
+/**
+ * CUDA EGLStream Connection
+ *
+ * Handle type: a pointer to struct CUeglStreamConnection_st, whose layout is
+ * not declared in this section of the header.
+ */
+typedef struct  CUeglStreamConnection_st *cudaEglStreamConnection;
+/** @} */ /* END CUDART_TYPES */
+/**
+ * \addtogroup CUDART_EGL EGL Interoperability
+ * This section describes the EGL interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+/**
+ * \brief Registers an EGL image
+ *
+ * Registers the EGLImageKHR specified by \p image for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * Additional Mapping/Unmapping is not required for the registered resource and
+ * ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+ *
+ * The application will be responsible for synchronizing access to shared objects.
+ * The application must ensure that any pending operation which access the objects have completed
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
+ * The application will be also responsible for ensuring that any pending operation on the
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
+ * accessing the same memory objects.
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
+ *
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
+ * typedef void* EGLImageKHR
+ *
+ * \param pCudaResource   - Pointer to the returned object handle
+ * \param image           - An EGLImageKHR image which can be used to create target resource.
+ * \param flags           - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsResourceGetMappedEglFrame,
+ * ::cuGraphicsEGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
+/**
+ * \brief Connect CUDA to EGLStream as a consumer.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
+/**
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream with specified \p flags defined by
+ * ::cudaEglResourceLocationFlags.
+ *
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
+ * Default is ::cudaEglResourceLocationVidmem.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ * \param flags             - Flags denote intended location - system or video.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnectWithFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
+/**
+ * \brief Disconnect CUDA as a consumer to EGLStream .
+ *
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
+/**
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+ *
+ * Acquire an image frame from EGLStreamKHR.
+ * ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ * ::cudaEglFrame.
+ *
+ * \param conn            - Connection on which to acquire
+ * \param pCudaResource   - CUDA resource on which the EGLStream frame will be mapped for use.
+ * \param pStream         - CUDA stream for synchronization and any data migrations
+ * implied by ::cudaEglResourceLocationFlags.
+ * \param timeout         - Desired timeout in usec.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorLaunchTimeout
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerAcquireFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
+        cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
+/**
+ * \brief Releases the last frame acquired from the EGLStream.
+ *
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ *
+ * \param conn            - Connection on which to release
+ * \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ * \param pStream         - CUDA stream on which release will be done.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cuEGLStreamConsumerReleaseFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
+                                                  cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
+/**
+ * \brief Connect CUDA to EGLStream as a producer.
+ *
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn   - Pointer to the returned connection handle
+ * \param eglStream - EGLStreamKHR handle
+ * \param width  - width of the image to be submitted to the stream
+ * \param height - height of the image to be submitted to the stream
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
+                                                EGLStreamKHR eglStream, EGLint width, EGLint height);
+/**
+ * \brief Disconnect CUDA as a producer  to EGLStream .
+ *
+ * Disconnect CUDA as a producer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
+/**
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+ *
+ * The ::cudaEglFrame is defined as:
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ * For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory
+ * allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
+ * the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to present the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerPresentFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
+                                                 cudaEglFrame eglframe, cudaStream_t *pStream);
+/**
+ * \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
+ *
+ * This API can potentially return cudaErrorLaunchTimeout if the consumer has not
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to return the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cuEGLStreamProducerReturnFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
+                                                cudaEglFrame *eglframe, cudaStream_t *pStream);
+/**
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
+ *
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
+ * \p resource may be accessed.
+ * This API can only be called for EGL graphics resources.
+ *
+ * The ::cudaEglFrame is defined as
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t             pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr   pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ *
+ * \param eglFrame   - Returned eglFrame.
+ * \param resource   - Registered resource to access.
+ * \param index      - Index for cubemap surfaces.
+ * \param mipLevel   - Mipmap level for the subresource to access.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application.
+ *
+ * \sa
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsResourceGetMappedEglFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
+                                        cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
+/**
+ * \brief Creates an event from EGLSync object
+ *
+ * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
+ * via \p flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that the created event should use blocking
+ * synchronization.  A CPU thread that uses ::cudaEventSynchronize() to wait on
+ * an event created with this flag will block until the event has actually
+ * been completed.
+ *
+ * ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
+ *
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
+ * typedef void* EGLSyncKHR
+ *
+ * \param phEvent - Returns newly created event
+ * \param eglSync - Opaque handle to EGLSync object
+ * \param flags   - Event creation flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ *
+ * \sa
+ * ::cudaEventQuery,
+ * ::cudaEventSynchronize,
+ * ::cudaEventDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+/** @} */ /* END CUDART_EGL */
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+#endif /* __CUDA_EGL_INTEROP_H__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.hpp ADDED Viewed

	@@ -0,0 +1,1546 @@

+/*
+ * Copyright 2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_FP8_HPP__)
+#define __CUDA_FP8_HPP__
+#if !defined(__CUDA_FP8_H__)
+#error "Do not include this file directly. Instead, include cuda_fp8.h."
+#endif
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+/* Set up structure-alignment attribute */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !defined(__CPP_VERSION_AT_LEAST_11_FP8)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP8)
+/* need c++11 for explicit operators */
+#define __CUDA_NO_FP8_CONVERSION_OPERATORS__
+#endif
+/*
+ * Converts a double to an 8-bit fp8 encoding with round-to-nearest-even,
+ * using only integer operations on the raw 64-bit pattern of x.
+ *
+ * x                  - value to convert
+ * saturate           - __NV_SATFINITE clamps overflow to the largest finite
+ *                      magnitude (FP8_MAXNORM); any other value is treated as
+ *                      __NV_NOSAT and yields Inf (E5M2) or NaN (E4M3, which
+ *                      has no Inf encoding)
+ * fp8_interpretation - __NV_E4M3 selects the E4M3 layout; any other value is
+ *                      treated as E5M2
+ *
+ * Returns the fp8 bit pattern with the sign of x in bit 7. Magnitudes at or
+ * below half of the smallest denormal become a signed zero; NaN inputs map
+ * to 0x7F (E4M3) or to 0x7E combined with the truncated mantissa bits (E5M2).
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
+                       const __nv_fp8_interpretation_t fp8_interpretation) {
+    unsigned char res;
+    unsigned long long int xbits;
+    // reinterpret the double as its raw 64-bit pattern (type punning via
+    // memcpy; std:: qualified only for plain host C++ builds)
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    // Per-format parameters. The *_O2 / THRESHOLD / MINNORM values are double
+    // bit patterns, compared directly against the sign-stripped input bits.
+    // FP8_SIGNIFICAND_BITS counts the implicit leading bit, so the stored
+    // mantissa field (FP8_MANTISSA_MASK) is one bit narrower.
+    unsigned char FP8_MAXNORM;
+    unsigned char FP8_MANTISSA_MASK;
+    unsigned short int FP8_EXP_BIAS;
+    unsigned long long int FP8_SIGNIFICAND_BITS;
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
+    unsigned long long int FP8_MINDENORM_O2;
+    unsigned long long int FP8_OVERFLOW_THRESHOLD;
+    unsigned long long int FP8_MINNORM;
+    if (fp8_interpretation == __NV_E4M3) {
+        FP8_EXP_BIAS = 7U;
+        FP8_SIGNIFICAND_BITS = 4ULL;
+        FP8_MANTISSA_MASK = 0x7U;
+        FP8_MINDENORM_O2 = 0x3F50000000000000ULL; // mindenorm/2 = 2^-10
+        FP8_OVERFLOW_THRESHOLD =
+            0x407D000000000000ULL; // maxnorm + 1/2ulp = 0x1.Cp+8 + 0x1p+4
+        FP8_MAXNORM = 0x7EU;
+        FP8_MINNORM = 0x3F90000000000000ULL; // minnorm = 2^-6
+    } else {                                 //__NV_E5M2
+        FP8_EXP_BIAS = 15U;
+        FP8_SIGNIFICAND_BITS = 3ULL;
+        FP8_MANTISSA_MASK = 0x3U;
+        FP8_MINDENORM_O2 = 0x3EE0000000000000ULL; // mindenorm/2 = 2^-17
+        FP8_OVERFLOW_THRESHOLD =
+            0x40EE000000000000ULL -
+            1ULL; // maxnorm + 1/2ulp = 0x1.Ep+15, and -1 to have common code
+        FP8_MAXNORM = 0x7BU;
+        FP8_MINNORM = 0x3F10000000000000ULL; // minnorm = 2^-14
+    }
+    // 1/2 LSB of the target format, positioned in double precision mantissa
+    // helpful in midpoints detection during round-to-nearest-even step
+    const unsigned long long int FP8_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP8_SIGNIFICAND_BITS - 1ULL);
+    // prepare sign bit in target format
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 7U);
+    // prepare exponent field in target format
+    // (re-biased from the double bias 1023; the cast keeps only the low
+    // 8 bits, so out-of-range exponents wrap - the result is only consumed
+    // on the normal and denormal paths below)
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP8_EXP_BIAS);
+    // round mantissa to target format width, rounding towards zero
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP8_SIGNIFICAND_BITS)) &
+        FP8_MANTISSA_MASK;
+    // magnitude bits (sign cleared) used for all range classification below
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL;
+    if (absx <= FP8_MINDENORM_O2) {
+        // zero or underflow
+        res = 0U;
+    } else if (absx > DP_INF_BITS) {
+        // NaN
+        if (fp8_interpretation == __NV_E4M3) {
+            res = 0x7FU;
+        } else {
+            // NaN --> QNaN
+            res = 0x7EU | mantissa;
+        }
+    } else if (absx > FP8_OVERFLOW_THRESHOLD) {
+        // overflow (this branch also receives +/-Inf inputs, since
+        // DP_INF_BITS itself is not "> DP_INF_BITS")
+        if (saturate == __NV_SATFINITE) {
+            res = FP8_MAXNORM;
+        } else {
+            // __NV_NOSAT
+            if (fp8_interpretation == __NV_E4M3) {
+                // no Inf in E4M3
+                res = 0x7FU; // NaN
+            } else {
+                res = 0x7CU; // Inf in E5M2
+            }
+        }
+    } else if (absx >= FP8_MINNORM) {
+        // normal range: pack exponent above the stored mantissa bits
+        res = (unsigned char)((exp << (FP8_SIGNIFICAND_BITS - 1U)) | mantissa);
+        // rounded-off bits
+        unsigned long long int round =
+            xbits & ((FP8_DP_HALF_ULP << 1ULL) - 1ULL);
+        // round-to-nearest-even adjustment
+        // (incrementing the packed value lets a mantissa carry ripple into
+        // the exponent field)
+        if ((round > FP8_DP_HALF_ULP) ||
+            ((round == FP8_DP_HALF_ULP) && (mantissa & 1U))) {
+            res = (unsigned char)(res + 1U);
+        }
+    } else // Denormal range
+    {
+        // number of extra bit positions the significand must be shifted right
+        // to be expressed with the minimum exponent
+        unsigned char shift = (unsigned char)(1U - exp);
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP8_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+        // rounded-off bits, including implicit leading bit
+        unsigned long long int round =
+            (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+            ((FP8_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+        // round-to-nearest-even adjustment
+        // (half-ulp is scaled by the same shift; ties are broken on the
+        // already-shifted result's low bit)
+        if ((round > (FP8_DP_HALF_ULP << shift)) ||
+            ((round == (FP8_DP_HALF_ULP << shift)) && (res & 1U))) {
+            res = (unsigned char)(res + 1U);
+        }
+    }
+    // the sign is applied to every result, including zero and NaN encodings
+    res |= sign;
+    return (__nv_fp8_storage_t)res;
+}
+/*
+ * Converts both components of a double2 to fp8 and packs them into one
+ * two-element storage value: the conversion of x.y occupies bits 15..8 and
+ * the conversion of x.x occupies bits 7..0.
+ *
+ * x                  - pair of values to convert
+ * saturate           - forwarded unchanged to __nv_cvt_double_to_fp8
+ * fp8_interpretation - forwarded unchanged to __nv_cvt_double_to_fp8
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
+                          const __nv_fp8_interpretation_t fp8_interpretation) {
+    // high half: the y component, moved up by one byte
+    const __nv_fp8x2_storage_t upper = (__nv_fp8x2_storage_t)(
+        (__nv_fp8x2_storage_t)__nv_cvt_double_to_fp8(x.y, saturate,
+                                                     fp8_interpretation)
+        << 8U);
+    // low half: the x component
+    const __nv_fp8x2_storage_t lower =
+        (__nv_fp8x2_storage_t)__nv_cvt_double_to_fp8(x.x, saturate,
+                                                     fp8_interpretation);
+    return (__nv_fp8x2_storage_t)(upper | lower);
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
+                      const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Converts a single float into fp8 storage of the requested kind.
+    // When compiled for __CUDA_ARCH__ >= 900 and saturation is
+    // __NV_SATFINITE, a packed cvt instruction is used; every other case
+    // goes through the double-precision conversion routine.
+    __nv_fp8_storage_t out = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        // The instruction converts a pair of floats; x is paired with a
+        // 0.0f filler and the result is narrowed to the scalar storage type.
+        __nv_fp8x2_storage_t pair;
+        if (fp8_interpretation != __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(pair)
+                : "f"(x), "f"(0.0f));
+        } else {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(pair)
+                : "f"(x), "f"(0.0f));
+        }
+        out = (__nv_fp8_storage_t)pair;
+    } else
+#endif
+    {
+        // Grab the raw bit pattern of the input.
+        unsigned int bits;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&bits, &x, sizeof(x));
+#else
+        (void)std::memcpy(&bits, &x, sizeof(x));
+#endif
+        // Any NaN (magnitude bits above the Inf encoding) is replaced by
+        // a single canonical NaN pattern before widening.
+        const unsigned int magnitude = bits & 0x7FFFFFFFU;
+        if (magnitude > 0x7F800000U) {
+            bits = 0x7FFFFFFFU;
+        }
+        float canon;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&canon, &bits, sizeof(bits));
+#else
+        (void)std::memcpy(&canon, &bits, sizeof(bits));
+#endif
+        // Widen to double and reuse the double -> fp8 conversion.
+        out = __nv_cvt_double_to_fp8((double)canon, saturate,
+                                     fp8_interpretation);
+    }
+    return out;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
+                         const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Converts the two floats of x into a packed fp8x2 value. A single
+    // packed cvt instruction is used for __NV_SATFINITE when compiled for
+    // __CUDA_ARCH__ >= 900; otherwise each component is converted with the
+    // scalar routine and the two results are combined by shift and OR.
+    __nv_fp8x2_storage_t packed;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        if (fp8_interpretation != __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(packed)
+                : "f"(x.x), "f"(x.y));
+        } else {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(packed)
+                : "f"(x.x), "f"(x.y));
+        }
+    } else
+#endif
+    {
+        // x.y is shifted left by 8 bits, x.x fills the low bits.
+        const __nv_fp8_storage_t hi =
+            __nv_cvt_float_to_fp8(x.y, saturate, fp8_interpretation);
+        const __nv_fp8_storage_t lo =
+            __nv_cvt_float_to_fp8(x.x, saturate, fp8_interpretation);
+        packed = (__nv_fp8x2_storage_t)((__nv_fp8x2_storage_t)hi << 8U);
+        packed = (__nv_fp8x2_storage_t)(packed | lo);
+    }
+    return packed;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_halfraw_to_float(const __half_raw x) {
+    // Widens the 16-bit half pattern stored in x.x to a float. Uses the
+    // cvt.f32.f16 instruction when compiled for __CUDA_ARCH__ >= 530,
+    // and a bit-level expansion otherwise.
+    float result;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(result) : "h"(x.x));
+#else
+    const unsigned int hbits = (unsigned int)x.x;
+    // Split the half into sign, 5-bit exponent and 10-bit fraction; the
+    // fraction is pre-shifted into the position of a 23-bit float fraction.
+    unsigned int sgn = (hbits >> 15U) & 1U;
+    unsigned int expo = (hbits >> 10U) & 0x1fU;
+    unsigned int frac = (hbits & 0x3ffU) << 13U;
+    if (expo == 0x1fU) {
+        /* NaN or Inf */
+        if (frac != 0U) {
+            /* NaN: sign is discarded and the fraction is set to all ones */
+            sgn = 0U;
+            frac = 0x7fffffU;
+        }
+        expo = 0xffU;
+    } else if (expo != 0U) {
+        /* Normal number: rebias the exponent */
+        expo += 0x70U;
+    } else if (frac != 0U) {
+        /* Denormal: shift left until the leading one leaves bit 22 */
+        expo = 0x71U;
+        for (;;) {
+            const unsigned int top = (frac & 0x400000U);
+            frac <<= 1U;
+            --expo;
+            if (top != 0U) {
+                break;
+            }
+        }
+        /* the leading one is implicit in the float encoding */
+        frac &= 0x7fffffU;
+    }
+    /* Zero falls through with exponent and fraction both zero */
+    const unsigned int fbits = ((sgn << 31U) | (expo << 23U) | frac);
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&result, &fbits, sizeof(fbits));
+#else
+    (void)std::memcpy(&result, &fbits, sizeof(fbits));
+#endif
+#endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */
+    return result;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ float2
+__internal_halfraw2_to_float2(const __half2_raw x) {
+    // Widens both half patterns of x to float, component by component.
+    __half_raw first;
+    __half_raw second;
+    first.x = x.x;
+    second.x = x.y;
+    float2 out;
+    out.x = __internal_halfraw_to_float(first);
+    out.y = __internal_halfraw_to_float(second);
+    return out;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Converts one raw half value to fp8 storage. For __NV_SATFINITE on
+    // __CUDA_ARCH__ >= 900 a packed f16x2 cvt instruction is used;
+    // otherwise the half is widened to float and the float routine is used.
+    __nv_fp8_storage_t out = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        // Zero-extend the half into a 32-bit register for the x2
+        // instruction; the result is narrowed to the scalar storage type.
+        const unsigned int widened = (unsigned int)(x.x);
+        __nv_fp8x2_storage_t pair;
+        if (fp8_interpretation != __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(pair)
+                : "r"(widened));
+        } else {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(pair)
+                : "r"(widened));
+        }
+        out = (__nv_fp8_storage_t)pair;
+    } else
+#endif
+    {
+        const float widened_f = __internal_halfraw_to_float(x);
+        out = __nv_cvt_float_to_fp8(widened_f, saturate, fp8_interpretation);
+    }
+    return out;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
+    const __half2_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Converts a pair of raw half values into a packed fp8x2 value. For
+    // __NV_SATFINITE on __CUDA_ARCH__ >= 900 one packed cvt instruction
+    // handles both halves; otherwise each half goes through the scalar
+    // conversion and the results are combined by shift and OR.
+    __nv_fp8x2_storage_t packed;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        // Reinterpret the two halves as one 32-bit register operand.
+        unsigned int word;
+        (void)memcpy(&word, &x, sizeof(x));
+        if (fp8_interpretation != __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(packed)
+                : "r"(word));
+        } else {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(packed)
+                : "r"(word));
+        }
+    } else
+#endif
+    {
+        __half_raw lo_in;
+        lo_in.x = x.x;
+        const __nv_fp8_storage_t lo =
+            __nv_cvt_halfraw_to_fp8(lo_in, saturate, fp8_interpretation);
+        __half_raw hi_in;
+        hi_in.x = x.y;
+        const __nv_fp8_storage_t hi =
+            __nv_cvt_halfraw_to_fp8(hi_in, saturate, fp8_interpretation);
+        // x.y result is shifted left by 8 bits, x.x result fills the low bits
+        packed = (__nv_fp8x2_storage_t)((__nv_fp8x2_storage_t)hi << 8U);
+        packed = (__nv_fp8x2_storage_t)(packed | lo);
+    }
+    return packed;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_bf16raw_to_float(const __nv_bfloat16_raw x) {
+    // Places the 16 stored bits of x.x in the upper half of a 32-bit
+    // word (low 16 bits zero) and reinterprets that word as a float.
+    float out;
+    const unsigned int widened = ((unsigned int)x.x) << 16U;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&out, &widened, sizeof(widened));
+#else
+    (void)std::memcpy(&out, &widened, sizeof(widened));
+#endif
+    return out;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
+__internal_float_to_bf16raw_rz(const float x) {
+    // Keeps the upper 16 bits of the float's bit pattern and drops the
+    // lower 16 bits (truncation, no rounding adjustment).
+    unsigned int bits;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&bits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&bits, &x, sizeof(x));
+#endif
+    __nv_bfloat16_raw out;
+    out.x = (unsigned short int)(bits >> 16U);
+    return out;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
+    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Widen the raw bfloat16 to float, then reuse the float -> fp8 routine.
+    return __nv_cvt_float_to_fp8(__internal_bf16raw_to_float(x), saturate,
+                                 fp8_interpretation);
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp8x2(
+    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Converts each raw bfloat16 component with the scalar routine and
+    // packs the results: x.y is shifted left by 8 bits, x.x fills the
+    // low bits.
+    __nv_bfloat16_raw hi_in;
+    hi_in.x = x.y;
+    const __nv_fp8_storage_t hi =
+        __nv_cvt_bfloat16raw_to_fp8(hi_in, saturate, fp8_interpretation);
+    __nv_bfloat16_raw lo_in;
+    lo_in.x = x.x;
+    const __nv_fp8_storage_t lo =
+        __nv_cvt_bfloat16raw_to_fp8(lo_in, saturate, fp8_interpretation);
+    const __nv_fp8x2_storage_t hi_shifted =
+        (__nv_fp8x2_storage_t)((__nv_fp8x2_storage_t)hi << 8U);
+    return (__nv_fp8x2_storage_t)(hi_shifted | lo);
+}
+/* Forward declaration: the scalar conversion below calls the packed one
+ * on the __CUDA_ARCH__ >= 900 path. */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation);
+__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
+__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Converts one fp8 value to a raw half. On __CUDA_ARCH__ >= 900 the
+    // packed conversion is reused and its .x component returned; otherwise
+    // the half bit pattern is assembled with integer operations.
+    __half_raw out;
+    out.x = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    out.x =
+        __nv_cvt_fp8x2_to_halfraw2((__nv_fp8x2_storage_t)x, fp8_interpretation)
+            .x;
+#else
+    // Move the 8 input bits into the upper byte of a 16-bit pattern.
+    unsigned short int hbits = (unsigned short int)x;
+    hbits = (unsigned short int)(hbits << 8U);
+    if (fp8_interpretation == __NV_E5M2) {
+        // The shifted pattern is used as the half result directly; only
+        // NaNs (magnitude above 0x7C00) are replaced by the canonical NaN.
+        if ((hbits & 0x7FFFU) > 0x7C00U) {
+            hbits = 0x7FFFU;
+        }
+    } else { // __NV_E4M3
+        const unsigned short int sgn = hbits & 0x8000U;
+        // Realign the 4-bit exponent and 3-bit mantissa to the half
+        // layout and add 0x2000 to rebias the exponent.
+        unsigned short int expo =
+            (unsigned short int)(((hbits & 0x7800U) >> 1U) + 0x2000U);
+        unsigned short int frac = (hbits & 0x0700U) >> 1U;
+        const unsigned char magnitude = 0x7FU & (unsigned char)x;
+        if (magnitude == 0x7FU) {
+            // NaN: fp16 canonical NaN, sign discarded
+            hbits = 0x7FFFU;
+        } else {
+            if (expo == 0x2000U) {
+                // Input exponent field was zero: zero or denormal
+                if (frac == 0U) {
+                    // Zero
+                    expo = 0U;
+                } else {
+                    // Denormal: shift until the leading one reaches the
+                    // implicit-bit position, lowering the exponent each step
+                    frac = (unsigned short int)(frac << 1U);
+                    while ((frac & 0x0400U) == 0U) {
+                        frac = (unsigned short int)(frac << 1U);
+                        expo = (unsigned short int)(expo - 0x0400U);
+                    }
+                    // discard implicit leading bit
+                    frac &= 0x03FFU;
+                }
+            }
+            hbits = (sgn | expo) | frac;
+        }
+    }
+    out.x = hbits;
+#endif
+    return out;
+}
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Converts a packed fp8x2 value to a pair of raw halves. One packed cvt
+    // instruction is used on __CUDA_ARCH__ >= 900; otherwise each element
+    // is converted with the scalar routine.
+    __half2_raw out;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int word;
+    if (fp8_interpretation != __NV_E5M2) {
+        asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(word) : "h"(x));
+    } else {
+        asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(word) : "h"(x));
+    }
+    // Reinterpret the 32-bit register as the two-half structure.
+    (void)memcpy(&out, &word, sizeof(word));
+#else
+    // x narrowed to the scalar storage type feeds .x; x shifted right by
+    // 8 bits feeds .y.
+    const __nv_fp8_storage_t lo = (__nv_fp8_storage_t)x;
+    const __nv_fp8_storage_t hi = (__nv_fp8_storage_t)(x >> 8U);
+    out.x = __nv_cvt_fp8_to_halfraw(lo, fp8_interpretation).x;
+    out.y = __nv_cvt_fp8_to_halfraw(hi, fp8_interpretation).x;
+#endif
+    return out;
+}
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+/**
+ * \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+/**
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+ * \brief __nv_fp8_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp8 floating-point numbers of \p e5m2 kind:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Raw storage holding the encoded \p fp8 value.
+     */
+    __nv_fp8_storage_t __x;
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Default constructor; it does not assign the storage.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+    /* Constructors from wider floating-point types. */
+    /* Members are assigned in the body rather than in an init-list because
+     * of special host/device compilation rules. */
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from a \p __half input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) {
+        const __half_raw raw = static_cast<__half_raw>(f);
+        __x = __nv_cvt_halfraw_to_fp8(raw, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from a \p __nv_bfloat16 input; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) {
+        const __nv_bfloat16_raw raw = static_cast<__nv_bfloat16_raw>(f);
+        __x = __nv_cvt_bfloat16raw_to_fp8(raw, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from a \p float input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from a \p double input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+    /* Constructors from integral types: the integer is first converted to
+     * float and then takes the same path as the float constructor. */
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from an \p unsigned \p short \p int input;
+     * out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned short int val) {
+        __x = __nv_cvt_float_to_fp8(static_cast<float>(val), __NV_SATFINITE,
+                                    __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from an \p unsigned \p int input; out-of-range
+     * inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) {
+        __x = __nv_cvt_float_to_fp8(static_cast<float>(val), __NV_SATFINITE,
+                                    __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from an \p unsigned \p long \p long \p int input;
+     * out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned long long int val) {
+        __x = __nv_cvt_float_to_fp8(static_cast<float>(val), __NV_SATFINITE,
+                                    __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from a \p short \p int input.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) {
+        __x = __nv_cvt_float_to_fp8(static_cast<float>(val), __NV_SATFINITE,
+                                    __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from an \p int input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) {
+        __x = __nv_cvt_float_to_fp8(static_cast<float>(val), __NV_SATFINITE,
+                                    __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Builds the value from a \p long \p long \p int input; out-of-range
+     * inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) {
+        __x = __nv_cvt_float_to_fp8(static_cast<float>(val), __NV_SATFINITE,
+                                    __NV_E5M2);
+    }
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening floating-point conversions */
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p __half.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        const __half_raw raw = __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2);
+        return static_cast<__half>(raw);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p float (via a raw half intermediate).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        const __half_raw raw = __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2);
+        return __internal_halfraw_to_float(raw);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p __nv_bfloat16 (via \p float).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        const float as_float = static_cast<float>(*this);
+        return static_cast<__nv_bfloat16>(
+            __internal_float_to_bf16raw_rz(as_float));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p double (via \p float).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        const float as_float = static_cast<float>(*this);
+        return static_cast<double>(as_float);
+    }
+    /* Conversions to integral types */
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p unsigned \p char.
+     * Negative and too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        const unsigned char upper = 0xFFU;
+        const unsigned char lower = 0U;
+        // NaN encodings have magnitude bits above the Inf pattern 0x7C
+        if ((__x & 0x7FU) > 0x7CU) {
+            return static_cast<unsigned char>(0);
+        }
+        const float value = static_cast<float>(*this);
+        if (value > static_cast<float>(upper)) {
+            // saturate maximum
+            return upper;
+        }
+        if (value < static_cast<float>(lower)) {
+            // saturate minimum
+            return lower;
+        }
+        // in-range value
+        return static_cast<unsigned char>(value);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p unsigned \p short \p int.
+     * Negative and too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p unsigned \p int.
+     * Negative and too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p unsigned \p long \p long \p int.
+     * Negative and too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __half2ull_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p signed \p char.
+     * Too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        const signed char upper = (signed char)0x7FU;
+        const signed char lower = (signed char)0x80U;
+        // NaN encodings have magnitude bits above the Inf pattern 0x7C
+        if ((__x & 0x7FU) > 0x7CU) {
+            return static_cast<signed char>(0);
+        }
+        const float value = static_cast<float>(*this);
+        if (value > static_cast<float>(upper)) {
+            // saturate maximum
+            return upper;
+        }
+        if (value < static_cast<float>(lower)) {
+            // saturate minimum
+            return lower;
+        }
+        // in-range value
+        return static_cast<signed char>(value);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p short \p int.
+     * Too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __half2short_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p int.
+     * Too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __half2int_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p long \p long \p int.
+     * Too large inputs are clamped to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __half2ll_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Converts the stored value to \p bool.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        // Ignore the sign bit so that both zeros test as false.
+        const unsigned int magnitude = (__x & 0x7FU);
+        return magnitude != 0U;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+/**
+ * \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+/**
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+ * \brief __nv_fp8x2_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Raw storage holding the packed pair of encoded \p fp8 values.
+     */
+    __nv_fp8x2_storage_t __x;
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Default constructor; it does not assign the storage.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+    /* Constructors from wider vector types */
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Builds the pair from a \p __half2 input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) {
+        const __half2_raw raw = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp8x2(raw, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Builds the pair from a \p __nv_bfloat162 input; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(raw, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Builds the pair from a \p float2 input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) {
+        __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Builds the pair from a \p double2 input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) {
+        __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+    }
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversions */
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Converts the stored pair to \p __half2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        const __half2_raw raw = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2);
+        return static_cast<__half2>(raw);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Converts the stored pair to \p float2 (via a raw half2 intermediate).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        const __half2_raw raw = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2);
+        return __internal_halfraw2_to_float2(raw);
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int
+__internal_pack_u16x2_to_u32(const unsigned short int src_lo,
+                             const unsigned short int src_hi) {
+    // Combines two 16-bit values into one 32-bit word: src_lo in bits
+    // 0..15, src_hi in bits 16..31. Device code uses a mov.b32 vector
+    // pack, other builds use shift and OR.
+#if (defined __CUDACC__) && (defined __CUDA_ARCH__)
+    unsigned int packed;
+    asm("{  mov.b32 %0, {%1,%2};}\n" : "=r"(packed) : "h"(src_lo), "h"(src_hi));
+    return packed;
+#else
+    const unsigned int upper = static_cast<unsigned int>(src_hi) << 16U;
+    return upper | static_cast<unsigned int>(src_lo);
+#endif
+}
+/**
+ * \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+/**
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+ * \brief __nv_fp8x4_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Raw storage holding the packed four encoded \p fp8 values.
+     */
+    __nv_fp8x4_storage_t __x;
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Default constructor; it does not assign the storage.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+    /* Constructors from wider vector types. Each one converts a low and a
+     * high pair separately and packs the two 16-bit results. */
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Builds the four values from two \p __half2 inputs (low pair, high
+     * pair); out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __half2_raw raw_lo = static_cast<__half2_raw>(flo);
+        const __half2_raw raw_hi = static_cast<__half2_raw>(fhi);
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_halfraw2_to_fp8x2(raw_lo, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_halfraw2_to_fp8x2(raw_hi, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Builds the four values from two \p __nv_bfloat162 inputs (low pair,
+     * high pair); out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_bfloat162_raw raw_lo = static_cast<__nv_bfloat162_raw>(flo);
+        const __nv_bfloat162_raw raw_hi = static_cast<__nv_bfloat162_raw>(fhi);
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_bfloat16raw2_to_fp8x2(raw_lo, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_bfloat16raw2_to_fp8x2(raw_hi, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Builds the four values from a \p float4 input: (x, y) form the low
+     * pair and (z, w) the high pair; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) {
+        const float2 xy = {f.x, f.y};
+        const float2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_float2_to_fp8x2(xy, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_float2_to_fp8x2(zw, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Builds the four values from a \p double4 input: (x, y) form the low
+     * pair and (z, w) the high pair; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) {
+        const double2 xy = {f.x, f.y};
+        const double2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t pair_lo =
+            __nv_cvt_double2_to_fp8x2(xy, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t pair_hi =
+            __nv_cvt_double2_to_fp8x2(zw, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(pair_lo, pair_hi);
+    }
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversions */
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Converts the stored values to a \p float4 vector: the low 16 bits of
+     * the storage yield (x, y), the storage shifted right by 16 yields
+     * (z, w).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        const __nv_fp8x2_storage_t pair_lo =
+            static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t pair_hi =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+        const __half2_raw half_lo =
+            __nv_cvt_fp8x2_to_halfraw2(pair_lo, __NV_E5M2);
+        const __half2_raw half_hi =
+            __nv_cvt_fp8x2_to_halfraw2(pair_hi, __NV_E5M2);
+        const float2 xy = __internal_halfraw2_to_float2(half_lo);
+        const float2 zw = __internal_halfraw2_to_float2(half_hi);
+        float4 out = {xy.x, xy.y, zw.x, zw.y};
+        return out;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+/**
+ * \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+/**
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+ * \brief __nv_fp8_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storing
+ * \p fp8 floating-point numbers of \p e4m3 kind:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Storage variable contains the \p fp8 floating-point data.
+     * Every converting constructor below assigns it in the constructor body.
+     */
+    __nv_fp8_storage_t __x;
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor by default.
+     * Defaulted when \p __CPP_VERSION_AT_LEAST_11_FP8 is defined, otherwise an
+     * empty user-provided body; neither form assigns \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     * The input is cast to \p __half_raw and converted with
+     * \p __nv_cvt_halfraw_to_fp8.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
+                                      __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     * The input is cast to \p __nv_bfloat16_raw and converted with
+     * \p __nv_cvt_bfloat16raw_to_fp8.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     * All integral constructors below funnel through this one.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /* Converts from integral */
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * The value is cast to \p float first and then converted by the \p float
+     * constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * The value is cast to \p float first and then converted by the \p float
+     * constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     * The value is cast to \p float first and then converted by the \p float
+     * constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * The value is cast to \p float first and then converted by the \p float
+     * constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     * The value is cast to \p float first and then converted by the \p float
+     * constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     * The value is cast to \p float first and then converted by the \p float
+     * constructor.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p __half data type.
+     * Wraps the raw half returned by \p __nv_cvt_fp8_to_halfraw.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p float data type.
+     * Goes through a raw half: \p __nv_cvt_fp8_to_halfraw followed by
+     * \p __internal_halfraw_to_float.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p __nv_bfloat16 data type.
+     * Converts to \p float first, then passes the result through
+     * \p __internal_float_to_bf16raw_rz.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return static_cast<__nv_bfloat16>(
+            __internal_float_to_bf16raw_rz(float(*this)));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p double data type.
+     * Widens the result of the \p float conversion operator.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+    /* Convert to integral */
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     * The comparison is done on the \p float value; the NaN test is done on
+     * the raw encoding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char min_val = 0U;
+        const unsigned char bits = (*this).__x;  // raw encoding, used only for the NaN test
+        // saturation fixup
+        if ((bits & 0x7FU) == 0x7FU) {  // sign bit masked off: matches both 0x7F and 0xFF
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     * Delegates to \p __half2ushort_rz on the \p __half conversion of this
+     * value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     * Delegates to \p __half2uint_rz on the \p __half conversion of this
+     * value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     * Delegates to \p __half2ull_rz on the \p __half conversion of this
+     * value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     * The comparison is done on the \p float value; the NaN test is done on
+     * the raw encoding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        signed char i;
+        const float f = float(*this);
+        const signed char max_val = (signed char)0x7FU;
+        const signed char min_val = (signed char)0x80U;  // presumably -128 on two's-complement targets
+        const unsigned char bits = (*this).__x;  // raw encoding, used only for the NaN test
+        // saturation fixup
+        if ((bits & 0x7FU) == 0x7FU) {  // sign bit masked off: matches both 0x7F and 0xFF
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<signed char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p short \p int data type.
+     * \p NaN inputs convert to \p zero.
+     * Delegates to \p __half2short_rz on the \p __half conversion of this
+     * value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p int data type.
+     * \p NaN inputs convert to \p zero.
+     * Delegates to \p __half2int_rz on the \p __half conversion of this
+     * value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p long \p long \p int data type.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     * Delegates to \p __half2ll_rz on the \p __half conversion of this
+     * value.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     * Only the low 7 bits of the encoding are tested, so the sign bit is
+     * ignored and the NaN encodings (0x7F, 0xFF) convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        return (__x & 0x7FU) != 0U;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+/**
+ * \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+/**
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+ * \brief __nv_fp8x2_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of two \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Packed storage holding the two \p fp8 values of the vector.
+     */
+    __nv_fp8x2_storage_t __x;
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Default constructor; it does not assign the storage.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+    /* Narrowing constructors: every one of them converts with __NV_SATFINITE
+     * and the __NV_E4M3 interpretation. */
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Builds the vector from a \p __half2 value; out-of-range inputs follow
+     * the \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) {
+        const __half2_raw raw_pair = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp8x2(raw_pair, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Builds the vector from a \p __nv_bfloat162 value; out-of-range inputs
+     * follow the \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(raw_pair, __NV_SATFINITE,
+                                             __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Builds the vector from a \p float2 value; out-of-range inputs follow
+     * the \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) {
+        __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Builds the vector from a \p double2 value; out-of-range inputs follow
+     * the \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) {
+        __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Widens both values to half precision and returns them as \p __half2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        const __half2_raw raw_pair = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3);
+        return static_cast<__half2>(raw_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Widens both values to \p float, going through a raw half pair, and
+     * returns them as \p float2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        const __half2_raw raw_pair = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3);
+        return __internal_halfraw2_to_float2(raw_pair);
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+/**
+ * \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+/**
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+ * \brief __nv_fp8x4_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of four \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Packed storage holding the four \p fp8 values of the vector.
+     */
+    __nv_fp8x4_storage_t __x;
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Default constructor; it does not assign the storage.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+    /* Narrowing constructors: each one converts two pairs separately and packs
+     * the two 16-bit results with __internal_pack_u16x2_to_u32. */
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Builds the vector from two \p __half2 values (\p flo first, \p fhi
+     * second); out-of-range inputs follow the \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp8x2_storage_t packed_lo = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t packed_hi = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Builds the vector from two \p __nv_bfloat162 values (\p flo first,
+     * \p fhi second); out-of-range inputs follow the \p __NV_SATFINITE
+     * behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp8x2_storage_t packed_lo = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t packed_hi = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Builds the vector from a \p float4 value; out-of-range inputs follow
+     * the \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) {
+        // Split into (x, y) and (z, w) so the two-wide converter can be used.
+        float2 xy;
+        xy.x = f.x;
+        xy.y = f.y;
+        float2 zw;
+        zw.x = f.z;
+        zw.y = f.w;
+        const __nv_fp8x2_storage_t packed_lo =
+            __nv_cvt_float2_to_fp8x2(xy, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t packed_hi =
+            __nv_cvt_float2_to_fp8x2(zw, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Builds the vector from a \p double4 value; out-of-range inputs follow
+     * the \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) {
+        // Split into (x, y) and (z, w) so the two-wide converter can be used.
+        double2 xy;
+        xy.x = f.x;
+        xy.y = f.y;
+        double2 zw;
+        zw.x = f.z;
+        zw.y = f.w;
+        const __nv_fp8x2_storage_t packed_lo =
+            __nv_cvt_double2_to_fp8x2(xy, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t packed_hi =
+            __nv_cvt_double2_to_fp8x2(zw, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(packed_lo, packed_hi);
+    }
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Widens all four values to \p float and returns them as \p float4.
+     * The low 16 bits of the storage supply \p x and \p y, the high 16 bits
+     * supply \p z and \p w.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        const __nv_fp8x2_storage_t lower_pair =
+            static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t upper_pair =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+        const float2 xy = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(lower_pair, __NV_E4M3));
+        const float2 zw = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(upper_pair, __NV_E4M3));
+        float4 widened;
+        widened.x = xy.x;
+        widened.y = xy.y;
+        widened.z = zw.x;
+        widened.w = zw.y;
+        return widened;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+#endif /* defined(__cplusplus) */
+#endif /* end of include guard: __CUDA_FP8_HPP__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_helpers.h ADDED Viewed

	@@ -0,0 +1,373 @@

+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#ifndef _CUDA_PIPELINE_HELPERS_H_
+# define _CUDA_PIPELINE_HELPERS_H_
+# define _CUDA_PIPELINE_NAMESPACE       nvcuda::experimental /* qualified name of the enclosing namespace */
+# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental { /* opens both namespaces */
+# define _CUDA_PIPELINE_END_NAMESPACE   } } /* closes both namespaces */
+# define _CUDA_PIPELINE_INTERNAL_NAMESPACE       _CUDA_PIPELINE_NAMESPACE::__pipeline_internal /* qualified name of the helper namespace */
+# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal { /* opens three namespaces */
+# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE   } _CUDA_PIPELINE_END_NAMESPACE /* closes three namespaces */
+# if !defined(_CUDA_PIPELINE_QUALIFIER) /* an earlier definition by the includer wins */
+#  define _CUDA_PIPELINE_QUALIFIER inline __device__
+# endif /* !defined(_CUDA_PIPELINE_QUALIFIER) */
+# if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER) /* an earlier definition by the includer wins */
+#  define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
+# endif /* !defined(_CUDA_PIPELINE_STATIC_QUALIFIER) */
+# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) /* also taken whenever __CUDA_ARCH__ is not defined at all */
+#  define _CUDA_PIPELINE_ARCH_700_OR_LATER
+# endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+# if (__CUDA_ARCH__ >= 800) /* an undefined __CUDA_ARCH__ evaluates as 0 inside #if, selecting the #else branch */
+#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 1 /* selects ImplementationChooser<true>, the cp.async PTX path */
+# else
+#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 0 /* selects ImplementationChooser<false>, the synchronous fallback */
+# endif /* (__CUDA_ARCH__ >= 800) */
+# if !defined(_CUDA_PIPELINE_MAX_STAGES) /* an earlier definition by the includer wins */
+#  define _CUDA_PIPELINE_MAX_STAGES 8 /* upper clamp applied to N in pipeline_wait_prior<N> */
+# endif /* !defined(_CUDA_PIPELINE_MAX_STAGES) */
+# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900))) /* C++11, or MSVC with _MSC_VER >= 1900 */
+#  define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
+# endif /* C++11 detection */
+# if !defined(_CUDA_PIPELINE_DEBUG) /* an earlier definition by the includer wins */
+#  if defined(__CUDACC_DEBUG__) /* debug checks follow __CUDACC_DEBUG__ by default */
+#   define _CUDA_PIPELINE_DEBUG 1
+#  else
+#   define _CUDA_PIPELINE_DEBUG 0
+#  endif /* defined(__CUDACC_DEBUG__) */
+# endif /* !defined(_CUDA_PIPELINE_DEBUG) */
+# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG) /* checks are live only in debug builds without NDEBUG */
+#  if !defined(__CUDACC_RTC__) /* <cassert> is not included under __CUDACC_RTC__ */
+#   include <cassert>
+#  endif /* !defined(__CUDACC_RTC__) */
+#  define _CUDA_PIPELINE_ASSERT(x) assert((x)); /* note: the expansion already ends in ';' */
+#  define _CUDA_PIPELINE_ABORT() assert(0); /* note: the expansion already ends in ';' */
+# else
+#  define _CUDA_PIPELINE_ASSERT(x) /* expands to nothing: the checked expression is not evaluated */
+#  define _CUDA_PIPELINE_ABORT() __trap(); /* still aborts when assertions are compiled out */
+# endif /* debug assertions */
+# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
+#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
+# else
+#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m) /* no static_assert before C++11: the check is dropped */
+# endif /* defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER) */
+# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__) /* presumably the targets with 32-bit pointers -- confirm */
+#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r" /* inline-asm operand constraint used for the src pointer of cp.async */
+# else
+#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l" /* inline-asm operand constraint used for the src pointer of cp.async */
+# endif /* pointer constraint selection */
+# if defined(__CUDACC_RTC__) /* under __CUDACC_RTC__ the fixed-width names are declared by hand instead of via <stdint.h> */
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t           uintptr_t;
+# else
+#  include <stdint.h>
+# endif /* defined(__CUDACC_RTC__) */
+_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) ==  2, "Size mismatch for type 'short'");  // pipeline_memcpy_sync moves 2-byte steps as short
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int)   ==  4, "Size mismatch for type 'int'");  // pipeline_memcpy_sync moves 4-byte steps as int
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2)  ==  8, "Size mismatch for type 'int2'");  // pipeline_memcpy_sync moves 8-byte steps as int2
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4)  == 16, "Size mismatch for type 'int4'");  // pipeline_memcpy_sync moves 16-byte steps as int4
+extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);  // its 32-bit result is fed to the "r" operands of the cp.async asm below
+// Synchronous copy of CopySize bytes into dst.
+// The first SourceSize bytes are read from src; every remaining byte of the
+// CopySize-byte destination is written as zero. SourceSize == 0 therefore
+// zero-fills the whole destination without reading src.
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+    const char* const in = reinterpret_cast<const char*>(src);
+    char* const out = reinterpret_cast<char*>(dst);
+    // Width of one store: the whole destination when there is nothing to read,
+    // the source size when it is 2, 4, 8 or 16, and a single byte otherwise.
+    const bool source_is_store_width =
+        (SourceSize == 2) || (SourceSize == 4) || (SourceSize == 8) || (SourceSize == 16);
+    const size_t stride = (SourceSize == 0) ? CopySize : (source_is_store_width ? SourceSize : 1);
+    size_t offset = 0;
+    while (offset < CopySize) {
+        // Offsets at or beyond SourceSize receive a value-initialized (zero) element.
+        const bool from_source = (SourceSize != 0) && (offset < SourceSize);
+        if (stride == 1) {
+            out[offset] = from_source ? in[offset] : char();
+        } else if (stride == 2) {
+            short* const slot = reinterpret_cast<short*>(out + offset);
+            *slot = from_source ? *reinterpret_cast<const short*>(in + offset) : short();
+        } else if (stride == 4) {
+            int* const slot = reinterpret_cast<int*>(out + offset);
+            *slot = from_source ? *reinterpret_cast<const int*>(in + offset) : int();
+        } else if (stride == 8) {
+            int2* const slot = reinterpret_cast<int2*>(out + offset);
+            *slot = from_source ? *reinterpret_cast<const int2*>(in + offset) : int2();
+        } else if (stride == 16) {
+            int4* const slot = reinterpret_cast<int4*>(out + offset);
+            *slot = from_source ? *reinterpret_cast<const int4*>(in + offset) : int4();
+        }
+        offset += stride;
+    }
+}
+template<bool UseHwAsyncCopy>  // primary template is only declared; the true and false cases are explicit specializations
+struct ImplementationChooser;
+template<>
+struct ImplementationChooser<true> {  // implementation built on the cp.async PTX instructions
+    template<size_t CopySize, size_t SourceSize>
+    struct CpAsyncChooser {  // generic case: emits the ".ca" variant of cp.async
+        _CUDA_PIPELINE_STATIC_QUALIFIER
+        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
+        {
+            asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"  // %0 = dst, %1 = src, %2 = CopySize, %3 = SourceSize
+                :
+                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),  // dst goes through __nvvm_get_smem_pointer
+                  "n"(SourceSize)  // "n": must be a compile-time integer
+                : "memory");
+        }
+    };
+    template<size_t SourceSize>
+    struct CpAsyncChooser<16, SourceSize> {  // 16-byte copies use the ".cg" variant instead -- see the PTX ISA for the cache semantics
+        _CUDA_PIPELINE_STATIC_QUALIFIER
+        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
+        {
+            asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"  // same operand layout as the generic case, copy size fixed to 16
+                :
+                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
+                : "memory");
+        }
+    };
+    template<size_t CopySize, size_t SourceSize>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)  // validates the arguments, then issues one cp.async
+    {
+        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+        _CUDA_PIPELINE_ASSERT(__isShared(dst));  // destination must be a shared-memory address
+        _CUDA_PIPELINE_ASSERT(__isGlobal(src));  // source must be a global-memory address
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));  // dst aligned to CopySize
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));  // src aligned to CopySize
+        CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);  // picks the .cg specialization when CopySize == 16
+    }
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_commit()  // emits cp.async.commit_group
+    {
+        asm volatile ("cp.async.commit_group;");
+    }
+    template<unsigned N>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_wait_prior()  // emits cp.async.wait_group with N as its immediate operand
+    {
+        asm volatile ("cp.async.wait_group %0;"
+            :
+            : "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));  // N is clamped to _CUDA_PIPELINE_MAX_STAGES
+    }
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_arrive_on(uint64_t* barrier)  // emits cp.async.mbarrier.arrive on the given barrier
+    {
+        _CUDA_PIPELINE_ASSERT(__isShared(barrier));  // the barrier object must live in shared memory
+        asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
+            :
+            : "r"(__nvvm_get_smem_pointer(barrier)));
+    }
+};
+// Fallback implementation used when _CUDA_PIPELINE_HAS_ASYNC_COPY is 0:
+// the copy itself runs synchronously and the bookkeeping calls do nothing.
+template<>
+struct ImplementationChooser<false> {
+    // Same argument checks as the cp.async path, then a synchronous copy.
+    template<size_t CopySize, size_t SourceSize>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+    {
+        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+        _CUDA_PIPELINE_ASSERT(__isShared(dst));
+        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
+    }
+    // Nothing to commit: the copy above has already completed.
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_commit()
+    {
+        return;
+    }
+    // Nothing to wait for: the copy above has already completed.
+    template<unsigned N>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_wait_prior()
+    {
+        return;
+    }
+    // Intentionally leaves the barrier untouched in this implementation.
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_arrive_on(uint64_t* barrier)
+    {
+        (void)barrier;
+    }
+};
+// Copies CopySize bytes from global memory (src) to shared memory (dst) using
+// whichever implementation _CUDA_PIPELINE_HAS_ASYNC_COPY selects.
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+{
+    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Impl;
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+    _CUDA_PIPELINE_ASSERT(__isShared(dst));
+    _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+    Impl::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
+}
+// Forwards to the pipeline_commit of the implementation selected by
+// _CUDA_PIPELINE_HAS_ASYNC_COPY.
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_commit()
+{
+    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Impl;
+    Impl::pipeline_commit();
+}
+// Forwards to the pipeline_wait_prior<N> of the implementation selected by
+// _CUDA_PIPELINE_HAS_ASYNC_COPY.
+template<unsigned N>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_wait_prior()
+{
+    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Impl;
+    Impl::pipeline_wait_prior<N>();
+}
+// Forwards the barrier to the pipeline_arrive_on of the implementation
+// selected by _CUDA_PIPELINE_HAS_ASYNC_COPY.
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_arrive_on(uint64_t* barrier)
+{
+    typedef ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY> Impl;
+    Impl::pipeline_arrive_on(barrier);
+}
+// Copies one CopySize-byte, CopySize-aligned element from src to dst.
+// The pipelined path is taken only for a global source and a shared
+// destination; every other combination is copied synchronously.
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+    const bool global_to_shared = __isGlobal(src) && __isShared(dst);
+    if (!global_to_shared) {
+        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
+        return;
+    }
+    pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
+}
+// Copies CopySize bytes from src to dst when both pointers are only known to
+// be aligned to Align bytes.
+//
+// The copy is split into the widest chunks the alignment and the remaining
+// byte count allow: 16-, 8- and 4-byte chunks go through pipeline_copy_strict
+// (which may use the asynchronous path), while 2-byte and 1-byte remainders
+// are plain loads and stores.
+//
+// Fix: the chunked branches used to pass the original dst/src base pointers to
+// pipeline_copy_strict instead of the advancing cursors, so any CopySize
+// larger than one chunk re-copied the first chunk and left the rest of the
+// destination unwritten. They now pass the cursors.
+template<size_t CopySize, size_t Align>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));
+    // Cursors that walk through the source and destination ranges.
+    const char* s = reinterpret_cast<const char*>(src);
+    char* d = reinterpret_cast<char*>(dst);
+    size_t remaining = CopySize;
+    while (remaining) {
+        if ((Align >= 16) && (remaining >= 16)) {
+            pipeline_copy_strict<16, 16>(d, s);
+            d += 16;
+            s += 16;
+            remaining -= 16;
+        } else if ((Align >= 8) && (remaining >= 8)) {
+            pipeline_copy_strict<8, 8>(d, s);
+            d += 8;
+            s += 8;
+            remaining -= 8;
+        } else if ((Align >= 4) && (remaining >= 4)) {
+            pipeline_copy_strict<4, 4>(d, s);
+            d += 4;
+            s += 4;
+            remaining -= 4;
+        } else if ((Align >= 2) && (remaining >= 2)) {
+            // Below 4 bytes there is no pipelined variant: copy directly.
+            *reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
+            d += 2;
+            s += 2;
+            remaining -= 2;
+        } else {
+            *d = *s;
+            d += 1;
+            s += 1;
+            remaining -= 1;
+        }
+    }
+}
+_CUDA_PIPELINE_END_INTERNAL_NAMESPACE
+#endif /* !_CUDA_PIPELINE_HELPERS_H_ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline_primitives.h ADDED Viewed

	@@ -0,0 +1,148 @@

+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
+# define _CUDA_PIPELINE_PRIMITIVES_H_
+# include "cuda_pipeline_helpers.h"
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
+                             size_t zfill = 0)
+{
+    _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
+    _CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
+    _CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
+    _CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
+    switch (size_and_align) {
+    case 16:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  9>(dst_shared, src_global); return;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  8>(dst_shared, src_global); return;
+        case  9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  7>(dst_shared, src_global); return;
+        case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  6>(dst_shared, src_global); return;
+        case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  5>(dst_shared, src_global); return;
+        case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  4>(dst_shared, src_global); return;
+        case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  3>(dst_shared, src_global); return;
+        case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  2>(dst_shared, src_global); return;
+        case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  1>(dst_shared, src_global); return;
+        case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    case 8:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  8>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  7>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  6>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  5>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  4>(dst_shared, src_global); return;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  3>(dst_shared, src_global); return;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  2>(dst_shared, src_global); return;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  1>(dst_shared, src_global); return;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    case 4:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  4>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  3>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  2>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  1>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    default:
+        _CUDA_PIPELINE_ABORT();
+        return;
+    }
+}
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_commit() /* Public primitive: takes no arguments and returns nothing. */
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit(); /* all work is delegated to the internal helper;
+                                                           * NOTE(review): presumably closes the batch of
+                                                           * preceding __pipeline_memcpy_async calls - confirm
+                                                           * in cuda_pipeline_helpers.h */
+}
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_wait_prior(size_t prior)
+{
+    /*
+     * Maps the runtime value `prior` onto the compile-time template argument of the internal
+     * pipeline_wait_prior<N>() helper: values 0..7 select N == prior, every larger value selects N == 8.
+     * NOTE(review): the waiting semantics themselves live in cuda_pipeline_helpers.h and are not
+     * visible here.
+     */
+    if (prior == 0) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>();
+    } else if (prior == 1) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>();
+    } else if (prior == 2) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>();
+    } else if (prior == 3) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>();
+    } else if (prior == 4) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>();
+    } else if (prior == 5) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>();
+    } else if (prior == 6) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>();
+    } else if (prior == 7) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>();
+    } else {
+        /* 8 is the largest instantiation used; every value above 7 shares it. */
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>();
+    }
+}
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  include "cuda_awbarrier_primitives.h"
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_arrive_on(__mbarrier_t* barrier) /* only declared when _CUDA_PIPELINE_ARCH_700_OR_LATER is defined */
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier); /* `barrier` is passed through unchanged to
+                                                                     * the internal helper; its required state
+                                                                     * is not checked here */
+}
+# endif
+#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_types.h ADDED Viewed

	@@ -0,0 +1,81 @@

+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__DEVICE_TYPES_H__)
+#define __DEVICE_TYPES_H__
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+/*******************************************************************************
+*                                                                              *
+*  Rounding-mode selector. No explicit values are given, so the enumerators    *
+*  are numbered 0..3 in declaration order.                                     *
+*                                                                              *
+*******************************************************************************/
+enum __device_builtin__ cudaRoundMode
+{
+    cudaRoundNearest, /* 0 - NOTE(review): presumably round to nearest; inferred from the name only */
+    cudaRoundZero,    /* 1 - NOTE(review): presumably round toward zero; inferred from the name only */
+    cudaRoundPosInf,  /* 2 - NOTE(review): presumably round toward +infinity; inferred from the name only */
+    cudaRoundMinInf   /* 3 - NOTE(review): presumably round toward -infinity; inferred from the name only */
+};
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif
+#endif /* !__DEVICE_TYPES_H__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h ADDED Viewed

	@@ -0,0 +1,65 @@

+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard not set by an including header: warn, then set it */
+#if defined(_MSC_VER)
+#pragma message("host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* presumably checked by crt/host_defines.h - confirm there */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__ /* records that this file set the guard */
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+#include "crt/host_defines.h"
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__) /* undo only what this file set */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h ADDED Viewed

	@@ -0,0 +1,60 @@

+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard not set by an including header: set it here */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* presumably checked by crt/mma.h - confirm there */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__ /* records that this file set the guard */
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+#include "crt/mma.h"
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__) /* undo only what this file set */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.h ADDED Viewed

	@@ -0,0 +1,123 @@

+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__SM_61_INTRINSICS_H__)
+#define __SM_61_INTRINSICS_H__
+#if defined(__CUDACC_RTC__) /* __CUDACC_RTC__ defined: declare the intrinsics as plain __device__ functions */
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__: declare them as static inline __device__ functions */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+#if defined(__cplusplus) && defined(__CUDACC__)
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#include "cuda_runtime_api.h"
+#ifndef __CUDA_ARCH__ /* __CUDA_ARCH__ undefined: each declaration below gets an empty body */
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ defined: each declaration below is a prototype only */
+#define __DEF_IF_HOST ;
+#endif /* !__CUDA_ARCH__ */
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-6.1 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*  Every declaration ends in __DEF_IF_HOST, which expands to an empty body     *
+*  `{ }` when __CUDA_ARCH__ is undefined and to `;` otherwise. Each intrinsic  *
+*  has a signed and an unsigned overload for every operand form.               *
+*                                                                              *
+*******************************************************************************/
+/******************************************************************************
+ *                                   __dp2a                                   *
+ *  NOTE(review): presumably a two-way dot product with accumulate into c;    *
+ *  the _lo/_hi suffix presumably selects which half of srcB is used -        *
+ *  confirm against sm_61_intrinsics.hpp / the CUDA Math API.                 *
+ ******************************************************************************/
+// Generic [_lo]: all three operands are plain int / unsigned int
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_lo]: srcA is short2/ushort2, srcB is char4/uchar4, c stays scalar
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+// Generic [_hi]: all three operands are plain int / unsigned int
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_hi]: srcA is short2/ushort2, srcB is char4/uchar4, c stays scalar
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __dp4a                                   *
+ *  NOTE(review): presumably a four-way dot product with accumulate into c -  *
+ *  confirm against sm_61_intrinsics.hpp / the CUDA Math API.                 *
+ ******************************************************************************/
+// Generic: all three operands are plain int / unsigned int
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style: srcA and srcB are char4/uchar4, c stays scalar
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+#endif /* __cplusplus && __CUDACC__ */
+#undef __DEF_IF_HOST /* helper macros are local to this header; remove them before leaving */
+#undef __SM_61_INTRINSICS_DECL__
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) /* sm_61_intrinsics.hpp is included only in this configuration */
+#include "sm_61_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+#endif /* !__SM_61_INTRINSICS_H__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h ADDED Viewed

	@@ -0,0 +1,281 @@

+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__TEXTURE_TYPES_H__)
+#define __TEXTURE_TYPES_H__
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#include "driver_types.h"
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+/*******************************************************************************
+*                                                                              *
+*  Texture type identifiers. Each *Layered constant equals its non-layered     *
+*  counterpart with the high nibble set to 0xF (0x01 -> 0xF1, 0x02 -> 0xF2,    *
+*  0x0C -> 0xFC). There is no layered counterpart for cudaTextureType3D.       *
+*                                                                              *
+*******************************************************************************/
+#define cudaTextureType1D              0x01
+#define cudaTextureType2D              0x02
+#define cudaTextureType3D              0x03
+#define cudaTextureTypeCubemap         0x0C
+#define cudaTextureType1DLayered       0xF1
+#define cudaTextureType2DLayered       0xF2
+#define cudaTextureTypeCubemapLayered  0xFC
+/**
+ * CUDA texture address modes
+ *
+ * Element type of the addressMode[3] arrays in ::textureReference,
+ * ::cudaTextureDesc and ::cudaTextureDesc_v2.
+ */
+enum __device_builtin__ cudaTextureAddressMode
+{
+    cudaAddressModeWrap   = 0,    /**< Wrapping address mode */
+    cudaAddressModeClamp  = 1,    /**< Clamp to edge address mode */
+    cudaAddressModeMirror = 2,    /**< Mirror address mode */
+    cudaAddressModeBorder = 3     /**< Border address mode */
+};
+/**
+ * CUDA texture filter modes
+ *
+ * Type of both the filterMode and the mipmapFilterMode members of
+ * ::textureReference, ::cudaTextureDesc and ::cudaTextureDesc_v2.
+ */
+enum __device_builtin__ cudaTextureFilterMode
+{
+    cudaFilterModePoint  = 0,     /**< Point filter mode */
+    cudaFilterModeLinear = 1      /**< Linear filter mode */
+};
+/**
+ * CUDA texture read modes
+ *
+ * Type of the readMode member of ::cudaTextureDesc and ::cudaTextureDesc_v2;
+ * ::textureReference has no member of this type.
+ */
+enum __device_builtin__ cudaTextureReadMode
+{
+    cudaReadModeElementType     = 0,  /**< Read texture as specified element type */
+    cudaReadModeNormalizedFloat = 1   /**< Read texture as normalized float */
+};
+/**
+ * CUDA texture reference
+ *
+ * Unlike ::cudaTextureDesc, this struct carries a channel descriptor
+ * (channelDesc) and a trailing reserved array, and has no readMode or
+ * borderColor members.
+ */
+struct __device_builtin__ textureReference
+{
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                          normalized;
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode   filterMode;
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode  addressMode[3];
+    /**
+     * Channel descriptor for the texture reference
+     */
+    struct cudaChannelFormatDesc channelDesc;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                          sRGB;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                 maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode   mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                        mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                        minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                        maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations.
+     */
+    int                          disableTrilinearOptimization;
+    int                          __cudaReserved[14]; /* 14 ints with no documented meaning in this header;
+                                                      * NOTE(review): presumably padding reserved for
+                                                      * future members - do not rely on its contents */
+};
+/**
+ * CUDA texture descriptor
+ *
+ * ::cudaTextureDesc_v2 declares the same members in the same order and
+ * appends one more (seamlessCubemap).
+ */
+struct __device_builtin__ cudaTextureDesc
+{
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode addressMode[3];
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode  filterMode;
+    /**
+     * Texture read mode
+     */
+    enum cudaTextureReadMode    readMode;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                         sRGB;
+    /**
+     * Texture Border Color
+     */
+    float                       borderColor[4];
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                         normalizedCoords;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode  mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                       mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                       minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                       maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations.
+     */
+    int                         disableTrilinearOptimization;
+};
+/**
+ * CUDA texture descriptor, version 2
+ *
+ * Declares the same members, in the same order, as ::cudaTextureDesc and
+ * appends one extra member, \p seamlessCubemap.
+ */
+struct __device_builtin__ cudaTextureDesc_v2
+{
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode addressMode[3];
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode  filterMode;
+    /**
+     * Texture read mode
+     */
+    enum cudaTextureReadMode    readMode;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                         sRGB;
+    /**
+     * Texture Border Color
+     */
+    float                       borderColor[4];
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                         normalizedCoords;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode  mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                       mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                       minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                       maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations.
+     */
+    int                         disableTrilinearOptimization;
+    /**
+     * Enable seamless cube map filtering.
+     */
+    int                         seamlessCubemap;
+};
+/**
+ * An opaque value that represents a CUDA texture object
+ *
+ * The handle is carried as an unsigned long long; callers should treat the
+ * value as opaque rather than interpret its bits.
+ */
+typedef __device_builtin__ unsigned long long cudaTextureObject_t;
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+#endif /* !__TEXTURE_TYPES_H__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.h ADDED Viewed

	@@ -0,0 +1,175 @@

+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__VECTOR_FUNCTIONS_H__)
+#define __VECTOR_FUNCTIONS_H__
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#include "cuda_runtime_api.h"
+/*
+ * Qualifier prefix applied to every make_* declaration below.
+ * When __CUDACC_RTC__ is defined the functions are declared plain
+ * __host__ __device__; otherwise they are additionally static __inline__.
+ * The macro is #undef'd again after the declarations.
+ */
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+/*
+ * Declarations of the make_<type><N> helpers.
+ *
+ * Each function takes N scalar components (named x, y, z, w in that order)
+ * and returns the vector type of the same name, for N = 1..4 and the element
+ * types char, short, int, long, long long (signed and unsigned), float and
+ * double.
+ *
+ * NOTE(review): the definitions are presumably supplied by
+ * "vector_functions.hpp", which this header includes when __CUDACC_RTC__ is
+ * not defined -- confirm against that file.
+ */
+/* 8-bit integer vectors */
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
+/* short / unsigned short vectors */
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+/* int / unsigned int vectors */
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
+/* long / unsigned long vectors */
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
+/* float vectors */
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
+/* long long / unsigned long long vectors */
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
+/* double vectors */
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
+/* The qualifier macro is private to this header. */
+#undef __VECTOR_FUNCTIONS_DECL__
+#if !defined(__CUDACC_RTC__)
+#include "vector_functions.hpp"
+#endif /* !__CUDACC_RTC__ */
+#endif /* !__VECTOR_FUNCTIONS_H__ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h ADDED Viewed

	@@ -0,0 +1,93 @@

+ /* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+#ifndef CURAND_GLOBALS_H
+#define CURAND_GLOBALS_H
+/* NOTE(review): MAX_XOR_N is presumably a size bound for the XOR-based
+ * generators -- its use is not visible in this header; confirm at call sites. */
+#define MAX_XOR_N (5)
+/* Skip-ahead block size in bits, and the mask selecting that many low bits
+ * ((1 << 4) - 1 == 0xF). */
+#define SKIPAHEAD_BLOCKSIZE (4)
+#define SKIPAHEAD_MASK ((1<<SKIPAHEAD_BLOCKSIZE)-1)
+/* 2^32 in single and double precision. */
+#define CURAND_2POW32 (4294967296.f)
+#define CURAND_2POW32_DOUBLE (4294967296.)
+/* 2^-32 in single and double precision. */
+#define CURAND_2POW32_INV (2.3283064e-10f)
+#define CURAND_2POW32_INV_DOUBLE (2.3283064365386963e-10)
+/* 2^-53 in double precision. */
+#define CURAND_2POW53_INV_DOUBLE (1.1102230246251565e-16)
+/* 2^-32 * 2*pi (single precision) and 2*pi itself. */
+#define CURAND_2POW32_INV_2PI (2.3283064e-10f * 6.2831855f)
+#define CURAND_2PI (6.2831855f)
+/* 2^-53 * 2*pi, pi and 2*pi in double precision. */
+#define CURAND_2POW53_INV_2PI_DOUBLE (1.1102230246251565e-16 * 6.2831853071795860)
+#define CURAND_PI_DOUBLE  (3.1415926535897932)
+#define CURAND_2PI_DOUBLE (6.2831853071795860)
+/* NOTE(review): despite the names, both values are NEGATIVE sqrt(2).
+ * Presumably the sign is folded in for the code that consumes them --
+ * confirm at the use sites before changing. */
+#define CURAND_SQRT2 (-1.4142135f)
+#define CURAND_SQRT2_DOUBLE (-1.4142135623730951)
+/* NOTE(review): the remaining constants look like algorithm thresholds for
+ * the Sobol/MTGP32 generators and lambda-parameterised distributions; their
+ * exact meaning is not established in this header -- confirm at use sites. */
+#define SOBOL64_ITR_BINARY_DIVIDE 2
+#define SOBOL_M2_BINARY_DIVIDE 10
+#define MTGP32_M2_BINARY_DIVIDE 32
+#define MAX_LAMBDA 400000
+#define MIN_GAUSS_LAMBDA 2000
+/* Single-precision (mean, stddev) parameter pair; also exposed through the
+ * normal_args_t typedef. */
+struct normal_args_st {
+    float mean;
+    float stddev;
+};
+typedef struct normal_args_st normal_args_t;
+/* Double-precision (mean, stddev) parameter pair; also exposed through the
+ * normal_args_double_t typedef. */
+struct normal_args_double_st {
+    double mean;
+    double stddev;
+};
+typedef struct normal_args_double_st normal_args_double_t;
+#endif

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h ADDED Viewed

	@@ -0,0 +1,697 @@

+ /* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+#if !defined(CURAND_LOGNORMAL_H_)
+#define CURAND_LOGNORMAL_H_
+/**
+ * \defgroup DEVICE Device API
+ *
+ * @{
+ */
+#ifndef __CUDACC_RTC__
+#include <math.h>
+#endif // __CUDACC_RTC__
+#include "curand_mrg32k3a.h"
+#include "curand_mtgp32_kernel.h"
+#include "curand_philox4x32_x.h"
+/**
+ * \brief Return a log-normally distributed float from an XORWOW generator.
+ *
+ * Computes exp(\p mean + \p stddev * n), where n is a normal variate
+ * derived from the XORWOW generator in \p state.
+ *
+ * Results are produced in pairs. When no log-normal value is pending in
+ * \p state, two values are drawn with curand(), converted with
+ * _curand_box_muller(), the second result is stored in
+ * \p state->boxmuller_extra and the first one is returned. The following
+ * call returns the stored value without drawing and clears the flag.
+ * See ::curand_log_normal2() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStateXORWOW_t *state, float mean, float stddev)
+{
+    /* Second half of a previously generated pair is pending: hand it out. */
+    if(state->boxmuller_flag == EXTRA_FLAG_LOG_NORMAL) {
+        state->boxmuller_flag = 0;
+        return state->boxmuller_extra;
+    }
+    /* The two draws must be taken in this order. */
+    const unsigned int first = curand(state);
+    const unsigned int second = curand(state);
+    const float2 normals = _curand_box_muller(first, second);
+    state->boxmuller_extra = expf(mean + (stddev * normals.y));
+    state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
+    return expf(mean + (stddev * normals.x));
+}
+/**
+ * \brief Return a log-normally distributed float from a Philox4_32_10 generator.
+ *
+ * Computes exp(\p mean + \p stddev * n), where n is a normal variate
+ * derived from the Philox4_32_10 generator in \p state.
+ *
+ * Results are produced in pairs. When no log-normal value is pending in
+ * \p state, two values are drawn with curand(), converted with
+ * _curand_box_muller(), the second result is stored in
+ * \p state->boxmuller_extra and the first one is returned. The following
+ * call returns the stored value without drawing and clears the flag.
+ * See ::curand_log_normal2() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStatePhilox4_32_10_t *state, float mean, float stddev)
+{
+    /* Second half of a previously generated pair is pending: hand it out. */
+    if(state->boxmuller_flag == EXTRA_FLAG_LOG_NORMAL) {
+        state->boxmuller_flag = 0;
+        return state->boxmuller_extra;
+    }
+    /* The two draws must be taken in this order. */
+    const unsigned int first = curand(state);
+    const unsigned int second = curand(state);
+    const float2 normals = _curand_box_muller(first, second);
+    state->boxmuller_extra = expf(mean + (stddev * normals.y));
+    state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
+    return expf(mean + (stddev * normals.x));
+}
+/**
+ * \brief Return two log-normally distributed floats from an XORWOW generator.
+ *
+ * Return two log-normally distributed floats derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the XORWOW generator in \p state,
+ * increment position of generator by two.
+ *
+ * The pair of normal variates is obtained from curand_box_muller(); each
+ * component n is then mapped to exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float2 where each element is derived from
+ * a normal distribution with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float2 curand_log_normal2(curandStateXORWOW_t *state, float mean, float stddev)
+{
+    const float2 normals = curand_box_muller(state);
+    float2 result;
+    result.x = expf(mean + (stddev * normals.x));
+    result.y = expf(mean + (stddev * normals.y));
+    return result;
+}
+/**
+ * \brief Return two log-normally distributed floats from a Philox4_32_10 generator.
+ *
+ * Return two log-normally distributed floats derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Philox4_32_10 generator in \p state,
+ * increment position of generator by two.
+ *
+ * The pair of normal variates is obtained from curand_box_muller(); each
+ * component n is then mapped to exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float2 where each element is derived from
+ * a normal distribution with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float2 curand_log_normal2(curandStatePhilox4_32_10_t *state, float mean, float stddev)
+{
+    const float2 normals = curand_box_muller(state);
+    float2 result;
+    result.x = expf(mean + (stddev * normals.x));
+    result.y = expf(mean + (stddev * normals.y));
+    return result;
+}
+/**
+ * \brief Return four log-normally distributed floats from a Philox4_32_10 generator.
+ *
+ * Return four log-normally distributed floats derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Philox4_32_10 generator in \p state,
+ * increment position of generator by four.
+ *
+ * The four normal variates are obtained from curand_box_muller4(); each
+ * component n is then mapped to exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float4 where each element is derived from
+ * a normal distribution with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float4 curand_log_normal4(curandStatePhilox4_32_10_t *state, float mean, float stddev)
+{
+    const float4 normals = curand_box_muller4(state);
+    float4 result;
+    result.x = expf(mean + (stddev * normals.x));
+    result.y = expf(mean + (stddev * normals.y));
+    result.z = expf(mean + (stddev * normals.z));
+    result.w = expf(mean + (stddev * normals.w));
+    return result;
+}
+/**
+ * \brief Return a log-normally distributed float from an MRG32k3a generator.
+ *
+ * Computes exp(\p mean + \p stddev * n), where n is a normal variate
+ * derived from the MRG32k3a generator in \p state.
+ *
+ * Results are produced in pairs. When no log-normal value is pending in
+ * \p state, a pair of normal variates is obtained from
+ * curand_box_muller_mrg(), the second result is stored in
+ * \p state->boxmuller_extra and the first one is returned. The following
+ * call returns the stored value without generating and clears the flag.
+ * See ::curand_log_normal2() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStateMRG32k3a_t *state, float mean, float stddev)
+{
+    /* Second half of a previously generated pair is pending: hand it out. */
+    if(state->boxmuller_flag == EXTRA_FLAG_LOG_NORMAL) {
+        state->boxmuller_flag = 0;
+        return state->boxmuller_extra;
+    }
+    const float2 normals = curand_box_muller_mrg(state);
+    state->boxmuller_extra = expf(mean + (stddev * normals.y));
+    state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
+    return expf(mean + (stddev * normals.x));
+}
+/**
+ * \brief Return two log-normally distributed floats from an MRG32k3a generator.
+ *
+ * Return two log-normally distributed floats derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the MRG32k3a generator in \p state,
+ * increment position of generator by two.
+ *
+ * The pair of normal variates is obtained from curand_box_muller_mrg();
+ * each component n is then mapped to exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float2 where each element is derived from
+ * a normal distribution with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float2 curand_log_normal2(curandStateMRG32k3a_t *state, float mean, float stddev)
+{
+    const float2 normals = curand_box_muller_mrg(state);
+    float2 result;
+    result.x = expf(mean + (stddev * normals.x));
+    result.y = expf(mean + (stddev * normals.y));
+    return result;
+}
+/**
+ * \brief Return a log-normally distributed float from an MTGP32 generator.
+ *
+ * Return a single log-normally distributed float derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the MTGP32 generator in \p state. One value is drawn from the
+ * generator with curand() per call.
+ *
+ * The draw is converted to a normal variate n with _curand_normal_icdf()
+ * (inverse cumulative distribution function) and the result is
+ * exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStateMtgp32_t *state, float mean, float stddev)
+{
+    const float exponent = mean + (stddev * _curand_normal_icdf(curand(state)));
+    return expf(exponent);
+}
+/**
+ * \brief Return a log-normally distributed float from a Sobol32 generator.
+ *
+ * Return a single log-normally distributed float derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Sobol32 generator in \p state. One value is drawn from the
+ * generator with curand() per call.
+ *
+ * The draw is converted to a normal variate n with _curand_normal_icdf()
+ * (inverse cumulative distribution function) and the result is
+ * exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStateSobol32_t *state, float mean, float stddev)
+{
+    const float exponent = mean + (stddev * _curand_normal_icdf(curand(state)));
+    return expf(exponent);
+}
+/**
+ * \brief Return a log-normally distributed float from a scrambled Sobol32 generator.
+ *
+ * Return a single log-normally distributed float derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the scrambled Sobol32 generator in \p state. One value is drawn
+ * from the generator with curand() per call.
+ *
+ * The draw is converted to a normal variate n with _curand_normal_icdf()
+ * (inverse cumulative distribution function) and the result is
+ * exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStateScrambledSobol32_t *state, float mean, float stddev)
+{
+    const float exponent = mean + (stddev * _curand_normal_icdf(curand(state)));
+    return expf(exponent);
+}
+/**
+ * \brief Return a log-normally distributed float from a Sobol64 generator.
+ *
+ * Return a single log-normally distributed float derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Sobol64 generator in \p state. One value is drawn from the
+ * generator with curand() per call.
+ *
+ * The draw is converted to a normal variate n with _curand_normal_icdf()
+ * (inverse cumulative distribution function) and the result is
+ * exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStateSobol64_t *state, float mean, float stddev)
+{
+    const float exponent = mean + (stddev * _curand_normal_icdf(curand(state)));
+    return expf(exponent);
+}
+/**
+ * \brief Return a log-normally distributed float from a scrambled Sobol64 generator.
+ *
+ * Return a single log-normally distributed float derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the scrambled Sobol64 generator in \p state. One value is drawn
+ * from the generator with curand() per call.
+ *
+ * The draw is converted to a normal variate n with _curand_normal_icdf()
+ * (inverse cumulative distribution function) and the result is
+ * exp(\p mean + \p stddev * n).
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed float derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS float curand_log_normal(curandStateScrambledSobol64_t *state, float mean, float stddev)
+{
+    const float exponent = mean + (stddev * _curand_normal_icdf(curand(state)));
+    return expf(exponent);
+}
+/**
+ * \brief Return a log-normally distributed double from an XORWOW generator.
+ *
+ * Computes exp(\p mean + \p stddev * n), where n is a normal variate
+ * derived from the XORWOW generator in \p state.
+ *
+ * Results are produced in pairs. When no log-normal double is pending in
+ * \p state, four values are drawn with curand(), converted with
+ * _curand_box_muller_double(), the second result is stored in
+ * \p state->boxmuller_extra_double and the first one is returned. The
+ * following call returns the stored value without drawing and clears the
+ * flag.
+ * See ::curand_log_normal2_double() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state  - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double derived from a normal distribution
+ * with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStateXORWOW_t *state, double mean, double stddev)
+{
+    /* Second half of a previously generated pair is pending: hand it out. */
+    if(state->boxmuller_flag_double == EXTRA_FLAG_LOG_NORMAL) {
+        state->boxmuller_flag_double = 0;
+        return state->boxmuller_extra_double;
+    }
+    /* Four draws, taken in this order, feed the double-precision transform. */
+    const unsigned int a0 = curand(state);
+    const unsigned int a1 = curand(state);
+    const unsigned int b0 = curand(state);
+    const unsigned int b1 = curand(state);
+    const double2 normals = _curand_box_muller_double(a0, a1, b0, b1);
+    state->boxmuller_extra_double = exp(mean + (stddev * normals.y));
+    state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
+    return exp(mean + (stddev * normals.x));
+}
+/**
+ * \brief Return a log-normally distributed double from an Philox4_32_10 generator.
+ *
+ * Return a single normally distributed double derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Philox4_32_10 generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, transforms them to log-normal distribution,
+ * then returns them one at a time.
+ * See ::curand_log_normal2_double() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
+{
+    // A previous call left its second log-normal value cached: consume it.
+    // The cached value was computed with the mean/stddev of that earlier call.
+    if(state->boxmuller_flag_double == EXTRA_FLAG_LOG_NORMAL) {
+        state->boxmuller_flag_double = 0;
+        return state->boxmuller_extra_double;
+    }
+    // One curand4() call supplies all four words consumed by the Box-Muller step.
+    const uint4 words = curand4(state);
+    const double2 pair = _curand_box_muller_double(words.x, words.y, words.z, words.w);
+    // Stash the second result for the next call and hand back the first.
+    state->boxmuller_extra_double = exp(mean + (stddev * pair.y));
+    state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
+    return exp(mean + (stddev * pair.x));
+}
+/**
+ * \brief Return two log-normally distributed doubles from an XORWOW generator.
+ *
+ * Return two log-normally distributed doubles derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the XORWOW generator in \p state,
+ * increment position of generator by two.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, and transforms them to log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double2 where each element is from a
+ * distribution with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double2 curand_log_normal2_double(curandStateXORWOW_t *state, double mean, double stddev)
+{
+    // Obtain a pair of normal variates from the Box-Muller helper.
+    const double2 normals = curand_box_muller_double(state);
+    // Transform each component independently: exp(mean + stddev * z).
+    double2 result;
+    result.x = exp(mean + (stddev * normals.x));
+    result.y = exp(mean + (stddev * normals.y));
+    return result;
+}
+/**
+ * \brief Return two log-normally distributed doubles from a Philox4_32_10 generator.
+ *
+ * Return two log-normally distributed doubles derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Philox4_32_10 generator in \p state,
+ * increment position of generator by four.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, and transforms them to log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double2 where each element is from a
+ * distribution with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double2 curand_log_normal2_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
+{
+    // Obtain a pair of normal variates from the Philox Box-Muller helper.
+    const double2 normals = curand_box_muller2_double(state);
+    // Transform each component independently: exp(mean + stddev * z).
+    double2 result;
+    result.x = exp(mean + (stddev * normals.x));
+    result.y = exp(mean + (stddev * normals.y));
+    return result;
+}
+// not part of API
+QUALIFIERS double4 curand_log_normal4_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
+{
+    // Four normal variates in one call to the Box-Muller helper.
+    const double4 normals = curand_box_muller4_double(state);
+    // Transform each component independently: exp(mean + stddev * z).
+    double4 result;
+    result.x = exp(mean + (stddev * normals.x));
+    result.y = exp(mean + (stddev * normals.y));
+    result.z = exp(mean + (stddev * normals.z));
+    result.w = exp(mean + (stddev * normals.w));
+    return result;
+}
+/**
+ * \brief Return a log-normally distributed double from an MRG32k3a generator.
+ *
+ * Return a single log-normally distributed double derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the MRG32k3a generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, transforms them to log-normal distribution,
+ * then returns them one at a time.
+ * See ::curand_log_normal2_double() for a more efficient version that returns
+ * both results at once.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStateMRG32k3a_t *state, double mean, double stddev)
+{
+    // A previous call left its second log-normal value cached: consume it.
+    // The cached value was computed with the mean/stddev of that earlier call.
+    if(state->boxmuller_flag_double == EXTRA_FLAG_LOG_NORMAL) {
+        state->boxmuller_flag_double = 0;
+        return state->boxmuller_extra_double;
+    }
+    // Generate a fresh pair of normal variates.
+    const double2 pair = curand_box_muller_mrg_double(state);
+    // Stash the second result for the next call and hand back the first.
+    state->boxmuller_extra_double = exp(mean + (stddev * pair.y));
+    state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
+    return exp(mean + (stddev * pair.x));
+}
+/**
+ * \brief Return two log-normally distributed doubles from an MRG32k3a generator.
+ *
+ * Return two log-normally distributed doubles derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the MRG32k3a generator in \p state,
+ * increment position of generator by two.
+ *
+ * The implementation uses a Box-Muller transform to generate two
+ * normally distributed results, and transforms them to log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double2 where each element is from a
+ * distribution with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double2 curand_log_normal2_double(curandStateMRG32k3a_t *state, double mean, double stddev)
+{
+    // Obtain a pair of normal variates from the MRG32k3a Box-Muller helper.
+    const double2 normals = curand_box_muller_mrg_double(state);
+    // Transform each component independently: exp(mean + stddev * z).
+    double2 result;
+    result.x = exp(mean + (stddev * normals.x));
+    result.y = exp(mean + (stddev * normals.y));
+    return result;
+}
+/**
+ * \brief Return a log-normally distributed double from an MTGP32 generator.
+ *
+ * Return a single log-normally distributed double derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the MTGP32 generator in \p state,
+ * increment position of generator.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results, and transforms them into
+ * log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStateMtgp32_t *state, double mean, double stddev)
+{
+    // One raw draw from the generator, mapped through the normal inverse CDF.
+    const double z = _curand_normal_icdf_double(curand(state));
+    // Scale and shift in log space, then exponentiate to get the log-normal value.
+    return exp(mean + (stddev * z));
+}
+/**
+ * \brief Return a log-normally distributed double from a Sobol32 generator.
+ *
+ * Return a single log-normally distributed double derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results, and transforms them into
+ * log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStateSobol32_t *state, double mean, double stddev)
+{
+    // One raw draw from the generator, mapped through the normal inverse CDF.
+    const double z = _curand_normal_icdf_double(curand(state));
+    // Scale and shift in log space, then exponentiate to get the log-normal value.
+    return exp(mean + (stddev * z));
+}
+/**
+ * \brief Return a log-normally distributed double from a scrambled Sobol32 generator.
+ *
+ * Return a single log-normally distributed double derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the scrambled Sobol32 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results, and transforms them into
+ * log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol32_t *state, double mean, double stddev)
+{
+    // One raw draw from the generator, mapped through the normal inverse CDF.
+    const double z = _curand_normal_icdf_double(curand(state));
+    // Scale and shift in log space, then exponentiate to get the log-normal value.
+    return exp(mean + (stddev * z));
+}
+/**
+ * \brief Return a log-normally distributed double from a Sobol64 generator.
+ *
+ * Return a single log-normally distributed double derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results, and transforms them into
+ * log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStateSobol64_t *state, double mean, double stddev)
+{
+    // One raw draw from the generator, mapped through the normal inverse CDF.
+    const double z = _curand_normal_icdf_double(curand(state));
+    // Scale and shift in log space, then exponentiate to get the log-normal value.
+    return exp(mean + (stddev * z));
+}
+/**
+ * \brief Return a log-normally distributed double from a scrambled Sobol64 generator.
+ *
+ * Return a single log-normally distributed double derived from a normal
+ * distribution with mean \p mean and standard deviation \p stddev
+ * from the scrambled Sobol64 generator in \p state,
+ * increment position of generator by one.
+ *
+ * The implementation uses the inverse cumulative distribution function
+ * to generate normally distributed results, and transforms them into
+ * log-normal distribution.
+ *
+ * \param state - Pointer to state to update
+ * \param mean   - Mean of the related normal distribution
+ * \param stddev - Standard deviation of the related normal distribution
+ *
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
+ */
+QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol64_t *state, double mean, double stddev)
+{
+    // One raw draw from the generator, mapped through the normal inverse CDF.
+    const double z = _curand_normal_icdf_double(curand(state));
+    // Scale and shift in log space, then exponentiate to get the log-normal value.
+    return exp(mean + (stddev * z));
+}
+#endif // !defined(CURAND_LOGNORMAL_H_)

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h ADDED Viewed

The diff for this file is too large to render. See raw diff

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h ADDED Viewed

The diff for this file is too large to render. See raw diff

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h ADDED Viewed

	@@ -0,0 +1,127 @@

+ /* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+#ifndef CURAND_NORMAL_STATIC_H
+#define CURAND_NORMAL_STATIC_H
+#define QUALIFIERS_STATIC __host__ __device__ __forceinline__
+QUALIFIERS_STATIC float _curand_normal_icdf(unsigned int x)
+{
+#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+    // Multiplier applied to erfcinvf(); its sign flips when x is mirrored.
+    float scale = CURAND_SQRT2;
+    // Mirror the upper part of the input range to avoid loss of precision.
+    if(x > 0x80000000UL) {
+        scale = -scale;
+        x = 0xffffffffUL - x;
+    }
+    // The extra half step keeps p strictly above zero.
+    const float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
+    // p is in (0, 0.5], 2p is in (0, 1]
+    const float two_p = 2.0f * p;
+    return scale * erfcinvf(two_p);
+#else
+    // erfcinvf is not available on this path: always return 0.
+    x++;    //suppress warnings
+    return 0.0f;
+#endif
+}
+QUALIFIERS_STATIC float _curand_normal_icdf(unsigned long long x)
+{
+#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+    // Only the upper 32 bits of the 64-bit input are used.
+    unsigned int top = (unsigned int)(x >> 32);
+    // Multiplier applied to erfcinvf(); its sign flips when top is mirrored.
+    float scale = CURAND_SQRT2;
+    // Mirror the upper part of the input range to avoid loss of precision.
+    if(top > 0x80000000UL) {
+        scale = -scale;
+        top = 0xffffffffUL - top;
+    }
+    // The extra half step keeps p strictly above zero.
+    const float p = top * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
+    // p is in (0, 0.5], 2p is in (0, 1]
+    const float two_p = 2.0f * p;
+    return scale * erfcinvf(two_p);
+#else
+    // erfcinvf is not available on this path: always return 0.
+    x++;    // suppress warnings
+    return 0.0f;
+#endif
+}
+QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned int x)
+{
+#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+    // Multiplier applied to erfcinv(); its sign flips when x is mirrored.
+    double scale = CURAND_SQRT2_DOUBLE;
+    // Mirror the upper part of the input range to avoid loss of precision.
+    if(x > 0x80000000UL) {
+        scale = -scale;
+        x = 0xffffffffUL - x;
+    }
+    // The extra half step keeps p strictly above zero.
+    const double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
+    // p is in (0, 0.5], 2p is in (0, 1]
+    const double two_p = 2.0 * p;
+    return scale * erfcinv(two_p);
+#else
+    // erfcinv is not available on this path: always return 0.
+    x++;    // suppress warnings
+    return 0.0;
+#endif
+}
+QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned long long x)
+{
+#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
+    // Drop the low 11 bits, keeping the top 53 bits of the input.
+    x >>= 11;
+    // Multiplier applied to erfcinv(); its sign flips when x is mirrored.
+    double scale = CURAND_SQRT2_DOUBLE;
+    // Mirror the upper part of the input range to avoid loss of precision.
+    if(x > 0x10000000000000UL) {
+        scale = -scale;
+        x = 0x1fffffffffffffUL - x;
+    }
+    // The extra half step keeps p strictly above zero.
+    const double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
+    // p is in (0, 0.5], 2p is in (0, 1]
+    const double two_p = 2.0 * p;
+    return scale * erfcinv(two_p);
+#else
+    // erfcinv is not available on this path: always return 0.
+    x++;    // suppress warnings
+    return 0.0;
+#endif
+}
+#undef QUALIFIERS_STATIC
+#endif

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h ADDED Viewed

	@@ -0,0 +1,194 @@

+/* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * The source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * The Licensed Deliverables contained herein are PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+/*
+   Copyright 2010-2011, D. E. Shaw Research.
+   All rights reserved.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions, and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions, and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of D. E. Shaw Research nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef CURAND_PHILOX4X32_X__H_
+#define CURAND_PHILOX4X32_X__H_
+#if !defined(QUALIFIERS)
+#define QUALIFIERS static __forceinline__ __device__
+#endif
+/* Per-round key increments: curand_Philox4x32_10 adds these to key.x / key.y
+ * between rounds. */
+#define PHILOX_W32_0   (0x9E3779B9)
+#define PHILOX_W32_1   (0xBB67AE85)
+/* Multipliers applied to ctr.x / ctr.z by _philox4x32round. */
+#define PHILOX_M4x32_0 (0xD2511F53)
+#define PHILOX_M4x32_1 (0xCD9E8D57)
+/* Per-generator state for the Philox4_32_10 generator. */
+struct curandStatePhilox4_32_10 {
+   /* Four-word counter (x lowest, w highest); advanced by Philox_State_Incr
+    * and Philox_State_Incr_hi. */
+   uint4 ctr;
+   /* NOTE(review): presumably the most recent block of generated words -
+    * confirm against the code that fills it. */
+   uint4 output;
+   /* NOTE(review): presumably the key passed to curand_Philox4x32_10 -
+    * confirm at the call sites. */
+   uint2 key;
+   /* NOTE(review): usage is not visible in this header - presumably an index
+    * into output; confirm. */
+   unsigned int STATE;
+   /* NOTE(review): presumably the single-precision counterpart of
+    * boxmuller_flag_double - confirm. */
+   int boxmuller_flag;
+   /* Set to EXTRA_FLAG_LOG_NORMAL by curand_log_normal_double when
+    * boxmuller_extra_double holds a cached second result; reset to 0 when
+    * that result is consumed. */
+   int boxmuller_flag_double;
+   /* NOTE(review): presumably the cached single-precision second result -
+    * confirm. */
+   float boxmuller_extra;
+   /* Cached second result of a double-precision Box-Muller pair. */
+   double boxmuller_extra_double;
+};
+typedef struct curandStatePhilox4_32_10 curandStatePhilox4_32_10_t;
+/*
+ * Advance the counter in s->ctr by the 64-bit amount n.
+ *
+ * The counter is treated as one multi-word unsigned integer (x lowest word,
+ * w highest). n is split into two 32-bit halves that are added to ctr.x and
+ * ctr.y; any carry out of ctr.y ripples into ctr.z and then ctr.w.
+ *
+ * The carry out of ctr.x is tracked separately from the high half of n.
+ * Folding it into the high half first (nhi++) wraps to 0 when the high half
+ * is 0xFFFFFFFF, which silently drops the carry into ctr.z.
+ *
+ * s - state whose counter is updated in place
+ * n - number of counter steps to advance
+ */
+QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s, unsigned long long n)
+{
+   unsigned int nlo = (unsigned int)(n);
+   unsigned int nhi = (unsigned int)(n>>32);
+   // Word 0: unsigned wrap-around signals a carry into word 1.
+   s->ctr.x += nlo;
+   unsigned int carry_lo = (s->ctr.x < nlo) ? 1u : 0u;
+   // Word 1: add the high half and the incoming carry as two separate steps
+   // so that each overflow can be detected. At most one of them can overflow.
+   s->ctr.y += nhi;
+   unsigned int carry_hi = (s->ctr.y < nhi) ? 1u : 0u;
+   s->ctr.y += carry_lo;
+   if(s->ctr.y < carry_lo)
+      carry_hi = 1u;
+   if(!carry_hi)
+      return;
+   // Ripple the carry through the two upper words.
+   if(++s->ctr.z) return;
+   ++s->ctr.w;
+}
+QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_t* s, unsigned long long n)
+{
+   // Split the 64-bit increment into the words added to ctr.z and ctr.w.
+   const unsigned int low = (unsigned int)(n);
+   unsigned int high = (unsigned int)(n>>32);
+   s->ctr.z += low;
+   // Unsigned wrap-around of ctr.z means one extra unit goes into ctr.w.
+   high += (s->ctr.z < low) ? 1u : 0u;
+   s->ctr.w += high;
+}
+QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s)
+{
+   // Ripple-carry increment by one: the short-circuit && moves on to the next
+   // word only when the current word has just wrapped around to zero.
+   if(++s->ctr.x == 0 && ++s->ctr.y == 0 && ++s->ctr.z == 0) {
+      ++s->ctr.w;
+   }
+}
+QUALIFIERS unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* hip)
+{
+#ifdef __CUDA_ARCH__
+   // Device code: the intrinsic yields the high word, a plain multiply the low word.
+   *hip = __umulhi(a,b);
+   return a*b;
+#else
+   // Host code: form the full 64-bit product, then split it into two words.
+   const unsigned long long wide = (unsigned long long)a * (unsigned long long)b;
+   *hip = (unsigned int)(wide >> 32);
+   return (unsigned int)wide;
+#endif
+}
+QUALIFIERS uint4 _philox4x32round(uint4 ctr, uint2 key)
+{
+   // Two 32x32 -> 64 bit multiplies; each yields a high and a low word.
+   unsigned int hi0, hi1;
+   const unsigned int lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);
+   const unsigned int lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);
+   // Mix the high words with the untouched counter words and the key, and
+   // place the low words in the remaining two slots.
+   uint4 mixed;
+   mixed.x = hi1^ctr.y^key.x;
+   mixed.y = lo1;
+   mixed.z = hi0^ctr.w^key.y;
+   mixed.w = lo0;
+   return mixed;
+}
+QUALIFIERS uint4 curand_Philox4x32_10( uint4 c, uint2 k)
+{
+   // Rounds 1-9: each round is followed by a key bump.
+   for(int round = 0; round < 9; ++round) {
+      c = _philox4x32round(c, k);
+      k.x += PHILOX_W32_0;
+      k.y += PHILOX_W32_1;
+   }
+   // Round 10: the key is not advanced after the final round.
+   return _philox4x32round(c, k);
+}
+#endif

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h ADDED Viewed

The diff for this file is too large to render. See raw diff

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h ADDED Viewed

	@@ -0,0 +1,164 @@

+/*
+* Copyright 2009-2017  NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+#ifndef NVTOOLSEXT_CUDA_H_
+#define NVTOOLSEXT_CUDA_H_
+#include "cuda.h"
+#include "nvToolsExt.h"
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+/* ========================================================================= */
+/** \name Functions for CUDA Resource Naming
+*/
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
+ *
+ * This section covers the API functions that allow to annotate CUDA resources
+ * with user-provided names.
+ *
+ * @{
+ */
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN
+* \brief Used to build a non-colliding value for resource types separated class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_CUDA  4
+/** \endcond */
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for CUDA
+*/
+typedef enum nvtxResourceCUDAType_t
+{
+    /* Every value is built with NVTX_RESOURCE_MAKE_TYPE(CUDA, n); the
+     * trailing comment on each enumerator names the corresponding CUDA
+     * handle type. */
+    NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
+    NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
+    NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
+    NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4)  /* CUevent */
+} nvtxResourceCUDAType_t;
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA device.
+ *
+ * Allows the user to associate a CUDA device with a user-provided name.
+ *
+ * \param device - The handle of the CUDA device to name.
+ * \param name   - The name of the CUDA device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA context.
+ *
+ * Allows the user to associate a CUDA context with a user-provided name.
+ *
+ * \param context - The handle of the CUDA context to name.
+ * \param name    - The name of the CUDA context.
+ *
+ * \par Example:
+ * \code
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
+ * if ( CUDA_SUCCESS != status )
+ *     goto Error;
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA stream.
+ *
+ * Allows the user to associate a CUDA stream with a user-provided name.
+ *
+ * \param stream - The handle of the CUDA stream to name.
+ * \param name   - The name of the CUDA stream.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates a CUDA event.
+ *
+ * Allows the user to associate a CUDA event with a user-provided name.
+ *
+ * \param event - The handle of the CUDA event to name.
+ * \param name  - The name of the CUDA event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
+/** @} */
+/** @} */ /* END RESOURCE_NAMING */
+/* ========================================================================= */
+/* Map the unsuffixed names onto the wchar_t (W) variants when UNICODE is
+ * defined, and onto the char (A) variants otherwise. */
+#ifdef UNICODE
+  #define nvtxNameCuDevice   nvtxNameCuDeviceW
+  #define nvtxNameCuContext  nvtxNameCuContextW
+  #define nvtxNameCuStream   nvtxNameCuStreamW
+  #define nvtxNameCuEvent    nvtxNameCuEventW
+#else
+  #define nvtxNameCuDevice   nvtxNameCuDeviceA
+  #define nvtxNameCuContext  nvtxNameCuContextA
+  #define nvtxNameCuStream   nvtxNameCuStreamA
+  #define nvtxNameCuEvent    nvtxNameCuEventA
+#endif
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* NVTOOLSEXT_CUDA_H_ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h ADDED Viewed

	@@ -0,0 +1,214 @@

+/*
+* Copyright 2009-2017 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+#ifndef NVTOOLSEXT_OPENCL_H_
+#define NVTOOLSEXT_OPENCL_H_
+#include <CL/cl.h>
+#include "nvToolsExt.h"
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+/* ========================================================================= */
+/** \name Functions for OpenCL Resource Naming
+ */
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
+ *
+ * This section covers the API functions that allow to annotate OpenCL resources
+ * with user-provided names.
+ *
+ * @{
+ */
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN
+* \brief Used to build a non-colliding value for resource types, separated by class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_OPENCL 6
+/** \endcond */
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for OpenCL
+*/
+typedef enum nvtxResourceOpenCLType_t
+{
+    NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
+    NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
+    NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
+    NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
+    NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
+    NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
+    NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7)
+} nvtxResourceOpenCLType_t;
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL device.
+ *
+ * Allows to associate an OpenCL device with a user-provided name.
+ *
+ * \param device - The handle of the OpenCL device to name.
+ * \param name   - The name of the OpenCL device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL context.
+ *
+ * Allows to associate an OpenCL context with a user-provided name.
+ *
+ * \param context - The handle of the OpenCL context to name.
+ * \param name    - The name of the OpenCL context.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL command queue.
+ *
+ * Allows to associate an OpenCL command queue with a user-provided name.
+ *
+ * \param command_queue - The handle of the OpenCL command queue to name.
+ * \param name          - The name of the OpenCL command queue.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL memory object.
+ *
+ * Allows to associate an OpenCL memory object with a user-provided name.
+ *
+ * \param memobj - The handle of the OpenCL memory object to name.
+ * \param name   - The name of the OpenCL memory object.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL sampler.
+ *
+ * Allows to associate an OpenCL sampler with a user-provided name.
+ *
+ * \param sampler - The handle of the OpenCL sampler to name.
+ * \param name    - The name of the OpenCL sampler.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL program.
+ *
+ * Allows to associate an OpenCL program with a user-provided name.
+ *
+ * \param program - The handle of the OpenCL program to name.
+ * \param name    - The name of the OpenCL program.
+ *
+ * \code
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
+ *     (const char **) &cSourceCL, &program_length, &ciErrNum);
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+ * nvtxNameClProgramW(cpProgram, L"PROGRAM_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL event.
+ *
+ * Allows to associate an OpenCL event with a user-provided name.
+ *
+ * \param evnt - The handle of the OpenCL event to name.
+ * \param name - The name of the OpenCL event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
+/** @} */
+/** @} */ /* END RESOURCE_NAMING */
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameClDevice        nvtxNameClDeviceW
+  #define nvtxNameClContext       nvtxNameClContextW
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueW
+  #define nvtxNameClMemObject     nvtxNameClMemObjectW
+  #define nvtxNameClSampler       nvtxNameClSamplerW
+  #define nvtxNameClProgram       nvtxNameClProgramW
+  #define nvtxNameClEvent         nvtxNameClEventW
+#else
+  #define nvtxNameClDevice        nvtxNameClDeviceA
+  #define nvtxNameClContext       nvtxNameClContextA
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueA
+  #define nvtxNameClMemObject     nvtxNameClMemObjectA
+  #define nvtxNameClSampler       nvtxNameClSamplerA
+  #define nvtxNameClProgram       nvtxNameClProgramA
+  #define nvtxNameClEvent         nvtxNameClEventA
+#endif
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* NVTOOLSEXT_OPENCL_H_ */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h ADDED Viewed

	@@ -0,0 +1,220 @@

+/*
+* Copyright 2009-2016  NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO USER:
+*
+* This source code is subject to NVIDIA ownership rights under U.S. and
+* international Copyright laws.
+*
+* This software and the information contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+* of a form of NVIDIA software license agreement.
+*
+* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+* OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+* OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+* OR PERFORMANCE OF THIS SOURCE CODE.
+*
+* U.S. Government End Users.   This source code is a "commercial item" as
+* that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+* "commercial computer  software"  and "commercial computer software
+* documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+* and is provided to the U.S. Government only as a commercial end item.
+* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+* source code with only those rights set forth herein.
+*
+* Any use of this source code in individual and commercial software must
+* include, in the user documentation and internal comments to the code,
+* the above Disclaimer and U.S. Government End Users Notice.
+*/
+#include "nvToolsExt.h"
+#include <CL/cl.h>
+#ifndef NVTOOLSEXT_OPENCL_V3
+#define NVTOOLSEXT_OPENCL_V3
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+/* ========================================================================= */
+/** \name Functions for OpenCL Resource Naming
+ */
+/** \addtogroup RESOURCE_NAMING
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
+ *
+ * This section covers the API functions that allow to annotate OpenCL resources
+ * with user-provided names.
+ *
+ * @{
+ */
+/*  ------------------------------------------------------------------------- */
+/* \cond SHOW_HIDDEN
+* \brief Used to build a non-colliding value for resource types, separated by class
+* \version \NVTX_VERSION_2
+*/
+#define NVTX_RESOURCE_CLASS_OPENCL 6
+/** \endcond */
+/*  ------------------------------------------------------------------------- */
+/** \brief Resource types for OpenCL
+*/
+typedef enum nvtxResourceOpenCLType_t
+{
+    NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
+    NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
+    NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
+    NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
+    NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
+    NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
+    NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
+} nvtxResourceOpenCLType_t;
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL device.
+ *
+ * Allows to associate an OpenCL device with a user-provided name.
+ *
+ * \param device - The handle of the OpenCL device to name.
+ * \param name   - The name of the OpenCL device.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL context.
+ *
+ * Allows to associate an OpenCL context with a user-provided name.
+ *
+ * \param context - The handle of the OpenCL context to name.
+ * \param name    - The name of the OpenCL context.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL command queue.
+ *
+ * Allows to associate an OpenCL command queue with a user-provided name.
+ *
+ * \param command_queue - The handle of the OpenCL command queue to name.
+ * \param name          - The name of the OpenCL command queue.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL memory object.
+ *
+ * Allows to associate an OpenCL memory object with a user-provided name.
+ *
+ * \param memobj - The handle of the OpenCL memory object to name.
+ * \param name   - The name of the OpenCL memory object.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL sampler.
+ *
+ * Allows to associate an OpenCL sampler with a user-provided name.
+ *
+ * \param sampler - The handle of the OpenCL sampler to name.
+ * \param name    - The name of the OpenCL sampler.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL program.
+ *
+ * Allows to associate an OpenCL program with a user-provided name.
+ *
+ * \param program - The handle of the OpenCL program to name.
+ * \param name    - The name of the OpenCL program.
+ *
+ * \code
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
+ *     (const char **) &cSourceCL, &program_length, &ciErrNum);
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+ * nvtxNameClProgramW(cpProgram, L"PROGRAM_NAME");
+ * \endcode
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
+/** @} */
+/* ------------------------------------------------------------------------- */
+/** \brief Annotates an OpenCL event.
+ *
+ * Allows to associate an OpenCL event with a user-provided name.
+ *
+ * \param evnt - The handle of the OpenCL event to name.
+ * \param name - The name of the OpenCL event.
+ *
+ * \version \NVTX_VERSION_1
+ * @{ */
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
+/** @} */
+/** @} */ /* END RESOURCE_NAMING */
+/* ========================================================================= */
+#ifdef UNICODE
+  #define nvtxNameClDevice        nvtxNameClDeviceW
+  #define nvtxNameClContext       nvtxNameClContextW
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueW
+  #define nvtxNameClMemObject     nvtxNameClMemObjectW
+  #define nvtxNameClSampler       nvtxNameClSamplerW
+  #define nvtxNameClProgram       nvtxNameClProgramW
+  #define nvtxNameClEvent         nvtxNameClEventW
+#else
+  #define nvtxNameClDevice        nvtxNameClDeviceA
+  #define nvtxNameClContext       nvtxNameClContextA
+  #define nvtxNameClCommandQueue  nvtxNameClCommandQueueA
+  #define nvtxNameClMemObject     nvtxNameClMemObjectA
+  #define nvtxNameClSampler       nvtxNameClSamplerA
+  #define nvtxNameClProgram       nvtxNameClProgramA
+  #define nvtxNameClEvent         nvtxNameClEventA
+#endif
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#ifndef NVTX_NO_IMPL
+#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */
+#include "nvtxDetail/nvtxImplOpenCL_v3.h"
+#undef NVTX_IMPL_GUARD_OPENCL
+#endif /*NVTX_NO_IMPL*/
+#endif /* NVTOOLSEXT_OPENCL_V3 */

tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/WHEEL ADDED Viewed

	@@ -0,0 +1,5 @@

+Wheel-Version: 1.0
+Generator: setuptools (74.1.2)
+Root-Is-Purelib: true
+Tag: py3-none-any

tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11-2.13.6.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ pybind11

tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .pyximport import *
+# replicate docstring
+from .pyximport import __doc__

tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/_pyximport2.cpython-311.pyc ADDED Viewed

Binary file (28.6 kB). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/__pycache__/pyxbuild.cpython-311.pyc ADDED Viewed

Binary file (7.1 kB). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/pyximport/_pyximport3.py ADDED Viewed

	@@ -0,0 +1,478 @@

+"""
+Import hooks; when installed with the install() function, these hooks
+allow importing .pyx files as if they were Python modules.
+If you want the hook installed every time you run Python
+you can add it to your Python version by adding these lines to
+sitecustomize.py (which you can create from scratch in site-packages
+if it doesn't exist there or somewhere else on your python path)::
+    import pyximport
+    pyximport.install()
+For instance on the Mac with a non-system Python 2.3, you could create
+sitecustomize.py with only those two lines at
+/usr/local/lib/python2.3/site-packages/sitecustomize.py .
+A custom distutils.core.Extension instance and setup() args
+(Distribution) for the build can be defined by a <modulename>.pyxbld
+file like:
+# examplemod.pyxbld
+def make_ext(modname, pyxfilename):
+    from distutils.extension import Extension
+    return Extension(name = modname,
+                     sources=[pyxfilename, 'hello.c'],
+                     include_dirs=['/myinclude'] )
+def make_setup_args():
+    return dict(script_args=["--compiler=mingw32"])
+Extra dependencies can be defined by a <modulename>.pyxdep .
+See README.
+Since Cython 0.11, the :mod:`pyximport` module also has experimental
+compilation support for normal Python modules.  This allows you to
+automatically run Cython on every .pyx and .py module that Python
+imports, including parts of the standard library and installed
+packages.  Cython will still fail to compile a lot of Python modules,
+in which case the import mechanism will fall back to loading the
+Python source modules instead.  The .py import mechanism is installed
+like this::
+    pyximport.install(pyimport = True)
+Running this module as a top-level script will run a test and then print
+the documentation.
+"""
+import glob
+import importlib
+import os
+import sys
+from importlib.abc import MetaPathFinder
+from importlib.machinery import ExtensionFileLoader, SourceFileLoader
+from importlib.util import spec_from_file_location
+mod_name = "pyximport"
+PY_EXT = ".py"
+PYX_EXT = ".pyx"
+PYXDEP_EXT = ".pyxdep"
+PYXBLD_EXT = ".pyxbld"
+DEBUG_IMPORT = False
+def _print(message, args):
+    if args:
+        message = message % args
+    print(message)
+def _debug(message, *args):
+    if DEBUG_IMPORT:
+        _print(message, args)
+def _info(message, *args):
+    _print(message, args)
+def load_source(file_path):
+    import importlib.util
+    from importlib.machinery import SourceFileLoader
+    spec = importlib.util.spec_from_file_location("XXXX", file_path, loader=SourceFileLoader("XXXX", file_path))
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+def get_distutils_extension(modname, pyxfilename, language_level=None):
+#    try:
+#        import hashlib
+#    except ImportError:
+#        import md5 as hashlib
+#    extra = "_" + hashlib.md5(open(pyxfilename).read()).hexdigest()
+#    modname = modname + extra
+    extension_mod,setup_args = handle_special_build(modname, pyxfilename)
+    if not extension_mod:
+        if not isinstance(pyxfilename, str):
+            # distutils is stupid in Py2 and requires exactly 'str'
+            # => encode accidentally coerced unicode strings back to str
+            pyxfilename = pyxfilename.encode(sys.getfilesystemencoding())
+        from distutils.extension import Extension
+        extension_mod = Extension(name = modname, sources=[pyxfilename])
+        if language_level is not None:
+            extension_mod.cython_directives = {'language_level': language_level}
+    return extension_mod,setup_args
+def handle_special_build(modname, pyxfilename):
+    special_build = os.path.splitext(pyxfilename)[0] + PYXBLD_EXT
+    ext = None
+    setup_args={}
+    if os.path.exists(special_build):
+        # globls = {}
+        # locs = {}
+        # execfile(special_build, globls, locs)
+        # ext = locs["make_ext"](modname, pyxfilename)
+        mod = load_source(special_build)
+        make_ext = getattr(mod,'make_ext',None)
+        if make_ext:
+            ext = make_ext(modname, pyxfilename)
+            assert ext and ext.sources, "make_ext in %s did not return Extension" % special_build
+        make_setup_args = getattr(mod, 'make_setup_args',None)
+        if make_setup_args:
+            setup_args = make_setup_args()
+            assert isinstance(setup_args,dict), ("make_setup_args in %s did not return a dict"
+                                         % special_build)
+        assert ext or setup_args, ("neither make_ext nor make_setup_args %s"
+                                         % special_build)
+        ext.sources = [os.path.join(os.path.dirname(special_build), source)
+                       for source in ext.sources]
+    return ext, setup_args
+def handle_dependencies(pyxfilename):
+    testing = '_test_files' in globals()
+    dependfile = os.path.splitext(pyxfilename)[0] + PYXDEP_EXT
+    # by default let distutils decide whether to rebuild on its own
+    # (it has a better idea of what the output file will be)
+    # but we know more about dependencies so force a rebuild if
+    # some of the dependencies are newer than the pyxfile.
+    if os.path.exists(dependfile):
+        with open(dependfile) as fid:
+            depends = fid.readlines()
+        depends = [depend.strip() for depend in depends]
+        # gather dependencies in the "files" variable
+        # the dependency file is itself a dependency
+        files = [dependfile]
+        for depend in depends:
+            fullpath = os.path.join(os.path.dirname(dependfile),
+                                    depend)
+            files.extend(glob.glob(fullpath))
+        # only for unit testing to see we did the right thing
+        if testing:
+            _test_files[:] = []  #$pycheck_no
+        # if any file that the pyxfile depends upon is newer than
+        # the pyx file, 'touch' the pyx file so that distutils will
+        # be tricked into rebuilding it.
+        for file in files:
+            from distutils.dep_util import newer
+            if newer(file, pyxfilename):
+                _debug("Rebuilding %s because of %s", pyxfilename, file)
+                filetime = os.path.getmtime(file)
+                os.utime(pyxfilename, (filetime, filetime))
+                if testing:
+                    _test_files.append(file)
+def build_module(name, pyxfilename, pyxbuild_dir=None, inplace=False, language_level=None):
+    assert os.path.exists(pyxfilename), "Path does not exist: %s" % pyxfilename
+    handle_dependencies(pyxfilename)
+    extension_mod, setup_args = get_distutils_extension(name, pyxfilename, language_level)
+    build_in_temp = pyxargs.build_in_temp
+    sargs = pyxargs.setup_args.copy()
+    sargs.update(setup_args)
+    build_in_temp = sargs.pop('build_in_temp',build_in_temp)
+    from . import pyxbuild
+    olddir = os.getcwd()
+    common = ''
+    if pyxbuild_dir and sys.platform == 'win32':
+        # Windows concatenates the pyxbuild_dir to the pyxfilename when
+        # compiling, and then complains that the filename is too long
+        common = os.path.commonprefix([pyxbuild_dir, pyxfilename])
+    if len(common) > 30:
+        pyxfilename = os.path.relpath(pyxfilename, common)
+        pyxbuild_dir = os.path.relpath(pyxbuild_dir, common)
+        os.chdir(common)
+    try:
+        so_path = pyxbuild.pyx_to_dll(pyxfilename, extension_mod,
+                                      build_in_temp=build_in_temp,
+                                      pyxbuild_dir=pyxbuild_dir,
+                                      setup_args=sargs,
+                                      inplace=inplace,
+                                      reload_support=pyxargs.reload_support)
+    finally:
+        os.chdir(olddir)
+    so_path = os.path.join(common, so_path)
+    assert os.path.exists(so_path), "Cannot find: %s" % so_path
+    junkpath = os.path.join(os.path.dirname(so_path), name+"_*")  #very dangerous with --inplace ? yes, indeed, trying to eat my files ;)
+    junkstuff = glob.glob(junkpath)
+    for path in junkstuff:
+        if path != so_path:
+            try:
+                os.remove(path)
+            except IOError:
+                _info("Couldn't remove %s", path)
+    return so_path
+# import hooks
+class PyxImportMetaFinder(MetaPathFinder):
+    def __init__(self, extension=PYX_EXT, pyxbuild_dir=None, inplace=False, language_level=None):
+        self.pyxbuild_dir = pyxbuild_dir
+        self.inplace = inplace
+        self.language_level = language_level
+        self.extension = extension
+    def find_spec(self, fullname, path, target=None):
+        if not path:
+            path = [os.getcwd()]  # top level import --
+        if "." in fullname:
+            *parents, name = fullname.split(".")
+        else:
+            name = fullname
+        for entry in path:
+            if os.path.isdir(os.path.join(entry, name)):
+                # this module has child modules
+                filename = os.path.join(entry, name, "__init__" + self.extension)
+                submodule_locations = [os.path.join(entry, name)]
+            else:
+                filename = os.path.join(entry, name + self.extension)
+                submodule_locations = None
+            if not os.path.exists(filename):
+                continue
+            return spec_from_file_location(
+                fullname, filename,
+                loader=PyxImportLoader(filename, self.pyxbuild_dir, self.inplace, self.language_level),
+                submodule_search_locations=submodule_locations)
+        return None  # we don't know how to import this
+class PyImportMetaFinder(MetaPathFinder):
+    def __init__(self, extension=PY_EXT, pyxbuild_dir=None, inplace=False, language_level=None):
+        self.pyxbuild_dir = pyxbuild_dir
+        self.inplace = inplace
+        self.language_level = language_level
+        self.extension = extension
+        self.uncompilable_modules = {}
+        self.blocked_modules = ['Cython', 'pyxbuild', 'pyximport.pyxbuild',
+                                'distutils', 'cython']
+        self.blocked_packages = ['Cython.', 'distutils.']
+    def find_spec(self, fullname, path, target=None):
+        if fullname in sys.modules:
+            return None
+        if any([fullname.startswith(pkg) for pkg in self.blocked_packages]):
+            return None
+        if fullname in self.blocked_modules:
+            # prevent infinite recursion
+            return None
+        self.blocked_modules.append(fullname)
+        name = fullname
+        if not path:
+            path = [os.getcwd()]  # top level import --
+        try:
+            for entry in path:
+                if os.path.isdir(os.path.join(entry, name)):
+                    # this module has child modules
+                    filename = os.path.join(entry, name, "__init__" + self.extension)
+                    submodule_locations = [os.path.join(entry, name)]
+                else:
+                    filename = os.path.join(entry, name + self.extension)
+                    submodule_locations = None
+                if not os.path.exists(filename):
+                    continue
+                return spec_from_file_location(
+                    fullname, filename,
+                    loader=PyxImportLoader(filename, self.pyxbuild_dir, self.inplace, self.language_level),
+                    submodule_search_locations=submodule_locations)
+        finally:
+            self.blocked_modules.pop()
+        return None  # we don't know how to import this
+class PyxImportLoader(ExtensionFileLoader):
+    def __init__(self, filename, pyxbuild_dir, inplace, language_level):
+        module_name = os.path.splitext(os.path.basename(filename))[0]
+        super().__init__(module_name, filename)
+        self._pyxbuild_dir = pyxbuild_dir
+        self._inplace = inplace
+        self._language_level = language_level
+    def create_module(self, spec):
+        try:
+            so_path = build_module(spec.name, pyxfilename=spec.origin, pyxbuild_dir=self._pyxbuild_dir,
+                                   inplace=self._inplace, language_level=self._language_level)
+            self.path = so_path
+            spec.origin = so_path
+            return super().create_module(spec)
+        except Exception as failure_exc:
+            _debug("Failed to load extension module: %r" % failure_exc)
+            if pyxargs.load_py_module_on_import_failure and spec.origin.endswith(PY_EXT):
+                spec = importlib.util.spec_from_file_location(spec.name, spec.origin,
+                                                              loader=SourceFileLoader(spec.name, spec.origin))
+                mod = importlib.util.module_from_spec(spec)
+                assert mod.__file__ in (spec.origin, spec.origin + 'c', spec.origin + 'o'), (mod.__file__, spec.origin)
+                return mod
+            else:
+                tb = sys.exc_info()[2]
+                import traceback
+                exc = ImportError("Building module %s failed: %s" % (
+                    spec.name, traceback.format_exception_only(*sys.exc_info()[:2])))
+                raise exc.with_traceback(tb)
+    def exec_module(self, module):
+        try:
+            return super().exec_module(module)
+        except Exception as failure_exc:
+            import traceback
+            _debug("Failed to load extension module: %r" % failure_exc)
+            raise ImportError("Executing module %s failed %s" % (
+                    module.__file__, traceback.format_exception_only(*sys.exc_info()[:2])))
+#install args
+class PyxArgs(object):
+    build_dir=True
+    build_in_temp=True
+    setup_args={}   #None
+def _have_importers():
+    has_py_importer = False
+    has_pyx_importer = False
+    for importer in sys.meta_path:
+        if isinstance(importer, PyxImportMetaFinder):
+            if isinstance(importer, PyImportMetaFinder):
+                has_py_importer = True
+            else:
+                has_pyx_importer = True
+    return has_py_importer, has_pyx_importer
+def install(pyximport=True, pyimport=False, build_dir=None, build_in_temp=True,
+            setup_args=None, reload_support=False,
+            load_py_module_on_import_failure=False, inplace=False,
+            language_level=None):
+    """ Main entry point for pyxinstall.
+    Call this to install the ``.pyx`` import hook in
+    your meta-path for a single Python process.  If you want it to be
+    installed whenever you use Python, add it to your ``sitecustomize``
+    (as described above).
+    :param pyximport: If set to False, does not try to import ``.pyx`` files.
+    :param pyimport: You can pass ``pyimport=True`` to also
+        install the ``.py`` import hook
+        in your meta-path.  Note, however, that it is rather experimental,
+        will not work at all for some ``.py`` files and packages, and will
+        heavily slow down your imports due to search and compilation.
+        Use at your own risk.
+    :param build_dir: By default, compiled modules will end up in a ``.pyxbld``
+        directory in the user's home directory.  Passing a different path
+        as ``build_dir`` will override this.
+    :param build_in_temp: If ``False``, will produce the C files locally. Working
+        with complex dependencies and debugging becomes more easy. This
+        can principally interfere with existing files of the same name.
+    :param setup_args: Dict of arguments for Distribution.
+        See ``distutils.core.setup()``.
+    :param reload_support: Enables support for dynamic
+        ``reload(my_module)``, e.g. after a change in the Cython code.
+        Additional files ``<so_path>.reloadNN`` may arise on that account, when
+        the previously loaded module file cannot be overwritten.
+    :param load_py_module_on_import_failure: If the compilation of a ``.py``
+        file succeeds, but the subsequent import fails for some reason,
+        retry the import with the normal ``.py`` module instead of the
+        compiled module.  Note that this may lead to unpredictable results
+        for modules that change the system state during their import, as
+        the second import will rerun these modifications in whatever state
+        the system was left after the import of the compiled module
+        failed.
+    :param inplace: Install the compiled module
+        (``.so`` for Linux and Mac / ``.pyd`` for Windows)
+        next to the source file.
+    :param language_level: The source language level to use: 2 or 3.
+        The default is to use the language level of the current Python
+        runtime for .py files and Py2 for ``.pyx`` files.
+    """
+    if setup_args is None:
+        setup_args = {}
+    if not build_dir:
+        build_dir = os.path.join(os.path.expanduser('~'), '.pyxbld')
+    global pyxargs
+    pyxargs = PyxArgs()  #$pycheck_no
+    pyxargs.build_dir = build_dir
+    pyxargs.build_in_temp = build_in_temp
+    pyxargs.setup_args = (setup_args or {}).copy()
+    pyxargs.reload_support = reload_support
+    pyxargs.load_py_module_on_import_failure = load_py_module_on_import_failure
+    has_py_importer, has_pyx_importer = _have_importers()
+    py_importer, pyx_importer = None, None
+    if pyimport and not has_py_importer:
+        py_importer = PyImportMetaFinder(pyxbuild_dir=build_dir, inplace=inplace,
+                                         language_level=language_level)
+        # make sure we import Cython before we install the import hook
+        import Cython.Compiler.Main, Cython.Compiler.Pipeline, Cython.Compiler.Optimize
+        sys.meta_path.insert(0, py_importer)
+    if pyximport and not has_pyx_importer:
+        pyx_importer = PyxImportMetaFinder(pyxbuild_dir=build_dir, inplace=inplace,
+                                           language_level=language_level)
+        sys.meta_path.append(pyx_importer)
+    return py_importer, pyx_importer
+def uninstall(py_importer, pyx_importer):
+    """
+    Uninstall an import hook.
+    """
+    try:
+        sys.meta_path.remove(py_importer)
+    except ValueError:
+        pass
+    try:
+        sys.meta_path.remove(pyx_importer)
+    except ValueError:
+        pass
+# MAIN
+def show_docs():
+    import __main__
+    __main__.__name__ = mod_name
+    for name in dir(__main__):
+        item = getattr(__main__, name)
+        try:
+            setattr(item, "__module__", mod_name)
+        except (AttributeError, TypeError):
+            pass
+    help(__main__)
+# When run as a script, show this module's documentation via show_docs().
+if __name__ == '__main__':
+    show_docs()

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_VF.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""
+This makes the functions in torch._C._VariableFunctions available as
+    torch._VF.<funcname>
+without mypy being able to find them.
+A subset of those functions are mapped to ATen functions in
+torch/jit/_builtins.py
+See https://github.com/pytorch/pytorch/issues/21478 for the reason for
+introducing torch._VF
+"""
+import sys
+import types
+import torch
+class VFModule(types.ModuleType):
+    vf: types.ModuleType
+    def __init__(self, name):
+        super().__init__(name)
+        self.vf = torch._C._VariableFunctions
+    def __getattr__(self, attr):
+        return getattr(self.vf, attr)
+# Replace this module's entry in sys.modules with a VFModule instance, so that
+# attribute access on the imported module goes through VFModule.__getattr__.
+sys.modules[__name__] = VFModule(__name__)

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_classes.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import types
+import torch._C
+class _ClassNamespace(types.ModuleType):
+    def __init__(self, name):
+        super().__init__("torch.classes" + name)
+        self.name = name
+    def __getattr__(self, attr):
+        proxy = torch._C._get_custom_class_python_wrapper(self.name, attr)
+        if proxy is None:
+            raise RuntimeError(f"Class {self.name}.{attr} not registered!")
+        return proxy
+class _Classes(types.ModuleType):
+    __file__ = "_classes.py"
+    def __init__(self):
+        super().__init__("torch.classes")
+    def __getattr__(self, name):
+        namespace = _ClassNamespace(name)
+        setattr(self, name, namespace)
+        return namespace
+    @property
+    def loaded_libraries(self):
+        return torch.ops.loaded_libraries
+    def load_library(self, path):
+        """
+        Loads a shared library from the given path into the current process.
+        The library being loaded may run global initialization code to register
+        custom classes with the PyTorch JIT runtime. This allows dynamically
+        loading custom classes. For this, you should compile your class
+        and the static registration code into a shared library object, and then
+        call ``torch.classes.load_library('path/to/libcustom.so')`` to load the
+        shared object.
+        After the library is loaded, it is added to the
+        ``torch.classes.loaded_libraries`` attribute, a set that may be inspected
+        for the paths of all libraries loaded using this function.
+        Args:
+            path (str): A path to a shared library to load.
+        """
+        torch.ops.load_library(path)
+# The classes "namespace": a single module-level _Classes instance.
+classes = _Classes()

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_deploy.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import io
+import torch
+from torch.package import Importer, OrderedImporter, PackageImporter, sys_importer
+from torch.package._package_pickler import create_pickler
+from torch.package._package_unpickler import PackageUnpickler
+from torch.serialization import _maybe_decode_ascii
+def _save_storages(importer, obj):
+    serialized_storages = []
+    serialized_dtypes = []
+    importer = importer if isinstance(importer, torch.package.PackageImporter) else None
+    importers: Importer
+    if importer is not None:
+        importers = OrderedImporter(importer, sys_importer)
+    else:
+        importers = sys_importer
+    def persistent_id(obj):
+        if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage):
+            if isinstance(obj, torch.storage.TypedStorage):
+                # TODO: Once we decide to break serialization FC, we can
+                # remove this case
+                storage = obj._untyped_storage
+                dtype = obj.dtype
+            else:
+                storage = obj
+                dtype = torch.uint8
+            serialized_storages.append(obj)
+            serialized_dtypes.append(dtype)
+            return ("storage", len(serialized_storages) - 1)
+        if hasattr(obj, "__reduce_deploy__"):
+            if _serialized_reduces.get(id(obj)) is None:
+                _serialized_reduces[id(obj)] = (
+                    "reduce_deploy",
+                    id(obj),
+                    *obj.__reduce_deploy__(importers),
+                )
+            return _serialized_reduces[id(obj)]
+        return None
+    # Write the pickle data for `obj`
+    data_buf = io.BytesIO()
+    pickler = create_pickler(data_buf, importers)
+    pickler.persistent_id = persistent_id
+    pickler.dump(obj)
+    data_value = data_buf.getvalue()
+    return (
+        data_value,
+        serialized_storages,
+        serialized_dtypes,
+        importer.zip_reader if importer else None,
+    )
+def _load_storages(id, zip_reader, obj_bytes, serialized_storages, serialized_dtypes):
+    def persistent_load(saved_id):
+        assert isinstance(saved_id, tuple)
+        typename = _maybe_decode_ascii(saved_id[0])
+        data = saved_id[1:]
+        if typename == "storage":
+            # TODO: Once we decide to break serialization FC, we can
+            # stop wrapping with TypedStorage
+            storage = serialized_storages[data[0]]
+            dtype = serialized_dtypes[data[0]]
+            return torch.storage.TypedStorage(
+                wrap_storage=storage.untyped(), dtype=dtype
+            )
+        if typename == "reduce_deploy":
+            reduce_id, func, args = data
+            if reduce_id not in _loaded_reduces:
+                _loaded_reduces[reduce_id] = func(_raw_packages[zip_reader], *args)
+            return _loaded_reduces[reduce_id]
+        return None
+    importer: Importer
+    if zip_reader is not None:
+        importer = OrderedImporter(_get_package(zip_reader), sys_importer)
+    else:
+        importer = sys_importer
+    unpickler = PackageUnpickler(importer, io.BytesIO(obj_bytes))
+    unpickler.persistent_load = persistent_load  # type: ignore[method-assign]
+    result = _deploy_objects[id] = unpickler.load()
+    return result
+def _get_package(zip_reader):
+    if zip_reader not in _raw_packages:
+        _raw_packages[zip_reader] = PackageImporter(zip_reader)
+    return _raw_packages[zip_reader]
+# PackageImporter objects keyed by zip reader; filled by _get_package().
+_raw_packages: dict = {}
+# Objects returned by _load_storages(), keyed by the `id` argument passed to it.
+_deploy_objects: dict = {}
+# Persistent-id tuples built by _save_storages() for objects that define
+# __reduce_deploy__, keyed by id(obj).
+_serialized_reduces: dict = {}
+# Objects rebuilt from "reduce_deploy" persistent ids in _load_storages(),
+# keyed by the reduce id.
+_loaded_reduces: dict = {}

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_linalg_utils.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""Various linear algebra utility methods for internal use.
+"""
+from typing import Optional, Tuple
+import torch
+from torch import Tensor
+def is_sparse(A):
+    """Check if tensor A is a sparse tensor"""
+    if isinstance(A, torch.Tensor):
+        return A.layout == torch.sparse_coo
+    error_str = "expected Tensor"
+    if not torch.jit.is_scripting():
+        error_str += f" but got {type(A)}"
+    raise TypeError(error_str)
+def get_floating_dtype(A):
+    """Return the floating point dtype of tensor A.
+    Integer types map to float32.
+    """
+    dtype = A.dtype
+    if dtype in (torch.float16, torch.float32, torch.float64):
+        return dtype
+    return torch.float32
+def matmul(A: Optional[Tensor], B: Tensor) -> Tensor:
+    """Multiply two matrices.
+    If A is None, return B. A can be sparse or dense. B is always
+    dense.
+    """
+    if A is None:
+        return B
+    if is_sparse(A):
+        return torch.sparse.mm(A, B)
+    return torch.matmul(A, B)
+def conjugate(A):
+    """Return conjugate of tensor A.
+    .. note:: If A's dtype is not complex, A is returned.
+    """
+    if A.is_complex():
+        return A.conj()
+    return A
+def transpose(A):
+    """Return transpose of a matrix or batches of matrices."""
+    ndim = len(A.shape)
+    return A.transpose(ndim - 1, ndim - 2)
+def transjugate(A):
+    """Return transpose conjugate of a matrix or batches of matrices."""
+    return conjugate(transpose(A))
+def bform(X: Tensor, A: Optional[Tensor], Y: Tensor) -> Tensor:
+    """Return bilinear form of matrices: :math:`X^T A Y`."""
+    return matmul(transpose(X), matmul(A, Y))
+def qform(A: Optional[Tensor], S: Tensor):
+    """Return quadratic form :math:`S^T A S`."""
+    return bform(S, A, S)
+def basis(A):
+    """Return orthogonal basis of A columns."""
+    return torch.linalg.qr(A).Q
+def symeig(A: Tensor, largest: Optional[bool] = False) -> Tuple[Tensor, Tensor]:
+    """Return eigenpairs of A with specified ordering."""
+    if largest is None:
+        largest = False
+    E, Z = torch.linalg.eigh(A, UPLO="U")
+    # assuming that E is ordered
+    if largest:
+        E = torch.flip(E, dims=(-1,))
+        Z = torch.flip(Z, dims=(-1,))
+    return E, Z
+# These functions were deprecated and removed
+# This nice error message can be removed in version 1.13+
+def matrix_rank(input, tol=None, symmetric=False, *, out=None) -> Tensor:
+    raise RuntimeError(
+        "This function was deprecated since version 1.9 and is now removed.\n"
+        "Please use the `torch.linalg.matrix_rank` function instead. "
+        "The parameter 'symmetric' was renamed in `torch.linalg.matrix_rank()` to 'hermitian'."
+    )
+def solve(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]:
+    raise RuntimeError(
+        "This function was deprecated since version 1.9 and is now removed. "
+        "`torch.solve` is deprecated in favor of `torch.linalg.solve`. "
+        "`torch.linalg.solve` has its arguments reversed and does not return the LU factorization.\n\n"
+        "To get the LU factorization see `torch.lu`, which can be used with `torch.lu_solve` or `torch.lu_unpack`.\n"
+        "X = torch.solve(B, A).solution "
+        "should be replaced with:\n"
+        "X = torch.linalg.solve(A, B)"
+    )
+def lstsq(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]:
+    raise RuntimeError(
+        "This function was deprecated since version 1.9 and is now removed. "
+        "`torch.lstsq` is deprecated in favor of `torch.linalg.lstsq`.\n"
+        "`torch.linalg.lstsq` has reversed arguments and does not return the QR decomposition in "
+        "the returned tuple (although it returns other information about the problem).\n\n"
+        "To get the QR decomposition consider using `torch.linalg.qr`.\n\n"
+        "The returned solution in `torch.lstsq` stored the residuals of the solution in the "
+        "last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, "
+        "the residuals are in the field 'residuals' of the returned named tuple.\n\n"
+        "The unpacking of the solution, as in\n"
+        "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n"
+        "should be replaced with:\n"
+        "X = torch.linalg.lstsq(A, B).solution"
+    )
+def _symeig(
+    input, eigenvectors=False, upper=True, *, out=None
+) -> Tuple[Tensor, Tensor]:
+    raise RuntimeError(
+        "This function was deprecated since version 1.9 and is now removed. "
+        "The default behavior has changed from using the upper triangular portion of the matrix by default "
+        "to using the lower triangular portion.\n\n"
+        "L, _ = torch.symeig(A, upper=upper) "
+        "should be replaced with:\n"
+        "L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n\n"
+        "and\n\n"
+        "L, V = torch.symeig(A, eigenvectors=True) "
+        "should be replaced with:\n"
+        "L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')"
+    )
+def eig(
+    self: Tensor, eigenvectors: bool = False, *, e=None, v=None
+) -> Tuple[Tensor, Tensor]:
+    raise RuntimeError(
+        "This function was deprecated since version 1.9 and is now removed. "
+        "`torch.linalg.eig` returns complex tensors of dtype `cfloat` or `cdouble` rather than real tensors "
+        "mimicking complex tensors.\n\n"
+        "L, _ = torch.eig(A) "
+        "should be replaced with:\n"
+        "L_complex = torch.linalg.eigvals(A)\n\n"
+        "and\n\n"
+        "L, V = torch.eig(A, eigenvectors=True) "
+        "should be replaced with:\n"
+        "L_complex, V_complex = torch.linalg.eig(A)"
+    )