koichi12 commited on
Commit
a249ee4
·
verified ·
1 Parent(s): eda6db7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc +0 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc +0 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h +116 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h +46 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h +55 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h +570 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp +0 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h +123 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h +154 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h +44 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h +1414 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h +40 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc +0 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h +64 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h +595 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h +1828 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h +108 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h +207 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h +707 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h +323 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h +133 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h +430 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h +324 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h +62 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h +63 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h +0 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h +96 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h +123 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h +959 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h +90 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h +0 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp +2614 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h +360 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h +508 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h +0 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h +198 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h +65 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_constants.h +152 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.h +1551 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.hpp +221 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.h +215 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.hpp +604 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_atomic_functions.h +58 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.hpp +161 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_indirect_functions.h +286 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_types.h +119 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_indirect_functions.h +771 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc ADDED
Binary file (37.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc ADDED
Binary file (17.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc ADDED
Binary file (7.78 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc ADDED
Binary file (24.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // This file is generated. Any changes you make will be lost during the next clean build.
2
+
3
+ // Dependent includes
4
+ #ifdef __APPLE__
5
+ #include <OpenGL/gl.h>
6
+ #else
7
+ #include <GL/gl.h>
8
+ #endif
9
+
10
+ // CUDA public interface, for type definitions and cu* function prototypes
11
+ #include "cudaGL.h"
12
+
13
+
14
+ // *************************************************************************
15
+ // Definitions of structs to hold parameters for each function
16
+ // *************************************************************************
17
+
18
+ typedef struct cuGraphicsGLRegisterBuffer_params_st {
19
+ CUgraphicsResource *pCudaResource;
20
+ GLuint buffer;
21
+ unsigned int Flags;
22
+ } cuGraphicsGLRegisterBuffer_params;
23
+
24
+ typedef struct cuGraphicsGLRegisterImage_params_st {
25
+ CUgraphicsResource *pCudaResource;
26
+ GLuint image;
27
+ GLenum target;
28
+ unsigned int Flags;
29
+ } cuGraphicsGLRegisterImage_params;
30
+
31
+ typedef struct cuGLGetDevices_v2_params_st {
32
+ unsigned int *pCudaDeviceCount;
33
+ CUdevice *pCudaDevices;
34
+ unsigned int cudaDeviceCount;
35
+ CUGLDeviceList deviceList;
36
+ } cuGLGetDevices_v2_params;
37
+
38
+ typedef struct cuGLCtxCreate_v2_params_st {
39
+ CUcontext *pCtx;
40
+ unsigned int Flags;
41
+ CUdevice device;
42
+ } cuGLCtxCreate_v2_params;
43
+
44
+ typedef struct cuGLRegisterBufferObject_params_st {
45
+ GLuint buffer;
46
+ } cuGLRegisterBufferObject_params;
47
+
48
+ typedef struct cuGLMapBufferObject_v2_ptds_params_st {
49
+ CUdeviceptr *dptr;
50
+ size_t *size;
51
+ GLuint buffer;
52
+ } cuGLMapBufferObject_v2_ptds_params;
53
+
54
+ typedef struct cuGLUnmapBufferObject_params_st {
55
+ GLuint buffer;
56
+ } cuGLUnmapBufferObject_params;
57
+
58
+ typedef struct cuGLUnregisterBufferObject_params_st {
59
+ GLuint buffer;
60
+ } cuGLUnregisterBufferObject_params;
61
+
62
+ typedef struct cuGLSetBufferObjectMapFlags_params_st {
63
+ GLuint buffer;
64
+ unsigned int Flags;
65
+ } cuGLSetBufferObjectMapFlags_params;
66
+
67
+ typedef struct cuGLMapBufferObjectAsync_v2_ptsz_params_st {
68
+ CUdeviceptr *dptr;
69
+ size_t *size;
70
+ GLuint buffer;
71
+ CUstream hStream;
72
+ } cuGLMapBufferObjectAsync_v2_ptsz_params;
73
+
74
+ typedef struct cuGLUnmapBufferObjectAsync_params_st {
75
+ GLuint buffer;
76
+ CUstream hStream;
77
+ } cuGLUnmapBufferObjectAsync_params;
78
+
79
+ typedef struct cuGLGetDevices_params_st {
80
+ unsigned int *pCudaDeviceCount;
81
+ CUdevice *pCudaDevices;
82
+ unsigned int cudaDeviceCount;
83
+ CUGLDeviceList deviceList;
84
+ } cuGLGetDevices_params;
85
+
86
+ typedef struct cuGLMapBufferObject_v2_params_st {
87
+ CUdeviceptr *dptr;
88
+ size_t *size;
89
+ GLuint buffer;
90
+ } cuGLMapBufferObject_v2_params;
91
+
92
+ typedef struct cuGLMapBufferObjectAsync_v2_params_st {
93
+ CUdeviceptr *dptr;
94
+ size_t *size;
95
+ GLuint buffer;
96
+ CUstream hStream;
97
+ } cuGLMapBufferObjectAsync_v2_params;
98
+
99
+ typedef struct cuGLCtxCreate_params_st {
100
+ CUcontext *pCtx;
101
+ unsigned int Flags;
102
+ CUdevice device;
103
+ } cuGLCtxCreate_params;
104
+
105
+ typedef struct cuGLMapBufferObject_params_st {
106
+ CUdeviceptr_v1 *dptr;
107
+ unsigned int *size;
108
+ GLuint buffer;
109
+ } cuGLMapBufferObject_params;
110
+
111
+ typedef struct cuGLMapBufferObjectAsync_params_st {
112
+ CUdeviceptr_v1 *dptr;
113
+ unsigned int *size;
114
+ GLuint buffer;
115
+ CUstream hStream;
116
+ } cuGLMapBufferObjectAsync_params;
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // This file is generated. Any changes you make will be lost during the next clean build.
2
+
3
+ // Dependent includes
4
+ #include <vdpau/vdpau.h>
5
+
6
+ // CUDA public interface, for type definitions and cu* function prototypes
7
+ #include "cudaVDPAU.h"
8
+
9
+
10
+ // *************************************************************************
11
+ // Definitions of structs to hold parameters for each function
12
+ // *************************************************************************
13
+
14
+ typedef struct cuVDPAUGetDevice_params_st {
15
+ CUdevice *pDevice;
16
+ VdpDevice vdpDevice;
17
+ VdpGetProcAddress *vdpGetProcAddress;
18
+ } cuVDPAUGetDevice_params;
19
+
20
+ typedef struct cuVDPAUCtxCreate_v2_params_st {
21
+ CUcontext *pCtx;
22
+ unsigned int flags;
23
+ CUdevice device;
24
+ VdpDevice vdpDevice;
25
+ VdpGetProcAddress *vdpGetProcAddress;
26
+ } cuVDPAUCtxCreate_v2_params;
27
+
28
+ typedef struct cuGraphicsVDPAURegisterVideoSurface_params_st {
29
+ CUgraphicsResource *pCudaResource;
30
+ VdpVideoSurface vdpSurface;
31
+ unsigned int flags;
32
+ } cuGraphicsVDPAURegisterVideoSurface_params;
33
+
34
+ typedef struct cuGraphicsVDPAURegisterOutputSurface_params_st {
35
+ CUgraphicsResource *pCudaResource;
36
+ VdpOutputSurface vdpSurface;
37
+ unsigned int flags;
38
+ } cuGraphicsVDPAURegisterOutputSurface_params;
39
+
40
+ typedef struct cuVDPAUCtxCreate_params_st {
41
+ CUcontext *pCtx;
42
+ unsigned int flags;
43
+ CUdevice device;
44
+ VdpDevice vdpDevice;
45
+ VdpGetProcAddress *vdpGetProcAddress;
46
+ } cuVDPAUCtxCreate_params;
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // This file is generated. Any changes you make will be lost during the next clean build.
2
+
3
+ // CUDA public interface, for type definitions and api function prototypes
4
+ #include "cudart_removed.h"
5
+
6
+ // *************************************************************************
7
+ // Definitions of structs to hold parameters for each function
8
+ // *************************************************************************
9
+
10
+ // Currently used parameter trace structures
11
+ typedef struct cudaStreamDestroy_v3020_params_st {
12
+ cudaStream_t stream;
13
+ } cudaStreamDestroy_v3020_params;
14
+
15
+ typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params_st {
16
+ int *numBlocks;
17
+ const void *func;
18
+ size_t numDynamicSmemBytes;
19
+ } cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params;
20
+
21
+ typedef struct cudaConfigureCall_v3020_params_st {
22
+ dim3 gridDim;
23
+ dim3 blockDim;
24
+ size_t sharedMem __dv;
25
+ cudaStream_t stream __dv;
26
+ } cudaConfigureCall_v3020_params;
27
+
28
+ typedef struct cudaSetupArgument_v3020_params_st {
29
+ const void *arg;
30
+ size_t size;
31
+ size_t offset;
32
+ } cudaSetupArgument_v3020_params;
33
+
34
+ typedef struct cudaLaunch_v3020_params_st {
35
+ const void *func;
36
+ } cudaLaunch_v3020_params;
37
+
38
+ typedef struct cudaLaunch_ptsz_v7000_params_st {
39
+ const void *func;
40
+ } cudaLaunch_ptsz_v7000_params;
41
+
42
+ typedef struct cudaStreamSetFlags_v10200_params_st {
43
+ cudaStream_t hStream;
44
+ unsigned int flags;
45
+ } cudaStreamSetFlags_v10200_params;
46
+
47
+ typedef struct cudaStreamSetFlags_ptsz_v10200_params_st {
48
+ cudaStream_t hStream;
49
+ unsigned int flags;
50
+ } cudaStreamSetFlags_ptsz_v10200_params;
51
+
52
+ // Parameter trace structures for removed functions
53
+
54
+
55
+ // End of parameter trace structures
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef NVPERF_TARGET_H
2
+ #define NVPERF_TARGET_H
3
+
4
+ /*
5
+ * Copyright 2014-2022 NVIDIA Corporation. All rights reserved.
6
+ *
7
+ * NOTICE TO USER:
8
+ *
9
+ * This source code is subject to NVIDIA ownership rights under U.S. and
10
+ * international Copyright laws.
11
+ *
12
+ * This software and the information contained herein is PROPRIETARY and
13
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
14
+ * of a form of NVIDIA software license agreement.
15
+ *
16
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
17
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
18
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
19
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
20
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
21
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
22
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25
+ * OR PERFORMANCE OF THIS SOURCE CODE.
26
+ *
27
+ * U.S. Government End Users. This source code is a "commercial item" as
28
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
29
+ * "commercial computer software" and "commercial computer software
30
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
31
+ * and is provided to the U.S. Government only as a commercial end item.
32
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
33
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
34
+ * source code with only those rights set forth herein.
35
+ *
36
+ * Any use of this source code in individual and commercial software must
37
+ * include, in the user documentation and internal comments to the code,
38
+ * the above Disclaimer and U.S. Government End Users Notice.
39
+ */
40
+
41
+ #include <stddef.h>
42
+ #include <stdint.h>
43
+ #include "nvperf_common.h"
44
+
45
+ #if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
46
+ #pragma GCC visibility push(default)
47
+ #if !defined(NVPW_LOCAL)
48
+ #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
49
+ #endif
50
+ #else
51
+ #if !defined(NVPW_LOCAL)
52
+ #define NVPW_LOCAL
53
+ #endif
54
+ #endif
55
+
56
+ #ifdef __cplusplus
57
+ extern "C" {
58
+ #endif
59
+
60
+ /**
61
+ * @file nvperf_target.h
62
+ */
63
+
64
+ #ifndef NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
65
+ #define NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
66
+ /// GPU architecture support level
67
+ typedef enum NVPW_GpuArchitectureSupportLevel
68
+ {
69
+ NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNKNOWN = 0,
70
+ NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNSUPPORTED,
71
+ NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_SUPPORTED
72
+ } NVPW_GpuArchitectureSupportLevel;
73
+ #endif //NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
74
+
75
+ #ifndef NVPW_SLI_SUPPORT_LEVEL_DEFINED
76
+ #define NVPW_SLI_SUPPORT_LEVEL_DEFINED
77
+ /// SLI configuration support level
78
+ typedef enum NVPW_SliSupportLevel
79
+ {
80
+ NVPW_SLI_SUPPORT_LEVEL_UNKNOWN = 0,
81
+ NVPW_SLI_SUPPORT_LEVEL_UNSUPPORTED,
82
+ /// Only Non-SLI configurations are supported.
83
+ NVPW_SLI_SUPPORT_LEVEL_SUPPORTED_NON_SLI_CONFIGURATION
84
+ } NVPW_SliSupportLevel;
85
+ #endif //NVPW_SLI_SUPPORT_LEVEL_DEFINED
86
+
87
+ #ifndef NVPW_VGPU_SUPPORT_LEVEL_DEFINED
88
+ #define NVPW_VGPU_SUPPORT_LEVEL_DEFINED
89
+ /// Virtualized GPU configuration support level
90
+ typedef enum NVPW_VGpuSupportLevel
91
+ {
92
+ NVPW_VGPU_SUPPORT_LEVEL_UNKNOWN = 0,
93
+ NVPW_VGPU_SUPPORT_LEVEL_UNSUPPORTED,
94
+ /// Supported but not allowed by system admin.
95
+ NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_DISALLOWED,
96
+ NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_ALLOWED,
97
+ NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_NON_VGPU_CONFIGURATION
98
+ } NVPW_VGpuSupportLevel;
99
+ #endif //NVPW_VGPU_SUPPORT_LEVEL_DEFINED
100
+
101
+ #ifndef NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
102
+ #define NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
103
+ /// Confidential Compute mode support level
104
+ typedef enum NVPW_ConfidentialComputeSupportLevel
105
+ {
106
+ NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNKNOWN = 0,
107
+ NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNSUPPORTED,
108
+ NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_NON_CONF_COMPUTE_CONFIGURATION
109
+ } NVPW_ConfidentialComputeSupportLevel;
110
+ #endif //NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
111
+
112
+ #ifndef NVPW_CMP_SUPPORT_LEVEL_DEFINED
113
+ #define NVPW_CMP_SUPPORT_LEVEL_DEFINED
114
+ /// CMP support level
115
+ typedef enum NVPW_CmpSupportLevel
116
+ {
117
+ NVPW_CMP_SUPPORT_LEVEL_UNKNOWN = 0,
118
+ NVPW_CMP_SUPPORT_LEVEL_UNSUPPORTED,
119
+ NVPW_CMP_SUPPORT_LEVEL_SUPPORTED_NON_CMP_CONFIGURATON
120
+ } NVPW_CmpSupportLevel;
121
+ #endif //NVPW_CMP_SUPPORT_LEVEL_DEFINED
122
+
123
+ #ifndef NVPW_WSL_SUPPORT_LEVEL_DEFINED
124
+ #define NVPW_WSL_SUPPORT_LEVEL_DEFINED
125
+ /// WSL support level
126
+ typedef enum NVPW_WslSupportLevel
127
+ {
128
+ NVPW_WSL_SUPPORT_LEVEL_UNKNOWN = 0,
129
+ NVPW_WSL_SUPPORT_LEVEL_UNSUPPORTED_INSUFFICIENT_DRIVER_VERSION,
130
+ NVPW_WSL_SUPPORT_LEVEL_SUPPORTED,
131
+ NVPW_WSL_SUPPORT_LEVEL_SUPPORTED_NON_WSL_CONFIGURATION
132
+ } NVPW_WslSupportLevel;
133
+ #endif //NVPW_WSL_SUPPORT_LEVEL_DEFINED
134
+
135
+ typedef struct NVPW_InitializeTarget_Params
136
+ {
137
+ /// [in]
138
+ size_t structSize;
139
+ /// [in] assign to NULL
140
+ void* pPriv;
141
+ } NVPW_InitializeTarget_Params;
142
+ #define NVPW_InitializeTarget_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeTarget_Params, pPriv)
143
+
144
+ /// Load the target library.
145
+ NVPA_Status NVPW_InitializeTarget(NVPW_InitializeTarget_Params* pParams);
146
+
147
+ typedef struct NVPW_GetDeviceCount_Params
148
+ {
149
+ /// [in]
150
+ size_t structSize;
151
+ /// [in] assign to NULL
152
+ void* pPriv;
153
+ size_t numDevices;
154
+ } NVPW_GetDeviceCount_Params;
155
+ #define NVPW_GetDeviceCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetDeviceCount_Params, numDevices)
156
+
157
+ NVPA_Status NVPW_GetDeviceCount(NVPW_GetDeviceCount_Params* pParams);
158
+
159
+ typedef struct NVPW_Device_GetNames_Params
160
+ {
161
+ /// [in]
162
+ size_t structSize;
163
+ /// [in] assign to NULL
164
+ void* pPriv;
165
+ size_t deviceIndex;
166
+ const char* pDeviceName;
167
+ const char* pChipName;
168
+ } NVPW_Device_GetNames_Params;
169
+ #define NVPW_Device_GetNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetNames_Params, pChipName)
170
+
171
+ NVPA_Status NVPW_Device_GetNames(NVPW_Device_GetNames_Params* pParams);
172
+
173
+ typedef struct NVPW_PciBusId
174
+ {
175
+ /// The PCI domain on which the device bus resides.
176
+ uint32_t domain;
177
+ /// The bus on which the device resides.
178
+ uint16_t bus;
179
+ /// device ID.
180
+ uint16_t device;
181
+ } NVPW_PciBusId;
182
+ #define NVPW_PciBusId_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PciBusId, device)
183
+
184
+ typedef struct NVPW_Device_GetPciBusIds_Params
185
+ {
186
+ /// [in]
187
+ size_t structSize;
188
+ /// [in] assign to NULL
189
+ void* pPriv;
190
+ /// [in] caller-allocated array of NVPW_PciBusId, indexed by NVPW deviceIndex
191
+ NVPW_PciBusId* pBusIds;
192
+ /// [in] size of the pBusIDs array; use result from NVPW_GetDeviceCount
193
+ size_t numDevices;
194
+ } NVPW_Device_GetPciBusIds_Params;
195
+ #define NVPW_Device_GetPciBusIds_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetPciBusIds_Params, numDevices)
196
+
197
+ NVPA_Status NVPW_Device_GetPciBusIds(NVPW_Device_GetPciBusIds_Params* pParams);
198
+
199
+
200
+ #define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_INVALID 0xFFFFFFFFu
201
+ #define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_FULLCHIP 0xFFFFFFFEu
202
+
203
+
204
+ typedef struct NVPW_Device_GetMigAttributes_Params
205
+ {
206
+ /// [in]
207
+ size_t structSize;
208
+ /// [in] assign to NULL
209
+ void* pPriv;
210
+ /// [in]
211
+ size_t deviceIndex;
212
+ /// [out]
213
+ NVPA_Bool isMigPartition;
214
+ /// [out]
215
+ uint32_t gpuInstanceId;
216
+ /// [out]
217
+ uint32_t computeInstanceId;
218
+ } NVPW_Device_GetMigAttributes_Params;
219
+ #define NVPW_Device_GetMigAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetMigAttributes_Params, computeInstanceId)
220
+
221
+ NVPA_Status NVPW_Device_GetMigAttributes(NVPW_Device_GetMigAttributes_Params* pParams);
222
+
223
+ typedef struct NVPW_Adapter_GetDeviceIndex_Params
224
+ {
225
+ /// [in]
226
+ size_t structSize;
227
+ /// [in] assign to NULL
228
+ void* pPriv;
229
+ /// [in]
230
+ struct IDXGIAdapter* pAdapter;
231
+ /// [in]
232
+ size_t sliIndex;
233
+ /// [out]
234
+ size_t deviceIndex;
235
+ } NVPW_Adapter_GetDeviceIndex_Params;
236
+ #define NVPW_Adapter_GetDeviceIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Adapter_GetDeviceIndex_Params, deviceIndex)
237
+
238
+ NVPA_Status NVPW_Adapter_GetDeviceIndex(NVPW_Adapter_GetDeviceIndex_Params* pParams);
239
+
240
+ typedef struct NVPW_CounterData_GetNumRanges_Params
241
+ {
242
+ /// [in]
243
+ size_t structSize;
244
+ /// [in] assign to NULL
245
+ void* pPriv;
246
+ const uint8_t* pCounterDataImage;
247
+ size_t numRanges;
248
+ } NVPW_CounterData_GetNumRanges_Params;
249
+ #define NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetNumRanges_Params, numRanges)
250
+
251
+ NVPA_Status NVPW_CounterData_GetNumRanges(NVPW_CounterData_GetNumRanges_Params* pParams);
252
+
253
+ typedef struct NVPW_CounterData_GetChipName_Params
254
+ {
255
+ /// [in]
256
+ size_t structSize;
257
+ /// [in] assign to NULL
258
+ void* pPriv;
259
+ /// [in]
260
+ const uint8_t* pCounterDataImage;
261
+ /// [in]
262
+ size_t counterDataImageSize;
263
+ /// [out]
264
+ const char* pChipName;
265
+ } NVPW_CounterData_GetChipName_Params;
266
+ #define NVPW_CounterData_GetChipName_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetChipName_Params, pChipName)
267
+
268
+ NVPA_Status NVPW_CounterData_GetChipName(NVPW_CounterData_GetChipName_Params* pParams);
269
+
270
+ typedef struct NVPW_Config_GetNumPasses_Params
271
+ {
272
+ /// [in]
273
+ size_t structSize;
274
+ /// [in] assign to NULL
275
+ void* pPriv;
276
+ /// [in]
277
+ const uint8_t* pConfig;
278
+ /// [out]
279
+ size_t numPipelinedPasses;
280
+ /// [out]
281
+ size_t numIsolatedPasses;
282
+ } NVPW_Config_GetNumPasses_Params;
283
+ #define NVPW_Config_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_Params, numIsolatedPasses)
284
+
285
+ /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
286
+ NVPA_Status NVPW_Config_GetNumPasses(NVPW_Config_GetNumPasses_Params* pParams);
287
+
288
+ typedef struct NVPW_Config_GetNumPasses_V2_Params
289
+ {
290
+ /// [in]
291
+ size_t structSize;
292
+ /// [in] assign to NULL
293
+ void* pPriv;
294
+ /// [in]
295
+ const uint8_t* pConfig;
296
+ /// [out]
297
+ size_t numPasses;
298
+ } NVPW_Config_GetNumPasses_V2_Params;
299
+ #define NVPW_Config_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_V2_Params, numPasses)
300
+
301
+ /// Total num passes = numPasses * numNestingLevels
302
+ NVPA_Status NVPW_Config_GetNumPasses_V2(NVPW_Config_GetNumPasses_V2_Params* pParams);
303
+
304
+ #define NVPW_API_SET_CUDA_PROFILER 0x18209d0775b2f89dULL
305
+
306
+ #define NVPW_API_SET_D3D11_PROFILER 0xca55c6738445db2bULL
307
+
308
+ #define NVPW_API_SET_D3D12_PROFILER 0xc0c2d46dd7c7ad78ULL
309
+
310
+ #define NVPW_API_SET_EGL_PROFILER 0x3c3747dae1f9565cULL
311
+
312
+ #define NVPW_API_SET_GPU_PERIODICSAMPLER 0x9f4c2571fc0b2e8aULL
313
+
314
+ #define NVPW_API_SET_METRICSCONTEXT 0x7c8579f6f2144beaULL
315
+
316
+ #define NVPW_API_SET_METRICSEVALUATOR 0x0368a8768d811af9ULL
317
+
318
+ #define NVPW_API_SET_METRICS_GA100_COMP 0x16b7d8c20d8b4915ULL
319
+
320
+ #define NVPW_API_SET_METRICS_GA100_GRFX 0xc94eaabec04a94faULL
321
+
322
+ #define NVPW_API_SET_METRICS_GA10X_COMP 0xb5d6391c2e299ab5ULL
323
+
324
+ #define NVPW_API_SET_METRICS_GA10X_GRFX 0x6ebc121178b5ce0bULL
325
+
326
+ #define NVPW_API_SET_METRICS_GV100_COMP 0x863705cc57919f72ULL
327
+
328
+ #define NVPW_API_SET_METRICS_GV100_GRFX 0x9900da75d164fecfULL
329
+
330
+ #define NVPW_API_SET_METRICS_GV11B_COMP 0xd3f79a859235848fULL
331
+
332
+ #define NVPW_API_SET_METRICS_GV11B_GRFX 0xeb8e26220106e227ULL
333
+
334
+ #define NVPW_API_SET_METRICS_TU10X_COMP 0x70f40be0afd35da8ULL
335
+
336
+ #define NVPW_API_SET_METRICS_TU10X_GRFX 0xdf219cb838db6968ULL
337
+
338
+ #define NVPW_API_SET_METRICS_TU11X_COMP 0xeb0069d7d0956678ULL
339
+
340
+ #define NVPW_API_SET_METRICS_TU11X_GRFX 0x0977d9342bd62743ULL
341
+
342
+ #define NVPW_API_SET_OPENGL_PROFILER 0xe4cd9ea40f2ee777ULL
343
+
344
+ #define NVPW_API_SET_VULKAN_PROFILER 0x8c56b6a03d779689ULL
345
+
346
+ typedef struct NVPW_QueryVersionNumber_Params
347
+ {
348
+ /// [in]
349
+ size_t structSize;
350
+ /// [in] assign to NULL
351
+ void* pPriv;
352
+ /// [in]
353
+ uint64_t apiSet;
354
+ /// [out]
355
+ uint32_t major;
356
+ /// [out]
357
+ uint32_t minor;
358
+ /// [out]
359
+ uint32_t patch;
360
+ /// [out]
361
+ uint32_t relMajor;
362
+ /// [out]
363
+ uint32_t relMinor;
364
+ /// [out]
365
+ uint32_t relPatch;
366
+ } NVPW_QueryVersionNumber_Params;
367
+ #define NVPW_QueryVersionNumber_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_QueryVersionNumber_Params, relPatch)
368
+
369
+ /// Query version number of an API set
370
+ NVPA_Status NVPW_QueryVersionNumber(NVPW_QueryVersionNumber_Params* pParams);
371
+
372
+ typedef enum NVPW_Device_ClockStatus
373
+ {
374
+ /// clock status is unknown
375
+ NVPW_DEVICE_CLOCK_STATUS_UNKNOWN,
376
+ /// clocks are locked to rated tdp values
377
+ NVPW_DEVICE_CLOCK_STATUS_LOCKED_TO_RATED_TDP,
378
+ /// clocks are not locked and can boost above rated tdp
379
+ NVPW_DEVICE_CLOCK_STATUS_BOOST_ENABLED,
380
+ /// clocks are not locked and will not go above rated tdp
381
+ NVPW_DEVICE_CLOCK_STATUS_BOOST_DISABLED,
382
+ NVPW_DEVICE_CLOCK_STATUS__COUNT
383
+ } NVPW_Device_ClockStatus;
384
+
385
+ typedef struct NVPW_Device_GetClockStatus_Params
386
+ {
387
+ /// [in]
388
+ size_t structSize;
389
+ /// [in] assign to NULL
390
+ void* pPriv;
391
+ size_t deviceIndex;
392
+ /// [in]
393
+ NVPW_Device_ClockStatus clockStatus;
394
+ } NVPW_Device_GetClockStatus_Params;
395
+ #define NVPW_Device_GetClockStatus_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetClockStatus_Params, clockStatus)
396
+
397
+ NVPA_Status NVPW_Device_GetClockStatus(NVPW_Device_GetClockStatus_Params* pParams);
398
+
399
+ typedef enum NVPW_Device_ClockSetting
400
+ {
401
+ /// invalid op, specify valid clocks operation during profiling
402
+ NVPW_DEVICE_CLOCK_SETTING_INVALID,
403
+ /// default to driver/application config (normally unlocked and not boosted, but could be unlocked boosted, or
404
+ /// locked to rated TDP)
405
+ NVPW_DEVICE_CLOCK_SETTING_DEFAULT,
406
+ /// lock clocks at rated tdp base values
407
+ NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_RATED_TDP,
408
+ NVPW_DEVICE_CLOCK_SETTING__COUNT
409
+ } NVPW_Device_ClockSetting;
410
+
411
+ typedef struct NVPW_Device_SetClockSetting_Params
412
+ {
413
+ /// [in]
414
+ size_t structSize;
415
+ /// [in] assign to NULL
416
+ void* pPriv;
417
+ size_t deviceIndex;
418
+ /// [in]
419
+ NVPW_Device_ClockSetting clockSetting;
420
+ } NVPW_Device_SetClockSetting_Params;
421
+ #define NVPW_Device_SetClockSetting_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_SetClockSetting_Params, clockSetting)
422
+
423
+ NVPA_Status NVPW_Device_SetClockSetting(NVPW_Device_SetClockSetting_Params* pParams);
424
+
425
+ typedef struct NVPW_CounterData_GetRangeDescriptions_Params
426
+ {
427
+ /// [in]
428
+ size_t structSize;
429
+ /// [in] assign to NULL
430
+ void* pPriv;
431
+ const uint8_t* pCounterDataImage;
432
+ size_t rangeIndex;
433
+ /// [inout] Number of descriptions allocated in ppDescriptions
434
+ size_t numDescriptions;
435
+ const char** ppDescriptions;
436
+ } NVPW_CounterData_GetRangeDescriptions_Params;
437
+ #define NVPW_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetRangeDescriptions_Params, ppDescriptions)
438
+
439
+ NVPA_Status NVPW_CounterData_GetRangeDescriptions(NVPW_CounterData_GetRangeDescriptions_Params* pParams);
440
+
441
+ typedef struct NVPW_Profiler_CounterData_GetRangeDescriptions_Params
442
+ {
443
+ /// [in]
444
+ size_t structSize;
445
+ /// [in] assign to NULL
446
+ void* pPriv;
447
+ const uint8_t* pCounterDataImage;
448
+ size_t rangeIndex;
449
+ /// [inout] Number of descriptions allocated in ppDescriptions
450
+ size_t numDescriptions;
451
+ const char** ppDescriptions;
452
+ } NVPW_Profiler_CounterData_GetRangeDescriptions_Params;
453
+ #define NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Profiler_CounterData_GetRangeDescriptions_Params, ppDescriptions)
454
+
455
+ NVPA_Status NVPW_Profiler_CounterData_GetRangeDescriptions(NVPW_Profiler_CounterData_GetRangeDescriptions_Params* pParams);
456
+
457
+ #ifndef NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
458
+ #define NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
459
+ typedef enum NVPW_PeriodicSampler_CounterData_AppendMode
460
+ {
461
+ NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_LINEAR = 0,
462
+ NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_CIRCULAR = 1,
463
+ NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE__COUNT
464
+ } NVPW_PeriodicSampler_CounterData_AppendMode;
465
+ #endif //NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
466
+
467
+ typedef struct NVPW_PeriodicSampler_CounterData_GetSampleTime_Params
468
+ {
469
+ /// [in]
470
+ size_t structSize;
471
+ /// [in] assign to NULL
472
+ void* pPriv;
473
+ /// [in]
474
+ const uint8_t* pCounterDataImage;
475
+ /// [in]
476
+ size_t rangeIndex;
477
+ /// [out]
478
+ uint64_t timestampStart;
479
+ /// [out]
480
+ uint64_t timestampEnd;
481
+ } NVPW_PeriodicSampler_CounterData_GetSampleTime_Params;
482
+ #define NVPW_PeriodicSampler_CounterData_GetSampleTime_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params, timestampEnd)
483
+
484
+ NVPA_Status NVPW_PeriodicSampler_CounterData_GetSampleTime(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params* pParams);
485
+
486
+ typedef struct NVPW_PeriodicSampler_CounterData_TrimInPlace_Params
487
+ {
488
+ /// [in]
489
+ size_t structSize;
490
+ /// [in] assign to NULL
491
+ void* pPriv;
492
+ /// [in]
493
+ uint8_t* pCounterDataImage;
494
+ /// [in]
495
+ size_t counterDataImageSize;
496
+ /// [out]
497
+ size_t counterDataImageTrimmedSize;
498
+ } NVPW_PeriodicSampler_CounterData_TrimInPlace_Params;
499
+ #define NVPW_PeriodicSampler_CounterData_TrimInPlace_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params, counterDataImageTrimmedSize)
500
+
501
+ NVPA_Status NVPW_PeriodicSampler_CounterData_TrimInPlace(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params* pParams);
502
+
503
+ typedef struct NVPW_PeriodicSampler_CounterData_GetInfo_Params
504
+ {
505
+ /// [in]
506
+ size_t structSize;
507
+ /// [in] assign to NULL
508
+ void* pPriv;
509
+ /// [in]
510
+ const uint8_t* pCounterDataImage;
511
+ /// [in]
512
+ size_t counterDataImageSize;
513
+ /// [out] total number of ranges in the counter data
514
+ size_t numTotalRanges;
515
+ /// [out] if in "linear" mode, this API returns the number of "populated" ranges; if it's in "circular" mode,
516
+ /// then it returns the last "populated" range index + 1, when there is no such range, it returns 0.
517
+ size_t numPopulatedRanges;
518
+ /// [out] if in "linear" mode, this API returns the number of "completed" ranges; if it's in "circular" mode,
519
+ /// then it returns the last "completed" range index + 1, when there is no such range, it returns 0.
520
+ size_t numCompletedRanges;
521
+ } NVPW_PeriodicSampler_CounterData_GetInfo_Params;
522
+ #define NVPW_PeriodicSampler_CounterData_GetInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetInfo_Params, numCompletedRanges)
523
+
524
+ /// In periodic sampler, a range in counter data stores exactly one sample's data. For better performance, periodic
525
+ /// sampler may operate in an out-of-order fashion when populating sample data, i.e. it may not fully populate all
526
+ /// counters of a sample/range before starting to populate the next sample/range. As a result, we have two concepts
527
+ /// here, "populated" & "completed": a range is considered "populated" even if only partial counters have been
528
+ /// written; on the other hand, a range is only considered "completed" if all the collecting counters have been
529
+ /// written.
530
+ NVPA_Status NVPW_PeriodicSampler_CounterData_GetInfo(NVPW_PeriodicSampler_CounterData_GetInfo_Params* pParams);
531
+
532
+ typedef struct NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params
533
+ {
534
+ /// [in]
535
+ size_t structSize;
536
+ /// [in] assign to NULL
537
+ void* pPriv;
538
+ /// [in]
539
+ const uint8_t* pCounterDataImage;
540
+ /// [in]
541
+ size_t counterDataImageSize;
542
+ /// [in]
543
+ size_t rangeIndex;
544
+ /// [out]
545
+ uint32_t triggerCount;
546
+ } NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params;
547
+ #define NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params, triggerCount)
548
+
549
+ NVPA_Status NVPW_PeriodicSampler_CounterData_GetTriggerCount(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params* pParams);
550
+
551
+
552
+ typedef struct NVPW_TimestampReport
553
+ {
554
+ uint32_t payload;
555
+ uint8_t reserved0004[4];
556
+ uint64_t timestamp;
557
+ } NVPW_TimestampReport;
558
+
559
+
560
+
561
+
562
+ #ifdef __cplusplus
563
+ } // extern "C"
564
+ #endif
565
+
566
+ #if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
567
+ #pragma GCC visibility pop
568
+ #endif
569
+
570
+ #endif // NVPERF_TARGET_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_CL_EGL_H
18
+ #define __OPENCL_CL_EGL_H
19
+
20
+ #ifdef __APPLE__
21
+ #else
22
+ #include <CL/cl.h>
23
+ #endif
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+
30
+ /* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
31
+ #define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
32
+ #define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
33
+ #define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
34
+
35
+ /* Error type for clCreateFromEGLImageKHR */
36
+ #define CL_INVALID_EGL_OBJECT_KHR -1093
37
+ #define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
38
+
39
+ /* CLeglImageKHR is an opaque handle to an EGLImage */
40
+ typedef void* CLeglImageKHR;
41
+
42
+ /* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
43
+ typedef void* CLeglDisplayKHR;
44
+
45
+ /* CLeglSyncKHR is an opaque handle to an EGLSync object */
46
+ typedef void* CLeglSyncKHR;
47
+
48
+ /* properties passed to clCreateFromEGLImageKHR */
49
+ typedef intptr_t cl_egl_image_properties_khr;
50
+
51
+
52
+ #define cl_khr_egl_image 1
53
+
54
+ extern CL_API_ENTRY cl_mem CL_API_CALL
55
+ clCreateFromEGLImageKHR(cl_context context,
56
+ CLeglDisplayKHR egldisplay,
57
+ CLeglImageKHR eglimage,
58
+ cl_mem_flags flags,
59
+ const cl_egl_image_properties_khr * properties,
60
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
61
+
62
+ typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
63
+ cl_context context,
64
+ CLeglDisplayKHR egldisplay,
65
+ CLeglImageKHR eglimage,
66
+ cl_mem_flags flags,
67
+ const cl_egl_image_properties_khr * properties,
68
+ cl_int * errcode_ret);
69
+
70
+
71
+ extern CL_API_ENTRY cl_int CL_API_CALL
72
+ clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
73
+ cl_uint num_objects,
74
+ const cl_mem * mem_objects,
75
+ cl_uint num_events_in_wait_list,
76
+ const cl_event * event_wait_list,
77
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
78
+
79
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
80
+ cl_command_queue command_queue,
81
+ cl_uint num_objects,
82
+ const cl_mem * mem_objects,
83
+ cl_uint num_events_in_wait_list,
84
+ const cl_event * event_wait_list,
85
+ cl_event * event);
86
+
87
+
88
+ extern CL_API_ENTRY cl_int CL_API_CALL
89
+ clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
90
+ cl_uint num_objects,
91
+ const cl_mem * mem_objects,
92
+ cl_uint num_events_in_wait_list,
93
+ const cl_event * event_wait_list,
94
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
95
+
96
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
97
+ cl_command_queue command_queue,
98
+ cl_uint num_objects,
99
+ const cl_mem * mem_objects,
100
+ cl_uint num_events_in_wait_list,
101
+ const cl_event * event_wait_list,
102
+ cl_event * event);
103
+
104
+
105
+ #define cl_khr_egl_event 1
106
+
107
+ extern CL_API_ENTRY cl_event CL_API_CALL
108
+ clCreateEventFromEGLSyncKHR(cl_context context,
109
+ CLeglSyncKHR sync,
110
+ CLeglDisplayKHR display,
111
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
112
+
113
+ typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
114
+ cl_context context,
115
+ CLeglSyncKHR sync,
116
+ CLeglDisplayKHR display,
117
+ cl_int * errcode_ret);
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif
122
+
123
+ #endif /* __OPENCL_CL_EGL_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_CL_GL_H
18
+ #define __OPENCL_CL_GL_H
19
+
20
+ #ifdef __APPLE__
21
+ #include <OpenCL/cl.h>
22
+ #else
23
+ #include <CL/cl.h>
24
+ #endif
25
+
26
+ #ifdef __cplusplus
27
+ extern "C" {
28
+ #endif
29
+
30
+ typedef cl_uint cl_gl_object_type;
31
+ typedef cl_uint cl_gl_texture_info;
32
+ typedef cl_uint cl_gl_platform_info;
33
+ typedef struct __GLsync *cl_GLsync;
34
+
35
+ /* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
36
+ #define CL_GL_OBJECT_BUFFER 0x2000
37
+ #define CL_GL_OBJECT_TEXTURE2D 0x2001
38
+ #define CL_GL_OBJECT_TEXTURE3D 0x2002
39
+ #define CL_GL_OBJECT_RENDERBUFFER 0x2003
40
+ #define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
41
+ #define CL_GL_OBJECT_TEXTURE1D 0x200F
42
+ #define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
43
+ #define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
44
+
45
+ /* cl_gl_texture_info */
46
+ #define CL_GL_TEXTURE_TARGET 0x2004
47
+ #define CL_GL_MIPMAP_LEVEL 0x2005
48
+ #define CL_GL_NUM_SAMPLES 0x2012
49
+
50
+ extern CL_API_ENTRY cl_mem CL_API_CALL
51
+ clCreateFromGLBuffer(cl_context context,
52
+ cl_mem_flags flags,
53
+ cl_GLuint bufobj,
54
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
55
+
56
+ extern CL_API_ENTRY cl_mem CL_API_CALL
57
+ clCreateFromGLTexture(cl_context context,
58
+ cl_mem_flags flags,
59
+ cl_GLenum target,
60
+ cl_GLint miplevel,
61
+ cl_GLuint texture,
62
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
63
+
64
+ extern CL_API_ENTRY cl_mem CL_API_CALL
65
+ clCreateFromGLRenderbuffer(cl_context context,
66
+ cl_mem_flags flags,
67
+ cl_GLuint renderbuffer,
68
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
69
+
70
+ extern CL_API_ENTRY cl_int CL_API_CALL
71
+ clGetGLObjectInfo(cl_mem memobj,
72
+ cl_gl_object_type * gl_object_type,
73
+ cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
74
+
75
+ extern CL_API_ENTRY cl_int CL_API_CALL
76
+ clGetGLTextureInfo(cl_mem memobj,
77
+ cl_gl_texture_info param_name,
78
+ size_t param_value_size,
79
+ void * param_value,
80
+ size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
81
+
82
+ extern CL_API_ENTRY cl_int CL_API_CALL
83
+ clEnqueueAcquireGLObjects(cl_command_queue command_queue,
84
+ cl_uint num_objects,
85
+ const cl_mem * mem_objects,
86
+ cl_uint num_events_in_wait_list,
87
+ const cl_event * event_wait_list,
88
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
89
+
90
+ extern CL_API_ENTRY cl_int CL_API_CALL
91
+ clEnqueueReleaseGLObjects(cl_command_queue command_queue,
92
+ cl_uint num_objects,
93
+ const cl_mem * mem_objects,
94
+ cl_uint num_events_in_wait_list,
95
+ const cl_event * event_wait_list,
96
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
97
+
98
+
99
+ /* Deprecated OpenCL 1.1 APIs */
100
+ extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
101
+ clCreateFromGLTexture2D(cl_context context,
102
+ cl_mem_flags flags,
103
+ cl_GLenum target,
104
+ cl_GLint miplevel,
105
+ cl_GLuint texture,
106
+ cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
107
+
108
+ extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
109
+ clCreateFromGLTexture3D(cl_context context,
110
+ cl_mem_flags flags,
111
+ cl_GLenum target,
112
+ cl_GLint miplevel,
113
+ cl_GLuint texture,
114
+ cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
115
+
116
+ /* cl_khr_gl_sharing extension */
117
+
118
+ #define cl_khr_gl_sharing 1
119
+
120
+ typedef cl_uint cl_gl_context_info;
121
+
122
+ /* Additional Error Codes */
123
+ #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
124
+
125
+ /* cl_gl_context_info */
126
+ #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
127
+ #define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
128
+
129
+ /* Additional cl_context_properties */
130
+ #define CL_GL_CONTEXT_KHR 0x2008
131
+ #define CL_EGL_DISPLAY_KHR 0x2009
132
+ #define CL_GLX_DISPLAY_KHR 0x200A
133
+ #define CL_WGL_HDC_KHR 0x200B
134
+ #define CL_CGL_SHAREGROUP_KHR 0x200C
135
+
136
+ extern CL_API_ENTRY cl_int CL_API_CALL
137
+ clGetGLContextInfoKHR(const cl_context_properties * properties,
138
+ cl_gl_context_info param_name,
139
+ size_t param_value_size,
140
+ void * param_value,
141
+ size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
142
+
143
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
144
+ const cl_context_properties * properties,
145
+ cl_gl_context_info param_name,
146
+ size_t param_value_size,
147
+ void * param_value,
148
+ size_t * param_value_size_ret);
149
+
150
+ #ifdef __cplusplus
151
+ }
152
+ #endif
153
+
154
+ #endif /* __OPENCL_CL_GL_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_CL_GL_EXT_H
18
+ #define __OPENCL_CL_GL_EXT_H
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
+ #ifdef __APPLE__
25
+ #include <OpenCL/cl_gl.h>
26
+ #else
27
+ #include <CL/cl_gl.h>
28
+ #endif
29
+
30
+ /*
31
+ * cl_khr_gl_event extension
32
+ */
33
+ #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
34
+
35
+ extern CL_API_ENTRY cl_event CL_API_CALL
36
+ clCreateEventFromGLsyncKHR(cl_context context,
37
+ cl_GLsync sync,
38
+ cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
39
+
40
+ #ifdef __cplusplus
41
+ }
42
+ #endif
43
+
44
+ #endif /* __OPENCL_CL_GL_EXT_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h ADDED
@@ -0,0 +1,1414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __CL_PLATFORM_H
18
+ #define __CL_PLATFORM_H
19
+
20
+ #ifdef __APPLE__
21
+ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
22
+ #include <AvailabilityMacros.h>
23
+ #endif
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ #if defined(_WIN32)
30
+ #define CL_API_ENTRY
31
+ #define CL_API_CALL __stdcall
32
+ #define CL_CALLBACK __stdcall
33
+ #else
34
+ #define CL_API_ENTRY
35
+ #define CL_API_CALL
36
+ #define CL_CALLBACK
37
+ #endif
38
+
39
+ /*
40
+ * Deprecation flags refer to the last version of the header in which the
41
+ * feature was not deprecated.
42
+ *
43
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
44
+ * deprecation but is deprecated in versions later than 1.1.
45
+ */
46
+ #ifdef __APPLE__
47
+ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
48
+ #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
49
+ #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
50
+ #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
51
+ #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
52
+ #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
53
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
54
+
55
+ #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
56
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
57
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
58
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
59
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
60
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
61
+ #else
62
+ #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
63
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
64
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
65
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
66
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
67
+ #endif
68
+
69
+
70
+
71
+ #else
72
+ #define CL_EXTENSION_WEAK_LINK
73
+ #define CL_API_SUFFIX__VERSION_1_0
74
+ #define CL_EXT_SUFFIX__VERSION_1_0
75
+ #define CL_API_SUFFIX__VERSION_1_1
76
+ #define CL_EXT_SUFFIX__VERSION_1_1
77
+ #define CL_API_SUFFIX__VERSION_1_2
78
+ #define CL_EXT_SUFFIX__VERSION_1_2
79
+ #define CL_API_SUFFIX__VERSION_2_0
80
+ #define CL_EXT_SUFFIX__VERSION_2_0
81
+ #define CL_API_SUFFIX__VERSION_2_1
82
+ #define CL_EXT_SUFFIX__VERSION_2_1
83
+ #define CL_API_SUFFIX__VERSION_2_2
84
+ #define CL_EXT_SUFFIX__VERSION_2_2
85
+ #define CL_API_SUFFIX__VERSION_3_0
86
+ #define CL_EXT_SUFFIX__VERSION_3_0
87
+ #define CL_API_SUFFIX__EXPERIMENTAL
88
+ #define CL_EXT_SUFFIX__EXPERIMENTAL
89
+
90
+ #ifdef __GNUC__
91
+ #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
92
+ #define CL_EXT_PREFIX_DEPRECATED
93
+ #elif defined(_WIN32)
94
+ #define CL_EXT_SUFFIX_DEPRECATED
95
+ #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
96
+ #else
97
+ #define CL_EXT_SUFFIX_DEPRECATED
98
+ #define CL_EXT_PREFIX_DEPRECATED
99
+ #endif
100
+
101
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
102
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
103
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
104
+ #else
105
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
106
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
107
+ #endif
108
+
109
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
110
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
111
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
112
+ #else
113
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
114
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
115
+ #endif
116
+
117
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
118
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
119
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
120
+ #else
121
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
122
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
123
+ #endif
124
+
125
+ #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
126
+ #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
127
+ #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
128
+ #else
129
+ #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
130
+ #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
131
+ #endif
132
+
133
+ #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
134
+ #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
135
+ #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
136
+ #else
137
+ #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
138
+ #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
139
+ #endif
140
+
141
+ #ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
142
+ #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
143
+ #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
144
+ #else
145
+ #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
146
+ #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
147
+ #endif
148
+ #endif
149
+
150
+ #if (defined (_WIN32) && defined(_MSC_VER))
151
+
152
+ /* scalar types */
153
+ typedef signed __int8 cl_char;
154
+ typedef unsigned __int8 cl_uchar;
155
+ typedef signed __int16 cl_short;
156
+ typedef unsigned __int16 cl_ushort;
157
+ typedef signed __int32 cl_int;
158
+ typedef unsigned __int32 cl_uint;
159
+ typedef signed __int64 cl_long;
160
+ typedef unsigned __int64 cl_ulong;
161
+
162
+ typedef unsigned __int16 cl_half;
163
+ typedef float cl_float;
164
+ typedef double cl_double;
165
+
166
+ /* Macro names and corresponding values defined by OpenCL */
167
+ #define CL_CHAR_BIT 8
168
+ #define CL_SCHAR_MAX 127
169
+ #define CL_SCHAR_MIN (-127-1)
170
+ #define CL_CHAR_MAX CL_SCHAR_MAX
171
+ #define CL_CHAR_MIN CL_SCHAR_MIN
172
+ #define CL_UCHAR_MAX 255
173
+ #define CL_SHRT_MAX 32767
174
+ #define CL_SHRT_MIN (-32767-1)
175
+ #define CL_USHRT_MAX 65535
176
+ #define CL_INT_MAX 2147483647
177
+ #define CL_INT_MIN (-2147483647-1)
178
+ #define CL_UINT_MAX 0xffffffffU
179
+ #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
180
+ #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
181
+ #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
182
+
183
+ #define CL_FLT_DIG 6
184
+ #define CL_FLT_MANT_DIG 24
185
+ #define CL_FLT_MAX_10_EXP +38
186
+ #define CL_FLT_MAX_EXP +128
187
+ #define CL_FLT_MIN_10_EXP -37
188
+ #define CL_FLT_MIN_EXP -125
189
+ #define CL_FLT_RADIX 2
190
+ #define CL_FLT_MAX 340282346638528859811704183484516925440.0f
191
+ #define CL_FLT_MIN 1.175494350822287507969e-38f
192
+ #define CL_FLT_EPSILON 1.1920928955078125e-7f
193
+
194
+ #define CL_HALF_DIG 3
195
+ #define CL_HALF_MANT_DIG 11
196
+ #define CL_HALF_MAX_10_EXP +4
197
+ #define CL_HALF_MAX_EXP +16
198
+ #define CL_HALF_MIN_10_EXP -4
199
+ #define CL_HALF_MIN_EXP -13
200
+ #define CL_HALF_RADIX 2
201
+ #define CL_HALF_MAX 65504.0f
202
+ #define CL_HALF_MIN 6.103515625e-05f
203
+ #define CL_HALF_EPSILON 9.765625e-04f
204
+
205
+ #define CL_DBL_DIG 15
206
+ #define CL_DBL_MANT_DIG 53
207
+ #define CL_DBL_MAX_10_EXP +308
208
+ #define CL_DBL_MAX_EXP +1024
209
+ #define CL_DBL_MIN_10_EXP -307
210
+ #define CL_DBL_MIN_EXP -1021
211
+ #define CL_DBL_RADIX 2
212
+ #define CL_DBL_MAX 1.7976931348623158e+308
213
+ #define CL_DBL_MIN 2.225073858507201383090e-308
214
+ #define CL_DBL_EPSILON 2.220446049250313080847e-16
215
+
216
+ #define CL_M_E 2.7182818284590452354
217
+ #define CL_M_LOG2E 1.4426950408889634074
218
+ #define CL_M_LOG10E 0.43429448190325182765
219
+ #define CL_M_LN2 0.69314718055994530942
220
+ #define CL_M_LN10 2.30258509299404568402
221
+ #define CL_M_PI 3.14159265358979323846
222
+ #define CL_M_PI_2 1.57079632679489661923
223
+ #define CL_M_PI_4 0.78539816339744830962
224
+ #define CL_M_1_PI 0.31830988618379067154
225
+ #define CL_M_2_PI 0.63661977236758134308
226
+ #define CL_M_2_SQRTPI 1.12837916709551257390
227
+ #define CL_M_SQRT2 1.41421356237309504880
228
+ #define CL_M_SQRT1_2 0.70710678118654752440
229
+
230
+ #define CL_M_E_F 2.718281828f
231
+ #define CL_M_LOG2E_F 1.442695041f
232
+ #define CL_M_LOG10E_F 0.434294482f
233
+ #define CL_M_LN2_F 0.693147181f
234
+ #define CL_M_LN10_F 2.302585093f
235
+ #define CL_M_PI_F 3.141592654f
236
+ #define CL_M_PI_2_F 1.570796327f
237
+ #define CL_M_PI_4_F 0.785398163f
238
+ #define CL_M_1_PI_F 0.318309886f
239
+ #define CL_M_2_PI_F 0.636619772f
240
+ #define CL_M_2_SQRTPI_F 1.128379167f
241
+ #define CL_M_SQRT2_F 1.414213562f
242
+ #define CL_M_SQRT1_2_F 0.707106781f
243
+
244
+ #define CL_NAN (CL_INFINITY - CL_INFINITY)
245
+ #define CL_HUGE_VALF ((cl_float) 1e50)
246
+ #define CL_HUGE_VAL ((cl_double) 1e500)
247
+ #define CL_MAXFLOAT CL_FLT_MAX
248
+ #define CL_INFINITY CL_HUGE_VALF
249
+
250
+ #else
251
+
252
+ #include <stdint.h>
253
+
254
+ /* scalar types */
255
+ typedef int8_t cl_char;
256
+ typedef uint8_t cl_uchar;
257
+ typedef int16_t cl_short;
258
+ typedef uint16_t cl_ushort;
259
+ typedef int32_t cl_int;
260
+ typedef uint32_t cl_uint;
261
+ typedef int64_t cl_long;
262
+ typedef uint64_t cl_ulong;
263
+
264
+ typedef uint16_t cl_half;
265
+ typedef float cl_float;
266
+ typedef double cl_double;
267
+
268
+ /* Macro names and corresponding values defined by OpenCL */
269
+ #define CL_CHAR_BIT 8
270
+ #define CL_SCHAR_MAX 127
271
+ #define CL_SCHAR_MIN (-127-1)
272
+ #define CL_CHAR_MAX CL_SCHAR_MAX
273
+ #define CL_CHAR_MIN CL_SCHAR_MIN
274
+ #define CL_UCHAR_MAX 255
275
+ #define CL_SHRT_MAX 32767
276
+ #define CL_SHRT_MIN (-32767-1)
277
+ #define CL_USHRT_MAX 65535
278
+ #define CL_INT_MAX 2147483647
279
+ #define CL_INT_MIN (-2147483647-1)
280
+ #define CL_UINT_MAX 0xffffffffU
281
+ #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
282
+ #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
283
+ #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
284
+
285
+ #define CL_FLT_DIG 6
286
+ #define CL_FLT_MANT_DIG 24
287
+ #define CL_FLT_MAX_10_EXP +38
288
+ #define CL_FLT_MAX_EXP +128
289
+ #define CL_FLT_MIN_10_EXP -37
290
+ #define CL_FLT_MIN_EXP -125
291
+ #define CL_FLT_RADIX 2
292
+ #define CL_FLT_MAX 340282346638528859811704183484516925440.0f
293
+ #define CL_FLT_MIN 1.175494350822287507969e-38f
294
+ #define CL_FLT_EPSILON 1.1920928955078125e-7f
295
+
296
+ #define CL_HALF_DIG 3
297
+ #define CL_HALF_MANT_DIG 11
298
+ #define CL_HALF_MAX_10_EXP +4
299
+ #define CL_HALF_MAX_EXP +16
300
+ #define CL_HALF_MIN_10_EXP -4
301
+ #define CL_HALF_MIN_EXP -13
302
+ #define CL_HALF_RADIX 2
303
+ #define CL_HALF_MAX 65504.0f
304
+ #define CL_HALF_MIN 6.103515625e-05f
305
+ #define CL_HALF_EPSILON 9.765625e-04f
306
+
307
+ #define CL_DBL_DIG 15
308
+ #define CL_DBL_MANT_DIG 53
309
+ #define CL_DBL_MAX_10_EXP +308
310
+ #define CL_DBL_MAX_EXP +1024
311
+ #define CL_DBL_MIN_10_EXP -307
312
+ #define CL_DBL_MIN_EXP -1021
313
+ #define CL_DBL_RADIX 2
314
+ #define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
315
+ #define CL_DBL_MIN 2.225073858507201383090e-308
316
+ #define CL_DBL_EPSILON 2.220446049250313080847e-16
317
+
318
+ #define CL_M_E 2.7182818284590452354
319
+ #define CL_M_LOG2E 1.4426950408889634074
320
+ #define CL_M_LOG10E 0.43429448190325182765
321
+ #define CL_M_LN2 0.69314718055994530942
322
+ #define CL_M_LN10 2.30258509299404568402
323
+ #define CL_M_PI 3.14159265358979323846
324
+ #define CL_M_PI_2 1.57079632679489661923
325
+ #define CL_M_PI_4 0.78539816339744830962
326
+ #define CL_M_1_PI 0.31830988618379067154
327
+ #define CL_M_2_PI 0.63661977236758134308
328
+ #define CL_M_2_SQRTPI 1.12837916709551257390
329
+ #define CL_M_SQRT2 1.41421356237309504880
330
+ #define CL_M_SQRT1_2 0.70710678118654752440
331
+
332
+ #define CL_M_E_F 2.718281828f
333
+ #define CL_M_LOG2E_F 1.442695041f
334
+ #define CL_M_LOG10E_F 0.434294482f
335
+ #define CL_M_LN2_F 0.693147181f
336
+ #define CL_M_LN10_F 2.302585093f
337
+ #define CL_M_PI_F 3.141592654f
338
+ #define CL_M_PI_2_F 1.570796327f
339
+ #define CL_M_PI_4_F 0.785398163f
340
+ #define CL_M_1_PI_F 0.318309886f
341
+ #define CL_M_2_PI_F 0.636619772f
342
+ #define CL_M_2_SQRTPI_F 1.128379167f
343
+ #define CL_M_SQRT2_F 1.414213562f
344
+ #define CL_M_SQRT1_2_F 0.707106781f
345
+
346
+ #if defined( __GNUC__ )
347
+ #define CL_HUGE_VALF __builtin_huge_valf()
348
+ #define CL_HUGE_VAL __builtin_huge_val()
349
+ #define CL_NAN __builtin_nanf( "" )
350
+ #else
351
+ #define CL_HUGE_VALF ((cl_float) 1e50)
352
+ #define CL_HUGE_VAL ((cl_double) 1e500)
353
+ float nanf( const char * );
354
+ #define CL_NAN nanf( "" )
355
+ #endif
356
+ #define CL_MAXFLOAT CL_FLT_MAX
357
+ #define CL_INFINITY CL_HUGE_VALF
358
+
359
+ #endif
360
+
361
+ #include <stddef.h>
362
+
363
+ /* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */
364
+ typedef unsigned int cl_GLuint;
365
+ typedef int cl_GLint;
366
+ typedef unsigned int cl_GLenum;
367
+
368
+ /*
369
+ * Vector types
370
+ *
371
+ * Note: OpenCL requires that all types be naturally aligned.
372
+ * This means that vector types must be naturally aligned.
373
+ * For example, a vector of four floats must be aligned to
374
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
375
+ * alignment of the float). The alignment qualifiers here
376
+ * will only function properly if your compiler supports them
377
+ * and if you don't actively work to defeat them. For example,
378
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
379
+ * the start of the struct must itself be 16-byte aligned.
380
+ *
381
+ * Maintaining proper alignment is the user's responsibility.
382
+ */
383
+
384
+ /* Define basic vector types */
385
+ #if defined( __VEC__ )
386
+ #if !defined(__clang__)
387
+ #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
388
+ #endif
389
+ typedef __vector unsigned char __cl_uchar16;
390
+ typedef __vector signed char __cl_char16;
391
+ typedef __vector unsigned short __cl_ushort8;
392
+ typedef __vector signed short __cl_short8;
393
+ typedef __vector unsigned int __cl_uint4;
394
+ typedef __vector signed int __cl_int4;
395
+ typedef __vector float __cl_float4;
396
+ #define __CL_UCHAR16__ 1
397
+ #define __CL_CHAR16__ 1
398
+ #define __CL_USHORT8__ 1
399
+ #define __CL_SHORT8__ 1
400
+ #define __CL_UINT4__ 1
401
+ #define __CL_INT4__ 1
402
+ #define __CL_FLOAT4__ 1
403
+ #endif
404
+
405
+ #if defined( __SSE__ )
406
+ #if defined( __MINGW64__ )
407
+ #include <intrin.h>
408
+ #else
409
+ #include <xmmintrin.h>
410
+ #endif
411
+ #if defined( __GNUC__ )
412
+ typedef float __cl_float4 __attribute__((vector_size(16)));
413
+ #else
414
+ typedef __m128 __cl_float4;
415
+ #endif
416
+ #define __CL_FLOAT4__ 1
417
+ #endif
418
+
419
+ #if defined( __SSE2__ )
420
+ #if defined( __MINGW64__ )
421
+ #include <intrin.h>
422
+ #else
423
+ #include <emmintrin.h>
424
+ #endif
425
+ #if defined( __GNUC__ )
426
+ typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
427
+ typedef cl_char __cl_char16 __attribute__((vector_size(16)));
428
+ typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
429
+ typedef cl_short __cl_short8 __attribute__((vector_size(16)));
430
+ typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
431
+ typedef cl_int __cl_int4 __attribute__((vector_size(16)));
432
+ typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
433
+ typedef cl_long __cl_long2 __attribute__((vector_size(16)));
434
+ typedef cl_double __cl_double2 __attribute__((vector_size(16)));
435
+ #else
436
+ typedef __m128i __cl_uchar16;
437
+ typedef __m128i __cl_char16;
438
+ typedef __m128i __cl_ushort8;
439
+ typedef __m128i __cl_short8;
440
+ typedef __m128i __cl_uint4;
441
+ typedef __m128i __cl_int4;
442
+ typedef __m128i __cl_ulong2;
443
+ typedef __m128i __cl_long2;
444
+ typedef __m128d __cl_double2;
445
+ #endif
446
+ #define __CL_UCHAR16__ 1
447
+ #define __CL_CHAR16__ 1
448
+ #define __CL_USHORT8__ 1
449
+ #define __CL_SHORT8__ 1
450
+ #define __CL_INT4__ 1
451
+ #define __CL_UINT4__ 1
452
+ #define __CL_ULONG2__ 1
453
+ #define __CL_LONG2__ 1
454
+ #define __CL_DOUBLE2__ 1
455
+ #endif
456
+
457
+ #if defined( __MMX__ )
458
+ #include <mmintrin.h>
459
+ #if defined( __GNUC__ )
460
+ typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
461
+ typedef cl_char __cl_char8 __attribute__((vector_size(8)));
462
+ typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
463
+ typedef cl_short __cl_short4 __attribute__((vector_size(8)));
464
+ typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
465
+ typedef cl_int __cl_int2 __attribute__((vector_size(8)));
466
+ typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
467
+ typedef cl_long __cl_long1 __attribute__((vector_size(8)));
468
+ typedef cl_float __cl_float2 __attribute__((vector_size(8)));
469
+ #else
470
+ typedef __m64 __cl_uchar8;
471
+ typedef __m64 __cl_char8;
472
+ typedef __m64 __cl_ushort4;
473
+ typedef __m64 __cl_short4;
474
+ typedef __m64 __cl_uint2;
475
+ typedef __m64 __cl_int2;
476
+ typedef __m64 __cl_ulong1;
477
+ typedef __m64 __cl_long1;
478
+ typedef __m64 __cl_float2;
479
+ #endif
480
+ #define __CL_UCHAR8__ 1
481
+ #define __CL_CHAR8__ 1
482
+ #define __CL_USHORT4__ 1
483
+ #define __CL_SHORT4__ 1
484
+ #define __CL_INT2__ 1
485
+ #define __CL_UINT2__ 1
486
+ #define __CL_ULONG1__ 1
487
+ #define __CL_LONG1__ 1
488
+ #define __CL_FLOAT2__ 1
489
+ #endif
490
+
491
+ #if defined( __AVX__ )
492
+ #if defined( __MINGW64__ )
493
+ #include <intrin.h>
494
+ #else
495
+ #include <immintrin.h>
496
+ #endif
497
+ #if defined( __GNUC__ )
498
+ typedef cl_float __cl_float8 __attribute__((vector_size(32)));
499
+ typedef cl_double __cl_double4 __attribute__((vector_size(32)));
500
+ #else
501
+ typedef __m256 __cl_float8;
502
+ typedef __m256d __cl_double4;
503
+ #endif
504
+ #define __CL_FLOAT8__ 1
505
+ #define __CL_DOUBLE4__ 1
506
+ #endif
507
+
508
+ /* Define capabilities for anonymous struct members. */
509
+ #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
510
+ #define __CL_HAS_ANON_STRUCT__ 1
511
+ #define __CL_ANON_STRUCT__
512
+ #elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
513
+ #define __CL_HAS_ANON_STRUCT__ 1
514
+ #define __CL_ANON_STRUCT__ __extension__
515
+ #elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
516
+ #if _MSC_VER >= 1500
517
+ /* Microsoft Developer Studio 2008 supports anonymous structs, but
518
+ * complains by default. */
519
+ #define __CL_HAS_ANON_STRUCT__ 1
520
+ #define __CL_ANON_STRUCT__
521
+ /* Disable warning C4201: nonstandard extension used : nameless
522
+ * struct/union */
523
+ #pragma warning( push )
524
+ #pragma warning( disable : 4201 )
525
+ #endif
526
+ #else
527
+ #define __CL_HAS_ANON_STRUCT__ 0
528
+ #define __CL_ANON_STRUCT__
529
+ #endif
530
+
531
+ /* Define alignment keys */
532
+ #if defined( __GNUC__ )
533
+ #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
534
+ #elif defined( _WIN32) && (_MSC_VER)
535
+ /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
536
+ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
537
+ /* #include <crtdefs.h> */
538
+ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
539
+ #define CL_ALIGNED(_x)
540
+ #else
541
+ #warning Need to implement some method to align data here
542
+ #define CL_ALIGNED(_x)
543
+ #endif
544
+
545
+ /* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
546
+ #if __CL_HAS_ANON_STRUCT__
547
+ /* .xyzw and .s0123...{f|F} are supported */
548
+ #define CL_HAS_NAMED_VECTOR_FIELDS 1
549
+ /* .hi and .lo are supported */
550
+ #define CL_HAS_HI_LO_VECTOR_FIELDS 1
551
+ #endif
552
+
553
+ /* Define cl_vector types */
554
+
555
+ /* ---- cl_charn ---- */
556
+ typedef union
557
+ {
558
+ cl_char CL_ALIGNED(2) s[2];
559
+ #if __CL_HAS_ANON_STRUCT__
560
+ __CL_ANON_STRUCT__ struct{ cl_char x, y; };
561
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
562
+ __CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
563
+ #endif
564
+ #if defined( __CL_CHAR2__)
565
+ __cl_char2 v2;
566
+ #endif
567
+ }cl_char2;
568
+
569
+ typedef union
570
+ {
571
+ cl_char CL_ALIGNED(4) s[4];
572
+ #if __CL_HAS_ANON_STRUCT__
573
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
574
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
575
+ __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
576
+ #endif
577
+ #if defined( __CL_CHAR2__)
578
+ __cl_char2 v2[2];
579
+ #endif
580
+ #if defined( __CL_CHAR4__)
581
+ __cl_char4 v4;
582
+ #endif
583
+ }cl_char4;
584
+
585
+ /* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
586
+ typedef cl_char4 cl_char3;
587
+
588
+ typedef union
589
+ {
590
+ cl_char CL_ALIGNED(8) s[8];
591
+ #if __CL_HAS_ANON_STRUCT__
592
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
593
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
594
+ __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
595
+ #endif
596
+ #if defined( __CL_CHAR2__)
597
+ __cl_char2 v2[4];
598
+ #endif
599
+ #if defined( __CL_CHAR4__)
600
+ __cl_char4 v4[2];
601
+ #endif
602
+ #if defined( __CL_CHAR8__ )
603
+ __cl_char8 v8;
604
+ #endif
605
+ }cl_char8;
606
+
607
+ typedef union
608
+ {
609
+ cl_char CL_ALIGNED(16) s[16];
610
+ #if __CL_HAS_ANON_STRUCT__
611
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
612
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
613
+ __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
614
+ #endif
615
+ #if defined( __CL_CHAR2__)
616
+ __cl_char2 v2[8];
617
+ #endif
618
+ #if defined( __CL_CHAR4__)
619
+ __cl_char4 v4[4];
620
+ #endif
621
+ #if defined( __CL_CHAR8__ )
622
+ __cl_char8 v8[2];
623
+ #endif
624
+ #if defined( __CL_CHAR16__ )
625
+ __cl_char16 v16;
626
+ #endif
627
+ }cl_char16;
628
+
629
+
630
+ /* ---- cl_ucharn ---- */
631
+ typedef union
632
+ {
633
+ cl_uchar CL_ALIGNED(2) s[2];
634
+ #if __CL_HAS_ANON_STRUCT__
635
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
636
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
637
+ __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
638
+ #endif
639
+ #if defined( __cl_uchar2__)
640
+ __cl_uchar2 v2;
641
+ #endif
642
+ }cl_uchar2;
643
+
644
+ typedef union
645
+ {
646
+ cl_uchar CL_ALIGNED(4) s[4];
647
+ #if __CL_HAS_ANON_STRUCT__
648
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
649
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
650
+ __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
651
+ #endif
652
+ #if defined( __CL_UCHAR2__)
653
+ __cl_uchar2 v2[2];
654
+ #endif
655
+ #if defined( __CL_UCHAR4__)
656
+ __cl_uchar4 v4;
657
+ #endif
658
+ }cl_uchar4;
659
+
660
+ /* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
661
+ typedef cl_uchar4 cl_uchar3;
662
+
663
+ typedef union
664
+ {
665
+ cl_uchar CL_ALIGNED(8) s[8];
666
+ #if __CL_HAS_ANON_STRUCT__
667
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
668
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
669
+ __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
670
+ #endif
671
+ #if defined( __CL_UCHAR2__)
672
+ __cl_uchar2 v2[4];
673
+ #endif
674
+ #if defined( __CL_UCHAR4__)
675
+ __cl_uchar4 v4[2];
676
+ #endif
677
+ #if defined( __CL_UCHAR8__ )
678
+ __cl_uchar8 v8;
679
+ #endif
680
+ }cl_uchar8;
681
+
682
+ typedef union
683
+ {
684
+ cl_uchar CL_ALIGNED(16) s[16];
685
+ #if __CL_HAS_ANON_STRUCT__
686
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
687
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
688
+ __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
689
+ #endif
690
+ #if defined( __CL_UCHAR2__)
691
+ __cl_uchar2 v2[8];
692
+ #endif
693
+ #if defined( __CL_UCHAR4__)
694
+ __cl_uchar4 v4[4];
695
+ #endif
696
+ #if defined( __CL_UCHAR8__ )
697
+ __cl_uchar8 v8[2];
698
+ #endif
699
+ #if defined( __CL_UCHAR16__ )
700
+ __cl_uchar16 v16;
701
+ #endif
702
+ }cl_uchar16;
703
+
704
+
705
+ /* ---- cl_shortn ---- */
706
+ typedef union
707
+ {
708
+ cl_short CL_ALIGNED(4) s[2];
709
+ #if __CL_HAS_ANON_STRUCT__
710
+ __CL_ANON_STRUCT__ struct{ cl_short x, y; };
711
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
712
+ __CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
713
+ #endif
714
+ #if defined( __CL_SHORT2__)
715
+ __cl_short2 v2;
716
+ #endif
717
+ }cl_short2;
718
+
719
+ typedef union
720
+ {
721
+ cl_short CL_ALIGNED(8) s[4];
722
+ #if __CL_HAS_ANON_STRUCT__
723
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
724
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
725
+ __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
726
+ #endif
727
+ #if defined( __CL_SHORT2__)
728
+ __cl_short2 v2[2];
729
+ #endif
730
+ #if defined( __CL_SHORT4__)
731
+ __cl_short4 v4;
732
+ #endif
733
+ }cl_short4;
734
+
735
+ /* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
736
+ typedef cl_short4 cl_short3;
737
+
738
+ typedef union
739
+ {
740
+ cl_short CL_ALIGNED(16) s[8];
741
+ #if __CL_HAS_ANON_STRUCT__
742
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
743
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
744
+ __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
745
+ #endif
746
+ #if defined( __CL_SHORT2__)
747
+ __cl_short2 v2[4];
748
+ #endif
749
+ #if defined( __CL_SHORT4__)
750
+ __cl_short4 v4[2];
751
+ #endif
752
+ #if defined( __CL_SHORT8__ )
753
+ __cl_short8 v8;
754
+ #endif
755
+ }cl_short8;
756
+
757
+ typedef union
758
+ {
759
+ cl_short CL_ALIGNED(32) s[16];
760
+ #if __CL_HAS_ANON_STRUCT__
761
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
762
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
763
+ __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
764
+ #endif
765
+ #if defined( __CL_SHORT2__)
766
+ __cl_short2 v2[8];
767
+ #endif
768
+ #if defined( __CL_SHORT4__)
769
+ __cl_short4 v4[4];
770
+ #endif
771
+ #if defined( __CL_SHORT8__ )
772
+ __cl_short8 v8[2];
773
+ #endif
774
+ #if defined( __CL_SHORT16__ )
775
+ __cl_short16 v16;
776
+ #endif
777
+ }cl_short16;
778
+
779
+
780
+ /* ---- cl_ushortn ---- */
781
+ typedef union
782
+ {
783
+ cl_ushort CL_ALIGNED(4) s[2];
784
+ #if __CL_HAS_ANON_STRUCT__
785
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
786
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
787
+ __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
788
+ #endif
789
+ #if defined( __CL_USHORT2__)
790
+ __cl_ushort2 v2;
791
+ #endif
792
+ }cl_ushort2;
793
+
794
+ typedef union
795
+ {
796
+ cl_ushort CL_ALIGNED(8) s[4];
797
+ #if __CL_HAS_ANON_STRUCT__
798
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
799
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
800
+ __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
801
+ #endif
802
+ #if defined( __CL_USHORT2__)
803
+ __cl_ushort2 v2[2];
804
+ #endif
805
+ #if defined( __CL_USHORT4__)
806
+ __cl_ushort4 v4;
807
+ #endif
808
+ }cl_ushort4;
809
+
810
+ /* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
811
+ typedef cl_ushort4 cl_ushort3;
812
+
813
+ typedef union
814
+ {
815
+ cl_ushort CL_ALIGNED(16) s[8];
816
+ #if __CL_HAS_ANON_STRUCT__
817
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
818
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
819
+ __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
820
+ #endif
821
+ #if defined( __CL_USHORT2__)
822
+ __cl_ushort2 v2[4];
823
+ #endif
824
+ #if defined( __CL_USHORT4__)
825
+ __cl_ushort4 v4[2];
826
+ #endif
827
+ #if defined( __CL_USHORT8__ )
828
+ __cl_ushort8 v8;
829
+ #endif
830
+ }cl_ushort8;
831
+
832
+ typedef union
833
+ {
834
+ cl_ushort CL_ALIGNED(32) s[16];
835
+ #if __CL_HAS_ANON_STRUCT__
836
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
837
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
838
+ __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
839
+ #endif
840
+ #if defined( __CL_USHORT2__)
841
+ __cl_ushort2 v2[8];
842
+ #endif
843
+ #if defined( __CL_USHORT4__)
844
+ __cl_ushort4 v4[4];
845
+ #endif
846
+ #if defined( __CL_USHORT8__ )
847
+ __cl_ushort8 v8[2];
848
+ #endif
849
+ #if defined( __CL_USHORT16__ )
850
+ __cl_ushort16 v16;
851
+ #endif
852
+ }cl_ushort16;
853
+
854
+
855
+ /* ---- cl_halfn ---- */
856
+ typedef union
857
+ {
858
+ cl_half CL_ALIGNED(4) s[2];
859
+ #if __CL_HAS_ANON_STRUCT__
860
+ __CL_ANON_STRUCT__ struct{ cl_half x, y; };
861
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1; };
862
+ __CL_ANON_STRUCT__ struct{ cl_half lo, hi; };
863
+ #endif
864
+ #if defined( __CL_HALF2__)
865
+ __cl_half2 v2;
866
+ #endif
867
+ }cl_half2;
868
+
869
+ typedef union
870
+ {
871
+ cl_half CL_ALIGNED(8) s[4];
872
+ #if __CL_HAS_ANON_STRUCT__
873
+ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
874
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; };
875
+ __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
876
+ #endif
877
+ #if defined( __CL_HALF2__)
878
+ __cl_half2 v2[2];
879
+ #endif
880
+ #if defined( __CL_HALF4__)
881
+ __cl_half4 v4;
882
+ #endif
883
+ }cl_half4;
884
+
885
+ /* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
886
+ typedef cl_half4 cl_half3;
887
+
888
+ typedef union
889
+ {
890
+ cl_half CL_ALIGNED(16) s[8];
891
+ #if __CL_HAS_ANON_STRUCT__
892
+ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
893
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; };
894
+ __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
895
+ #endif
896
+ #if defined( __CL_HALF2__)
897
+ __cl_half2 v2[4];
898
+ #endif
899
+ #if defined( __CL_HALF4__)
900
+ __cl_half4 v4[2];
901
+ #endif
902
+ #if defined( __CL_HALF8__ )
903
+ __cl_half8 v8;
904
+ #endif
905
+ }cl_half8;
906
+
907
+ typedef union
908
+ {
909
+ cl_half CL_ALIGNED(32) s[16];
910
+ #if __CL_HAS_ANON_STRUCT__
911
+ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
912
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
913
+ __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
914
+ #endif
915
+ #if defined( __CL_HALF2__)
916
+ __cl_half2 v2[8];
917
+ #endif
918
+ #if defined( __CL_HALF4__)
919
+ __cl_half4 v4[4];
920
+ #endif
921
+ #if defined( __CL_HALF8__ )
922
+ __cl_half8 v8[2];
923
+ #endif
924
+ #if defined( __CL_HALF16__ )
925
+ __cl_half16 v16;
926
+ #endif
927
+ }cl_half16;
928
+
929
+ /* ---- cl_intn ---- */
930
+ typedef union
931
+ {
932
+ cl_int CL_ALIGNED(8) s[2];
933
+ #if __CL_HAS_ANON_STRUCT__
934
+ __CL_ANON_STRUCT__ struct{ cl_int x, y; };
935
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
936
+ __CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
937
+ #endif
938
+ #if defined( __CL_INT2__)
939
+ __cl_int2 v2;
940
+ #endif
941
+ }cl_int2;
942
+
943
+ typedef union
944
+ {
945
+ cl_int CL_ALIGNED(16) s[4];
946
+ #if __CL_HAS_ANON_STRUCT__
947
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
948
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
949
+ __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
950
+ #endif
951
+ #if defined( __CL_INT2__)
952
+ __cl_int2 v2[2];
953
+ #endif
954
+ #if defined( __CL_INT4__)
955
+ __cl_int4 v4;
956
+ #endif
957
+ }cl_int4;
958
+
959
+ /* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
960
+ typedef cl_int4 cl_int3;
961
+
962
+ typedef union
963
+ {
964
+ cl_int CL_ALIGNED(32) s[8];
965
+ #if __CL_HAS_ANON_STRUCT__
966
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
967
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
968
+ __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
969
+ #endif
970
+ #if defined( __CL_INT2__)
971
+ __cl_int2 v2[4];
972
+ #endif
973
+ #if defined( __CL_INT4__)
974
+ __cl_int4 v4[2];
975
+ #endif
976
+ #if defined( __CL_INT8__ )
977
+ __cl_int8 v8;
978
+ #endif
979
+ }cl_int8;
980
+
981
+ typedef union
982
+ {
983
+ cl_int CL_ALIGNED(64) s[16];
984
+ #if __CL_HAS_ANON_STRUCT__
985
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
986
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
987
+ __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
988
+ #endif
989
+ #if defined( __CL_INT2__)
990
+ __cl_int2 v2[8];
991
+ #endif
992
+ #if defined( __CL_INT4__)
993
+ __cl_int4 v4[4];
994
+ #endif
995
+ #if defined( __CL_INT8__ )
996
+ __cl_int8 v8[2];
997
+ #endif
998
+ #if defined( __CL_INT16__ )
999
+ __cl_int16 v16;
1000
+ #endif
1001
+ }cl_int16;
1002
+
1003
+
1004
+ /* ---- cl_uintn ---- */
1005
+ typedef union
1006
+ {
1007
+ cl_uint CL_ALIGNED(8) s[2];
1008
+ #if __CL_HAS_ANON_STRUCT__
1009
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y; };
1010
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
1011
+ __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
1012
+ #endif
1013
+ #if defined( __CL_UINT2__)
1014
+ __cl_uint2 v2;
1015
+ #endif
1016
+ }cl_uint2;
1017
+
1018
+ typedef union
1019
+ {
1020
+ cl_uint CL_ALIGNED(16) s[4];
1021
+ #if __CL_HAS_ANON_STRUCT__
1022
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
1023
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
1024
+ __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
1025
+ #endif
1026
+ #if defined( __CL_UINT2__)
1027
+ __cl_uint2 v2[2];
1028
+ #endif
1029
+ #if defined( __CL_UINT4__)
1030
+ __cl_uint4 v4;
1031
+ #endif
1032
+ }cl_uint4;
1033
+
1034
+ /* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
1035
+ typedef cl_uint4 cl_uint3;
1036
+
1037
+ typedef union
1038
+ {
1039
+ cl_uint CL_ALIGNED(32) s[8];
1040
+ #if __CL_HAS_ANON_STRUCT__
1041
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
1042
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
1043
+ __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
1044
+ #endif
1045
+ #if defined( __CL_UINT2__)
1046
+ __cl_uint2 v2[4];
1047
+ #endif
1048
+ #if defined( __CL_UINT4__)
1049
+ __cl_uint4 v4[2];
1050
+ #endif
1051
+ #if defined( __CL_UINT8__ )
1052
+ __cl_uint8 v8;
1053
+ #endif
1054
+ }cl_uint8;
1055
+
1056
+ typedef union
1057
+ {
1058
+ cl_uint CL_ALIGNED(64) s[16];
1059
+ #if __CL_HAS_ANON_STRUCT__
1060
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1061
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1062
+ __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
1063
+ #endif
1064
+ #if defined( __CL_UINT2__)
1065
+ __cl_uint2 v2[8];
1066
+ #endif
1067
+ #if defined( __CL_UINT4__)
1068
+ __cl_uint4 v4[4];
1069
+ #endif
1070
+ #if defined( __CL_UINT8__ )
1071
+ __cl_uint8 v8[2];
1072
+ #endif
1073
+ #if defined( __CL_UINT16__ )
1074
+ __cl_uint16 v16;
1075
+ #endif
1076
+ }cl_uint16;
1077
+
1078
+ /* ---- cl_longn ---- */
1079
+ typedef union
1080
+ {
1081
+ cl_long CL_ALIGNED(16) s[2];
1082
+ #if __CL_HAS_ANON_STRUCT__
1083
+ __CL_ANON_STRUCT__ struct{ cl_long x, y; };
1084
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
1085
+ __CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
1086
+ #endif
1087
+ #if defined( __CL_LONG2__)
1088
+ __cl_long2 v2;
1089
+ #endif
1090
+ }cl_long2;
1091
+
1092
+ typedef union
1093
+ {
1094
+ cl_long CL_ALIGNED(32) s[4];
1095
+ #if __CL_HAS_ANON_STRUCT__
1096
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
1097
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
1098
+ __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
1099
+ #endif
1100
+ #if defined( __CL_LONG2__)
1101
+ __cl_long2 v2[2];
1102
+ #endif
1103
+ #if defined( __CL_LONG4__)
1104
+ __cl_long4 v4;
1105
+ #endif
1106
+ }cl_long4;
1107
+
1108
+ /* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
1109
+ typedef cl_long4 cl_long3;
1110
+
1111
+ typedef union
1112
+ {
1113
+ cl_long CL_ALIGNED(64) s[8];
1114
+ #if __CL_HAS_ANON_STRUCT__
1115
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
1116
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
1117
+ __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
1118
+ #endif
1119
+ #if defined( __CL_LONG2__)
1120
+ __cl_long2 v2[4];
1121
+ #endif
1122
+ #if defined( __CL_LONG4__)
1123
+ __cl_long4 v4[2];
1124
+ #endif
1125
+ #if defined( __CL_LONG8__ )
1126
+ __cl_long8 v8;
1127
+ #endif
1128
+ }cl_long8;
1129
+
1130
+ typedef union
1131
+ {
1132
+ cl_long CL_ALIGNED(128) s[16];
1133
+ #if __CL_HAS_ANON_STRUCT__
1134
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1135
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1136
+ __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
1137
+ #endif
1138
+ #if defined( __CL_LONG2__)
1139
+ __cl_long2 v2[8];
1140
+ #endif
1141
+ #if defined( __CL_LONG4__)
1142
+ __cl_long4 v4[4];
1143
+ #endif
1144
+ #if defined( __CL_LONG8__ )
1145
+ __cl_long8 v8[2];
1146
+ #endif
1147
+ #if defined( __CL_LONG16__ )
1148
+ __cl_long16 v16;
1149
+ #endif
1150
+ }cl_long16;
1151
+
1152
+
1153
+ /* ---- cl_ulongn ---- */
1154
+ typedef union
1155
+ {
1156
+ cl_ulong CL_ALIGNED(16) s[2];
1157
+ #if __CL_HAS_ANON_STRUCT__
1158
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
1159
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
1160
+ __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
1161
+ #endif
1162
+ #if defined( __CL_ULONG2__)
1163
+ __cl_ulong2 v2;
1164
+ #endif
1165
+ }cl_ulong2;
1166
+
1167
+ typedef union
1168
+ {
1169
+ cl_ulong CL_ALIGNED(32) s[4];
1170
+ #if __CL_HAS_ANON_STRUCT__
1171
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
1172
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
1173
+ __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
1174
+ #endif
1175
+ #if defined( __CL_ULONG2__)
1176
+ __cl_ulong2 v2[2];
1177
+ #endif
1178
+ #if defined( __CL_ULONG4__)
1179
+ __cl_ulong4 v4;
1180
+ #endif
1181
+ }cl_ulong4;
1182
+
1183
+ /* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
1184
+ typedef cl_ulong4 cl_ulong3;
1185
+
1186
+ typedef union
1187
+ {
1188
+ cl_ulong CL_ALIGNED(64) s[8];
1189
+ #if __CL_HAS_ANON_STRUCT__
1190
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
1191
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
1192
+ __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
1193
+ #endif
1194
+ #if defined( __CL_ULONG2__)
1195
+ __cl_ulong2 v2[4];
1196
+ #endif
1197
+ #if defined( __CL_ULONG4__)
1198
+ __cl_ulong4 v4[2];
1199
+ #endif
1200
+ #if defined( __CL_ULONG8__ )
1201
+ __cl_ulong8 v8;
1202
+ #endif
1203
+ }cl_ulong8;
1204
+
1205
+ typedef union
1206
+ {
1207
+ cl_ulong CL_ALIGNED(128) s[16];
1208
+ #if __CL_HAS_ANON_STRUCT__
1209
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1210
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1211
+ __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
1212
+ #endif
1213
+ #if defined( __CL_ULONG2__)
1214
+ __cl_ulong2 v2[8];
1215
+ #endif
1216
+ #if defined( __CL_ULONG4__)
1217
+ __cl_ulong4 v4[4];
1218
+ #endif
1219
+ #if defined( __CL_ULONG8__ )
1220
+ __cl_ulong8 v8[2];
1221
+ #endif
1222
+ #if defined( __CL_ULONG16__ )
1223
+ __cl_ulong16 v16;
1224
+ #endif
1225
+ }cl_ulong16;
1226
+
1227
+
1228
+ /* --- cl_floatn ---- */
1229
+
1230
+ typedef union
1231
+ {
1232
+ cl_float CL_ALIGNED(8) s[2];
1233
+ #if __CL_HAS_ANON_STRUCT__
1234
+ __CL_ANON_STRUCT__ struct{ cl_float x, y; };
1235
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
1236
+ __CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
1237
+ #endif
1238
+ #if defined( __CL_FLOAT2__)
1239
+ __cl_float2 v2;
1240
+ #endif
1241
+ }cl_float2;
1242
+
1243
+ typedef union
1244
+ {
1245
+ cl_float CL_ALIGNED(16) s[4];
1246
+ #if __CL_HAS_ANON_STRUCT__
1247
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
1248
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
1249
+ __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
1250
+ #endif
1251
+ #if defined( __CL_FLOAT2__)
1252
+ __cl_float2 v2[2];
1253
+ #endif
1254
+ #if defined( __CL_FLOAT4__)
1255
+ __cl_float4 v4;
1256
+ #endif
1257
+ }cl_float4;
1258
+
1259
+ /* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
1260
+ typedef cl_float4 cl_float3;
1261
+
1262
+ typedef union
1263
+ {
1264
+ cl_float CL_ALIGNED(32) s[8];
1265
+ #if __CL_HAS_ANON_STRUCT__
1266
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
1267
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
1268
+ __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
1269
+ #endif
1270
+ #if defined( __CL_FLOAT2__)
1271
+ __cl_float2 v2[4];
1272
+ #endif
1273
+ #if defined( __CL_FLOAT4__)
1274
+ __cl_float4 v4[2];
1275
+ #endif
1276
+ #if defined( __CL_FLOAT8__ )
1277
+ __cl_float8 v8;
1278
+ #endif
1279
+ }cl_float8;
1280
+
1281
+ typedef union
1282
+ {
1283
+ cl_float CL_ALIGNED(64) s[16];
1284
+ #if __CL_HAS_ANON_STRUCT__
1285
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1286
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1287
+ __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
1288
+ #endif
1289
+ #if defined( __CL_FLOAT2__)
1290
+ __cl_float2 v2[8];
1291
+ #endif
1292
+ #if defined( __CL_FLOAT4__)
1293
+ __cl_float4 v4[4];
1294
+ #endif
1295
+ #if defined( __CL_FLOAT8__ )
1296
+ __cl_float8 v8[2];
1297
+ #endif
1298
+ #if defined( __CL_FLOAT16__ )
1299
+ __cl_float16 v16;
1300
+ #endif
1301
+ }cl_float16;
1302
+
1303
+ /* --- cl_doublen ---- */
1304
+
1305
+ typedef union
1306
+ {
1307
+ cl_double CL_ALIGNED(16) s[2];
1308
+ #if __CL_HAS_ANON_STRUCT__
1309
+ __CL_ANON_STRUCT__ struct{ cl_double x, y; };
1310
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
1311
+ __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
1312
+ #endif
1313
+ #if defined( __CL_DOUBLE2__)
1314
+ __cl_double2 v2;
1315
+ #endif
1316
+ }cl_double2;
1317
+
1318
+ typedef union
1319
+ {
1320
+ cl_double CL_ALIGNED(32) s[4];
1321
+ #if __CL_HAS_ANON_STRUCT__
1322
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
1323
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
1324
+ __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
1325
+ #endif
1326
+ #if defined( __CL_DOUBLE2__)
1327
+ __cl_double2 v2[2];
1328
+ #endif
1329
+ #if defined( __CL_DOUBLE4__)
1330
+ __cl_double4 v4;
1331
+ #endif
1332
+ }cl_double4;
1333
+
1334
+ /* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
1335
+ typedef cl_double4 cl_double3;
1336
+
1337
+ typedef union
1338
+ {
1339
+ cl_double CL_ALIGNED(64) s[8];
1340
+ #if __CL_HAS_ANON_STRUCT__
1341
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
1342
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
1343
+ __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
1344
+ #endif
1345
+ #if defined( __CL_DOUBLE2__)
1346
+ __cl_double2 v2[4];
1347
+ #endif
1348
+ #if defined( __CL_DOUBLE4__)
1349
+ __cl_double4 v4[2];
1350
+ #endif
1351
+ #if defined( __CL_DOUBLE8__ )
1352
+ __cl_double8 v8;
1353
+ #endif
1354
+ }cl_double8;
1355
+
1356
+ typedef union
1357
+ {
1358
+ cl_double CL_ALIGNED(128) s[16];
1359
+ #if __CL_HAS_ANON_STRUCT__
1360
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1361
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1362
+ __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
1363
+ #endif
1364
+ #if defined( __CL_DOUBLE2__)
1365
+ __cl_double2 v2[8];
1366
+ #endif
1367
+ #if defined( __CL_DOUBLE4__)
1368
+ __cl_double4 v4[4];
1369
+ #endif
1370
+ #if defined( __CL_DOUBLE8__ )
1371
+ __cl_double8 v8[2];
1372
+ #endif
1373
+ #if defined( __CL_DOUBLE16__ )
1374
+ __cl_double16 v16;
1375
+ #endif
1376
+ }cl_double16;
1377
+
1378
+ /* Macro to facilitate debugging
1379
+ * Usage:
1380
+ * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
1381
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
1382
+ * Each line thereafter of OpenCL C source must end with: \n\
1383
+ * The last line ends in ";
1384
+ *
1385
+ * Example:
1386
+ *
1387
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
1388
+ * kernel void foo( int a, float * b ) \n\
1389
+ * { \n\
1390
+ * // my comment \n\
1391
+ * *b[ get_global_id(0)] = a; \n\
1392
+ * } \n\
1393
+ * ";
1394
+ *
1395
+ * This should correctly set up the line, (column) and file information for your source
1396
+ * string so you can do source level debugging.
1397
+ */
1398
+ #define __CL_STRINGIFY( _x ) # _x
1399
+ #define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
1400
+ #define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
1401
+
1402
+ #ifdef __cplusplus
1403
+ }
1404
+ #endif
1405
+
1406
+ #undef __CL_HAS_ANON_STRUCT__
1407
+ #undef __CL_ANON_STRUCT__
1408
+ #if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
1409
+ #if _MSC_VER >=1500
1410
+ #pragma warning( pop )
1411
+ #endif
1412
+ #endif
1413
+
1414
+ #endif /* __CL_PLATFORM_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_H
18
+ #define __OPENCL_H
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
+ #ifdef __APPLE__
25
+ #include <OpenCL/cl.h>
26
+ #include <OpenCL/cl_gl.h>
27
+ #include <OpenCL/cl_gl_ext.h>
28
+ #include <OpenCL/cl_ext.h>
29
+ #else
30
+ #include <CL/cl.h>
31
+ #include <CL/cl_gl.h>
32
+ #include <CL/cl_gl_ext.h>
33
+ #include <CL/cl_ext.h>
34
+ #endif
35
+
36
+ #ifdef __cplusplus
37
+ }
38
+ #endif
39
+
40
+ #endif /* __OPENCL_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (228 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*******************************************************************************
51
+ * *
52
+ * *
53
+ * *
54
+ *******************************************************************************/
55
+
56
+ #include "device_types.h"
57
+ #if !defined(__CUDACC_RTC__)
58
+ #define EXCLUDE_FROM_RTC
59
+ #include "driver_types.h"
60
+ #undef EXCLUDE_FROM_RTC
61
+ #endif /* !__CUDACC_RTC__ */
62
+ #include "surface_types.h"
63
+ #include "texture_types.h"
64
+ #include "vector_types.h"
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CHANNEL_DESCRIPTOR_H__)
51
+ #define __CHANNEL_DESCRIPTOR_H__
52
+
53
+ #if defined(__cplusplus)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #include "cuda_runtime_api.h"
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ /**
70
+ * \addtogroup CUDART_HIGHLEVEL
71
+ *
72
+ * @{
73
+ */
74
+
75
+ /**
76
+ * \brief \hl Returns a channel descriptor using the specified format
77
+ *
78
+ * Returns a channel descriptor with format \p f and number of bits of each
79
+ * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
80
+ * defined as:
81
+ * \code
82
+ struct cudaChannelFormatDesc {
83
+ int x, y, z, w;
84
+ enum cudaChannelFormatKind f;
85
+ };
86
+ * \endcode
87
+ *
88
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
89
+ * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
90
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
91
+ * ::cudaChannelFormatKindSignedNormalized8X4,
92
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
93
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
94
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
95
+ * ::cudaChannelFormatKindSignedNormalized16X4,
96
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
97
+ * ::cudaChannelFormatKindUnsignedNormalized16X4
98
+ * or ::cudaChannelFormatKindNV12.
99
+ *
100
+ * The format is specified by the template specialization.
101
+ *
102
+ * The template function specializes for the following scalar types:
103
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
104
+ * The template function specializes for the following vector types:
105
+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
106
+ * The template function specializes for following cudaChannelFormatKind enum values:
107
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
108
+ *
109
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
110
+ *
111
+ * \return
112
+ * Channel descriptor with format \p f
113
+ *
114
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
115
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
116
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
117
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
118
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
119
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
120
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
121
+ * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
122
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
123
+ */
124
+ template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
125
+ {
126
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
127
+ }
128
+
129
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
130
+ {
131
+ int e = (int)sizeof(unsigned short) * 8;
132
+
133
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
134
+ }
135
+
136
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
137
+ {
138
+ int e = (int)sizeof(unsigned short) * 8;
139
+
140
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
141
+ }
142
+
143
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
144
+ {
145
+ int e = (int)sizeof(unsigned short) * 8;
146
+
147
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
148
+ }
149
+
150
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
151
+ {
152
+ int e = (int)sizeof(unsigned short) * 8;
153
+
154
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
155
+ }
156
+
157
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
158
+ {
159
+ int e = (int)sizeof(char) * 8;
160
+
161
+ #if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
162
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
163
+ #else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
164
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
165
+ #endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
166
+ }
167
+
168
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
169
+ {
170
+ int e = (int)sizeof(signed char) * 8;
171
+
172
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
173
+ }
174
+
175
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
176
+ {
177
+ int e = (int)sizeof(unsigned char) * 8;
178
+
179
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
180
+ }
181
+
182
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
183
+ {
184
+ int e = (int)sizeof(signed char) * 8;
185
+
186
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
187
+ }
188
+
189
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
190
+ {
191
+ int e = (int)sizeof(unsigned char) * 8;
192
+
193
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
194
+ }
195
+
196
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
197
+ {
198
+ int e = (int)sizeof(signed char) * 8;
199
+
200
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
201
+ }
202
+
203
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
204
+ {
205
+ int e = (int)sizeof(unsigned char) * 8;
206
+
207
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
208
+ }
209
+
210
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
211
+ {
212
+ int e = (int)sizeof(signed char) * 8;
213
+
214
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
215
+ }
216
+
217
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
218
+ {
219
+ int e = (int)sizeof(unsigned char) * 8;
220
+
221
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
222
+ }
223
+
224
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
225
+ {
226
+ int e = (int)sizeof(short) * 8;
227
+
228
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
229
+ }
230
+
231
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
232
+ {
233
+ int e = (int)sizeof(unsigned short) * 8;
234
+
235
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
236
+ }
237
+
238
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
239
+ {
240
+ int e = (int)sizeof(short) * 8;
241
+
242
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
243
+ }
244
+
245
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
246
+ {
247
+ int e = (int)sizeof(unsigned short) * 8;
248
+
249
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
250
+ }
251
+
252
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
253
+ {
254
+ int e = (int)sizeof(short) * 8;
255
+
256
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
257
+ }
258
+
259
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
260
+ {
261
+ int e = (int)sizeof(unsigned short) * 8;
262
+
263
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
264
+ }
265
+
266
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
267
+ {
268
+ int e = (int)sizeof(short) * 8;
269
+
270
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
271
+ }
272
+
273
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
274
+ {
275
+ int e = (int)sizeof(unsigned short) * 8;
276
+
277
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
278
+ }
279
+
280
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
281
+ {
282
+ int e = (int)sizeof(int) * 8;
283
+
284
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
285
+ }
286
+
287
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
288
+ {
289
+ int e = (int)sizeof(unsigned int) * 8;
290
+
291
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
292
+ }
293
+
294
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
295
+ {
296
+ int e = (int)sizeof(int) * 8;
297
+
298
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
299
+ }
300
+
301
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
302
+ {
303
+ int e = (int)sizeof(unsigned int) * 8;
304
+
305
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
306
+ }
307
+
308
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
309
+ {
310
+ int e = (int)sizeof(int) * 8;
311
+
312
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
313
+ }
314
+
315
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
316
+ {
317
+ int e = (int)sizeof(unsigned int) * 8;
318
+
319
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
320
+ }
321
+
322
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
323
+ {
324
+ int e = (int)sizeof(int) * 8;
325
+
326
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
327
+ }
328
+
329
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
330
+ {
331
+ int e = (int)sizeof(unsigned int) * 8;
332
+
333
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
334
+ }
335
+
336
+ #if !defined(__LP64__)
337
+
338
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
339
+ {
340
+ int e = (int)sizeof(long) * 8;
341
+
342
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
343
+ }
344
+
345
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
346
+ {
347
+ int e = (int)sizeof(unsigned long) * 8;
348
+
349
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
350
+ }
351
+
352
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
353
+ {
354
+ int e = (int)sizeof(long) * 8;
355
+
356
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
357
+ }
358
+
359
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
360
+ {
361
+ int e = (int)sizeof(unsigned long) * 8;
362
+
363
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
364
+ }
365
+
366
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
367
+ {
368
+ int e = (int)sizeof(long) * 8;
369
+
370
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
371
+ }
372
+
373
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
374
+ {
375
+ int e = (int)sizeof(unsigned long) * 8;
376
+
377
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
378
+ }
379
+
380
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
381
+ {
382
+ int e = (int)sizeof(long) * 8;
383
+
384
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
385
+ }
386
+
387
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
388
+ {
389
+ int e = (int)sizeof(unsigned long) * 8;
390
+
391
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
392
+ }
393
+
394
+ #endif /* !__LP64__ */
395
+
396
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
397
+ {
398
+ int e = (int)sizeof(float) * 8;
399
+
400
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
401
+ }
402
+
403
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
404
+ {
405
+ int e = (int)sizeof(float) * 8;
406
+
407
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
408
+ }
409
+
410
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
411
+ {
412
+ int e = (int)sizeof(float) * 8;
413
+
414
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
415
+ }
416
+
417
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
418
+ {
419
+ int e = (int)sizeof(float) * 8;
420
+
421
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
422
+ }
423
+
424
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
425
+ {
426
+ int e = (int)sizeof(char) * 8;
427
+
428
+ return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
429
+ }
430
+
431
+ template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
432
+ {
433
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
434
+ }
435
+
436
+ /* Signed 8-bit normalized integer formats */
437
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
438
+ {
439
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
440
+ }
441
+
442
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
443
+ {
444
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
445
+ }
446
+
447
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
448
+ {
449
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
450
+ }
451
+
452
+ /* Unsigned 8-bit normalized integer formats */
453
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
454
+ {
455
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
456
+ }
457
+
458
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
459
+ {
460
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
461
+ }
462
+
463
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
464
+ {
465
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
466
+ }
467
+
468
+ /* Signed 16-bit normalized integer formats */
469
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
470
+ {
471
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
472
+ }
473
+
474
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
475
+ {
476
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
477
+ }
478
+
479
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
480
+ {
481
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
482
+ }
483
+
484
+ /* Unsigned 16-bit normalized integer formats */
485
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
486
+ {
487
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
488
+ }
489
+
490
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
491
+ {
492
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
493
+ }
494
+
495
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
496
+ {
497
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
498
+ }
499
+
500
+ /* NV12 format */
501
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
502
+ {
503
+ return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
504
+ }
505
+
506
+ /* BC1 format */
507
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
508
+ {
509
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
510
+ }
511
+
512
+ /* BC1sRGB format */
513
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
514
+ {
515
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
516
+ }
517
+
518
+ /* BC2 format */
519
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
520
+ {
521
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
522
+ }
523
+
524
+ /* BC2sRGB format */
525
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
526
+ {
527
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
528
+ }
529
+
530
+ /* BC3 format */
531
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
532
+ {
533
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
534
+ }
535
+
536
+ /* BC3sRGB format */
537
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
538
+ {
539
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
540
+ }
541
+
542
+ /* BC4 unsigned format */
543
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
544
+ {
545
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
546
+ }
547
+
548
+ /* BC4 signed format */
549
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
550
+ {
551
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
552
+ }
553
+
554
+ /* BC5 unsigned format */
555
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
556
+ {
557
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
558
+ }
559
+
560
+ /* BC5 signed format */
561
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
562
+ {
563
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
564
+ }
565
+
566
+ /* BC6H unsigned format */
567
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
568
+ {
569
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
570
+ }
571
+
572
+ /* BC6H signed format */
573
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
574
+ {
575
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
576
+ }
577
+
578
+ /* BC7 format */
579
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
580
+ {
581
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
582
+ }
583
+
584
+ /* BC7sRGB format */
585
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
586
+ {
587
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
588
+ }
589
+
590
+ #endif /* __cplusplus */
591
+
592
+ /** @} */
593
+ /** @} */ /* END CUDART_TEXTURE_HL */
594
+
595
+ #endif /* !__CHANNEL_DESCRIPTOR_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h ADDED
@@ -0,0 +1,1828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _COOPERATIVE_GROUPS_H_
51
+ #define _COOPERATIVE_GROUPS_H_
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ #include "cooperative_groups/details/info.h"
56
+ #include "cooperative_groups/details/driver_abi.h"
57
+ #include "cooperative_groups/details/helpers.h"
58
+
59
+ #if defined(_CG_HAS_STL_ATOMICS)
60
+ #include <cuda/atomic>
61
+ #define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
62
+ #else
63
+ #define _CG_THREAD_SCOPE(scope)
64
+ #endif
65
+
66
+ _CG_BEGIN_NAMESPACE
67
+
68
+ namespace details {
69
+ _CG_CONST_DECL unsigned int coalesced_group_id = 1;
70
+ _CG_CONST_DECL unsigned int multi_grid_group_id = 2;
71
+ _CG_CONST_DECL unsigned int grid_group_id = 3;
72
+ _CG_CONST_DECL unsigned int thread_block_id = 4;
73
+ _CG_CONST_DECL unsigned int multi_tile_group_id = 5;
74
+ _CG_CONST_DECL unsigned int cluster_group_id = 6;
75
+ }
76
+
77
+ /**
78
+ * class thread_group;
79
+ *
80
+ * Generic thread group type, into which all groups are convertible.
81
+ * It acts as a container for all storage necessary for the derived groups,
82
+ * and will dispatch the API calls to the correct derived group. This means
83
+ * that all derived groups must implement the same interface as thread_group.
84
+ */
85
+ class thread_group
86
+ {
87
+ protected:
88
+ struct group_data {
89
+ unsigned int _unused : 1;
90
+ unsigned int type : 7, : 0;
91
+ };
92
+
93
+ struct gg_data {
94
+ details::grid_workspace *gridWs;
95
+ };
96
+
97
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
98
+ struct mg_data {
99
+ unsigned long long _unused : 1;
100
+ unsigned long long type : 7;
101
+ unsigned long long handle : 56;
102
+ const details::multi_grid::multi_grid_functions *functions;
103
+ };
104
+ #endif
105
+
106
+ struct tg_data {
107
+ unsigned int is_tiled : 1;
108
+ unsigned int type : 7;
109
+ unsigned int size : 24;
110
+ // packed to 4b
111
+ unsigned int metaGroupSize : 16;
112
+ unsigned int metaGroupRank : 16;
113
+ // packed to 8b
114
+ unsigned int mask;
115
+ // packed to 12b
116
+ unsigned int _res;
117
+ };
118
+
119
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
120
+ friend class thread_block;
121
+
122
+ union __align__(8) {
123
+ group_data group;
124
+ tg_data coalesced;
125
+ gg_data grid;
126
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
127
+ mg_data multi_grid;
128
+ #endif
129
+ } _data;
130
+
131
+ _CG_QUALIFIER thread_group operator=(const thread_group& src);
132
+
133
+ _CG_QUALIFIER thread_group(unsigned int type) {
134
+ _data.group.type = type;
135
+ _data.group._unused = false;
136
+ }
137
+
138
+ #ifdef _CG_CPP11_FEATURES
139
+ static_assert(sizeof(tg_data) <= 16, "Failed size check");
140
+ static_assert(sizeof(gg_data) <= 16, "Failed size check");
141
+ # ifdef _CG_ABI_EXPERIMENTAL
142
+ static_assert(sizeof(mg_data) <= 16, "Failed size check");
143
+ # endif
144
+ #endif
145
+
146
+ public:
147
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
148
+
149
+ _CG_QUALIFIER unsigned long long size() const;
150
+ _CG_QUALIFIER unsigned long long num_threads() const;
151
+ _CG_QUALIFIER unsigned long long thread_rank() const;
152
+ _CG_QUALIFIER void sync() const;
153
+ _CG_QUALIFIER unsigned int get_type() const {
154
+ return _data.group.type;
155
+ }
156
+
157
+ };
158
+
159
+ template <unsigned int TyId>
160
+ struct thread_group_base : public thread_group {
161
+ _CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
162
+ _CG_STATIC_CONST_DECL unsigned int id = TyId;
163
+ };
164
+
165
+ #if defined(_CG_HAS_MULTI_GRID_GROUP)
166
+
167
+ /**
168
+ * class multi_grid_group;
169
+ *
170
+ * Threads within this this group are guaranteed to be co-resident on the
171
+ * same system, on multiple devices within the same launched kernels.
172
+ * To use this group, the kernel must have been launched with
173
+ * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
174
+ * and the device must support it (queryable device attribute).
175
+ *
176
+ * Constructed via this_multi_grid();
177
+ */
178
+
179
+
180
+ # if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
181
+ class multi_grid_group;
182
+
183
+ // Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
184
+ template <typename = void>
185
+ __device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
186
+
187
+ class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
188
+ {
189
+ private:
190
+ template <typename = void>
191
+ _CG_QUALIFIER multi_grid_group() {
192
+ _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
193
+ _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
194
+ }
195
+
196
+ friend multi_grid_group this_multi_grid<void>();
197
+
198
+ public:
199
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
200
+
201
+ _CG_QUALIFIER bool is_valid() const {
202
+ return (_data.multi_grid.handle != 0);
203
+ }
204
+
205
+ _CG_QUALIFIER void sync() const {
206
+ if (!is_valid()) {
207
+ _CG_ABORT();
208
+ }
209
+ _data.multi_grid.functions->sync(_data.multi_grid.handle);
210
+ }
211
+
212
+ _CG_QUALIFIER unsigned long long num_threads() const {
213
+ _CG_ASSERT(is_valid());
214
+ return _data.multi_grid.functions->size(_data.multi_grid.handle);
215
+ }
216
+
217
+ _CG_QUALIFIER unsigned long long size() const {
218
+ return num_threads();
219
+ }
220
+
221
+ _CG_QUALIFIER unsigned long long thread_rank() const {
222
+ _CG_ASSERT(is_valid());
223
+ return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
224
+ }
225
+
226
+ _CG_QUALIFIER unsigned int grid_rank() const {
227
+ _CG_ASSERT(is_valid());
228
+ return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
229
+ }
230
+
231
+ _CG_QUALIFIER unsigned int num_grids() const {
232
+ _CG_ASSERT(is_valid());
233
+ return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
234
+ }
235
+ };
236
+ # else
237
+ class multi_grid_group
238
+ {
239
+ private:
240
+ unsigned long long _handle;
241
+ unsigned int _size;
242
+ unsigned int _rank;
243
+
244
+ friend _CG_QUALIFIER multi_grid_group this_multi_grid();
245
+
246
+ _CG_QUALIFIER multi_grid_group() {
247
+ _handle = details::multi_grid::get_intrinsic_handle();
248
+ _size = details::multi_grid::size(_handle);
249
+ _rank = details::multi_grid::thread_rank(_handle);
250
+ }
251
+
252
+ public:
253
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
254
+
255
+ _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
256
+ return (_handle != 0);
257
+ }
258
+
259
+ _CG_QUALIFIER _CG_DEPRECATED void sync() const {
260
+ if (!is_valid()) {
261
+ _CG_ABORT();
262
+ }
263
+ details::multi_grid::sync(_handle);
264
+ }
265
+
266
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
267
+ _CG_ASSERT(is_valid());
268
+ return _size;
269
+ }
270
+
271
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
272
+ return num_threads();
273
+ }
274
+
275
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
276
+ _CG_ASSERT(is_valid());
277
+ return _rank;
278
+ }
279
+
280
+ _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
281
+ _CG_ASSERT(is_valid());
282
+ return (details::multi_grid::grid_rank(_handle));
283
+ }
284
+
285
+ _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
286
+ _CG_ASSERT(is_valid());
287
+ return (details::multi_grid::num_grids(_handle));
288
+ }
289
+ };
290
+ # endif
291
+
292
+ /**
293
+ * multi_grid_group this_multi_grid()
294
+ *
295
+ * Constructs a multi_grid_group
296
+ */
297
+ # if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
298
+ template <typename>
299
+ __device__
300
+ #else
301
+ _CG_QUALIFIER
302
+ # endif
303
+ _CG_DEPRECATED
304
+ multi_grid_group this_multi_grid()
305
+ {
306
+ return multi_grid_group();
307
+ }
308
+ #endif
309
+
310
+ /**
311
+ * class grid_group;
312
+ *
313
+ * Threads within this this group are guaranteed to be co-resident on the
314
+ * same device within the same launched kernel. To use this group, the kernel
315
+ * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
316
+ * and the device must support it (queryable device attribute).
317
+ *
318
+ * Constructed via this_grid();
319
+ */
320
+ class grid_group : public thread_group_base<details::grid_group_id>
321
+ {
322
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
323
+ friend _CG_QUALIFIER grid_group this_grid();
324
+
325
+ private:
326
+ _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
327
+ _data.grid.gridWs = gridWs;
328
+ }
329
+
330
+ public:
331
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
332
+
333
+ _CG_QUALIFIER bool is_valid() const {
334
+ return (_data.grid.gridWs != NULL);
335
+ }
336
+
337
+ _CG_QUALIFIER void sync() const {
338
+ if (!is_valid()) {
339
+ _CG_ABORT();
340
+ }
341
+ details::grid::sync(&_data.grid.gridWs->barrier);
342
+ }
343
+
344
+ _CG_STATIC_QUALIFIER unsigned long long size() {
345
+ return details::grid::size();
346
+ }
347
+
348
+ _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
349
+ return details::grid::thread_rank();
350
+ }
351
+
352
+ _CG_STATIC_QUALIFIER dim3 group_dim() {
353
+ return details::grid::grid_dim();
354
+ }
355
+
356
+ _CG_STATIC_QUALIFIER unsigned long long num_threads() {
357
+ return details::grid::num_threads();
358
+ }
359
+
360
+ _CG_STATIC_QUALIFIER dim3 dim_blocks() {
361
+ return details::grid::dim_blocks();
362
+ }
363
+
364
+ _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
365
+ return details::grid::num_blocks();
366
+ }
367
+
368
+ _CG_STATIC_QUALIFIER dim3 block_index() {
369
+ return details::grid::block_index();
370
+ }
371
+
372
+ _CG_STATIC_QUALIFIER unsigned long long block_rank() {
373
+ return details::grid::block_rank();
374
+ }
375
+
376
+ # if defined(_CG_HAS_CLUSTER_GROUP)
377
+ _CG_STATIC_QUALIFIER dim3 dim_clusters() {
378
+ return details::grid::dim_clusters();
379
+ }
380
+
381
+ _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
382
+ return details::grid::num_clusters();
383
+ }
384
+
385
+ _CG_STATIC_QUALIFIER dim3 cluster_index() {
386
+ return details::grid::cluster_index();
387
+ }
388
+
389
+ _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
390
+ return details::grid::cluster_rank();
391
+ }
392
+ # endif
393
+ };
394
+
395
+ _CG_QUALIFIER grid_group this_grid() {
396
+ // Load a workspace from the driver
397
+ grid_group gg(details::get_grid_workspace());
398
+ #ifdef _CG_DEBUG
399
+ // *all* threads must be available to synchronize
400
+ gg.sync();
401
+ #endif // _CG_DEBUG
402
+ return gg;
403
+ }
404
+
405
+ #if defined(_CG_HAS_CLUSTER_GROUP)
406
+ /**
407
+ * class cluster_group
408
+ *
409
+ * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
410
+ * divided along all dimensions to form groups of blocks, each group of which is
411
+ * a block cluster. Clustered grids are subject to various restrictions and
412
+ * limitations. Primarily, a cluster consists of at most 8 blocks by default
413
+ * (although the user is allowed to opt-in to non-standard sizes,) and clustered
414
+ * grids are subject to additional occupancy limitations due to per-cluster
415
+ * hardware resource consumption. In exchange, a block cluster is guaranteed to
416
+ * be a cooperative group, with access to all cooperative group capabilities, as
417
+ * well as cluster specific capabilities and accelerations. A cluster_group
418
+ * represents a block cluster.
419
+ *
420
+ * Constructed via this_cluster_group();
421
+ */
422
+ class cluster_group : public thread_group_base<details::cluster_group_id>
423
+ {
424
+ // Friends
425
+ friend _CG_QUALIFIER cluster_group this_cluster();
426
+
427
+ // Disable constructor
428
+ _CG_QUALIFIER cluster_group()
429
+ {
430
+ }
431
+
432
+ public:
433
+ //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
434
+
435
+ // Functionality exposed by the group
436
+ _CG_STATIC_QUALIFIER void sync()
437
+ {
438
+ return details::cluster::sync();
439
+ }
440
+
441
+ _CG_STATIC_QUALIFIER void barrier_arrive()
442
+ {
443
+ return details::cluster::barrier_arrive();
444
+ }
445
+
446
+ _CG_STATIC_QUALIFIER void barrier_wait()
447
+ {
448
+ return details::cluster::barrier_wait();
449
+ }
450
+
451
+ _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
452
+ {
453
+ return details::cluster::query_shared_rank(addr);
454
+ }
455
+
456
+ template <typename T>
457
+ _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
458
+ {
459
+ return details::cluster::map_shared_rank(addr, rank);
460
+ }
461
+
462
+ _CG_STATIC_QUALIFIER dim3 block_index()
463
+ {
464
+ return details::cluster::block_index();
465
+ }
466
+
467
+ _CG_STATIC_QUALIFIER unsigned int block_rank()
468
+ {
469
+ return details::cluster::block_rank();
470
+ }
471
+
472
+ _CG_STATIC_QUALIFIER unsigned int thread_rank()
473
+ {
474
+ return details::cluster::thread_rank();
475
+ }
476
+
477
+ _CG_STATIC_QUALIFIER dim3 dim_blocks()
478
+ {
479
+ return details::cluster::dim_blocks();
480
+ }
481
+
482
+ _CG_STATIC_QUALIFIER unsigned int num_blocks()
483
+ {
484
+ return details::cluster::num_blocks();
485
+ }
486
+
487
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
488
+ {
489
+ return details::cluster::dim_threads();
490
+ }
491
+
492
+ _CG_STATIC_QUALIFIER unsigned int num_threads()
493
+ {
494
+ return details::cluster::num_threads();
495
+ }
496
+
497
+ // Legacy aliases
498
+ _CG_STATIC_QUALIFIER unsigned int size()
499
+ {
500
+ return num_threads();
501
+ }
502
+ };
503
+
504
+ /*
505
+ * cluster_group this_cluster()
506
+ *
507
+ * Constructs a cluster_group
508
+ */
509
+ _CG_QUALIFIER cluster_group this_cluster()
510
+ {
511
+ cluster_group cg;
512
+ #ifdef _CG_DEBUG
513
+ cg.sync();
514
+ #endif
515
+ return cg;
516
+ }
517
+ #endif
518
+
519
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
520
+ namespace details {
521
+
522
+ _CG_CONSTEXPR_QUALIFIER unsigned int scratch_sync_memory_size(unsigned int max_block_size) {
523
+ // One barrier per possible size of the group rounded up to multiple of 4.
524
+ return 8 * sizeof(details::barrier_t);
525
+ }
526
+
527
+ _CG_CONSTEXPR_QUALIFIER unsigned int scratch_collectives_memory_size(unsigned int communication_size, unsigned int max_block_size) {
528
+ // One slot of collectives memory per warp.
529
+ return max_block_size / 32 * communication_size;
530
+ }
531
+
532
+ _CG_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int communication_size, unsigned int max_block_size) {
533
+ return scratch_sync_memory_size(max_block_size) + scratch_collectives_memory_size(communication_size, max_block_size);
534
+ }
535
+
536
+ _CG_CONSTEXPR_QUALIFIER size_t scratch_alignment(unsigned int communication_size) {
537
+ return ((communication_size & (communication_size - 1) == 0) && communication_size > 8) ?
538
+ communication_size : 8;
539
+ }
540
+
541
+ _CG_CONST_DECL unsigned int default_tile_communication_size = 8;
542
+ _CG_CONST_DECL unsigned int default_max_block_size = 1024;
543
+
544
+ struct multi_warp_scratch {
545
+ char memory[1];
546
+ };
547
+ }
548
+
549
+ class thread_block;
550
+ namespace experimental {
551
+ template <unsigned int TileCommunicationSize = details::default_tile_communication_size,
552
+ unsigned int MaxBlockSize = details::default_max_block_size>
553
+ struct __align__(details::scratch_alignment(TileCommunicationSize)) block_tile_memory {
554
+ private:
555
+ char scratch[details::scratch_size_needed(TileCommunicationSize, MaxBlockSize)];
556
+
557
+ public:
558
+ _CG_QUALIFIER void* get_memory() {
559
+ return static_cast<void*>(scratch);
560
+ }
561
+
562
+ _CG_STATIC_QUALIFIER unsigned int get_size() {
563
+ return details::scratch_size_needed(TileCommunicationSize, MaxBlockSize);
564
+ }
565
+ };
566
+
567
+ template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
568
+ _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
569
+ }
570
+ #endif
571
+
572
+ /**
573
+ * class thread_block
574
+ *
575
+ * Every GPU kernel is executed by a grid of thread blocks, and threads within
576
+ * each block are guaranteed to reside on the same streaming multiprocessor.
577
+ * A thread_block represents a thread block whose dimensions are not known until runtime.
578
+ *
579
+ * Constructed via this_thread_block();
580
+ */
581
+ class thread_block : public thread_group_base<details::thread_block_id>
582
+ {
583
+ // Friends
584
+ friend _CG_QUALIFIER thread_block this_thread_block();
585
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
586
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
587
+
588
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
589
+ template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
590
+ friend _CG_QUALIFIER thread_block experimental::this_thread_block(
591
+ experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
592
+
593
+ const unsigned short communication_size;
594
+ const unsigned short max_block_size;
595
+ details::multi_warp_scratch* const tile_memory;
596
+
597
+ template <unsigned int Size>
598
+ friend class __static_size_multi_warp_tile_base;
599
+
600
+ template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
601
+ _CG_QUALIFIER thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) :
602
+ tile_memory(reinterpret_cast<details::multi_warp_scratch*>(&scratch)),
603
+ communication_size(TileCommunicationSize), max_block_size(MaxBlockSize) {
604
+ if (thread_rank() < details::scratch_sync_memory_size(MaxBlockSize) / sizeof(details::barrier_t)) {
605
+ details::barrier_t* barriers = reinterpret_cast<details::barrier_t*>(&tile_memory->memory);
606
+ barriers[thread_rank()] = 0;
607
+ }
608
+ sync();
609
+ }
610
+ #endif
611
+
612
+ // Disable constructor
613
+ _CG_QUALIFIER thread_block()
614
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
615
+ : tile_memory(NULL), communication_size(0), max_block_size(0)
616
+ #endif
617
+ { }
618
+
619
+ // Internal Use
620
+ _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
621
+ const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
622
+
623
+ // Invalid, immediately fail
624
+ if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
625
+ details::abort();
626
+ return (thread_block());
627
+ }
628
+
629
+ unsigned int mask;
630
+ unsigned int base_offset = thread_rank() & (~(tilesz - 1));
631
+ unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
632
+
633
+ mask = (unsigned int)(-1) >> (32 - masklength);
634
+ mask <<= (details::laneid() & ~(tilesz - 1));
635
+ thread_group tile = thread_group(details::coalesced_group_id);
636
+ tile._data.coalesced.mask = mask;
637
+ tile._data.coalesced.size = __popc(mask);
638
+ tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
639
+ tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
640
+ tile._data.coalesced.is_tiled = true;
641
+ return (tile);
642
+ }
643
+
644
+ public:
645
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
646
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
647
+
648
+ _CG_STATIC_QUALIFIER void sync() {
649
+ details::cta::sync();
650
+ }
651
+
652
+ _CG_STATIC_QUALIFIER unsigned int size() {
653
+ return details::cta::size();
654
+ }
655
+
656
+ _CG_STATIC_QUALIFIER unsigned int thread_rank() {
657
+ return details::cta::thread_rank();
658
+ }
659
+
660
+ // Additional functionality exposed by the group
661
+ _CG_STATIC_QUALIFIER dim3 group_index() {
662
+ return details::cta::group_index();
663
+ }
664
+
665
+ _CG_STATIC_QUALIFIER dim3 thread_index() {
666
+ return details::cta::thread_index();
667
+ }
668
+
669
+ _CG_STATIC_QUALIFIER dim3 group_dim() {
670
+ return details::cta::block_dim();
671
+ }
672
+
673
+ _CG_STATIC_QUALIFIER dim3 dim_threads() {
674
+ return details::cta::dim_threads();
675
+ }
676
+
677
+ _CG_STATIC_QUALIFIER unsigned int num_threads() {
678
+ return details::cta::num_threads();
679
+ }
680
+
681
+ };
682
+
683
+ /**
684
+ * thread_block this_thread_block()
685
+ *
686
+ * Constructs a thread_block group
687
+ */
688
// Factory for the calling thread's thread_block handle.
_CG_QUALIFIER thread_block this_thread_block()
{
    // The group carries no per-call state; default-construct and return it.
    thread_block block;
    return block;
}
692
+
693
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
namespace experimental {
    // Overload taking caller-provided block_tile_memory scratch storage;
    // the scratch reference is forwarded to the thread_block constructor.
    template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
    _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) {
        return (thread_block(scratch));
    }
}
#endif
701
+
702
+ /**
703
+ * class coalesced_group
704
+ *
705
+ * A group representing the current set of converged threads in a warp.
706
+ * The size of the group is not guaranteed and it may return a group of
707
+ * only one thread (itself).
708
+ *
709
+ * This group exposes warp-synchronous builtins.
710
+ * Constructed via coalesced_threads();
711
+ */
712
class coalesced_group : public thread_group_base<details::coalesced_group_id>
{
private:
    friend _CG_QUALIFIER coalesced_group coalesced_threads();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
    friend class details::_coalesced_group_data_access;

    // Compress a 32-bit lane mask into group-rank space: bit i of the result
    // corresponds to the i-th member of this group (i.e. the i-th set bit of
    // _data.coalesced.mask), set iff that member's lane bit is set in laneMask.
    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
        unsigned int member_pack = 0;
        unsigned int member_rank = 0;
        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
            if (lane_bit) {
                if (laneMask & lane_bit)
                    member_pack |= 1 << member_rank;
                member_rank++;
            }
        }
        return (member_pack);
    }

    // Internal Use
    // Partition this group into tiles of tilesz threads and return the tile
    // containing the calling thread. tilesz must be a power of two in [1, 32].
    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            details::abort();
            return (coalesced_group(0));
        }
        // Group already fits in one tile: the tile is the whole group.
        if (size() <= tilesz) {
            return (*this);
        }

        if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
            // Already-tiled parent: members occupy contiguous lanes, so the
            // tile mask is a contiguous run aligned to the tile boundary.
            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
            unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);

            mask <<= (details::laneid() & ~(tilesz - 1));
            coalesced_group coalesced_tile = coalesced_group(mask);
            coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
            coalesced_tile._data.coalesced.is_tiled = true;
            return (coalesced_tile);
        }
        else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
            // Non-tiled parent: members may occupy arbitrary lanes. Walk the
            // parent's mask and collect the tilesz member bits that belong to
            // the calling thread's tile (skipping the members of earlier tiles).
            unsigned int mask = 0;
            unsigned int member_rank = 0;
            int seen_lanes = (thread_rank() / tilesz) * tilesz;
            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
                if (lane_bit) {
                    if (seen_lanes <= 0 && member_rank < tilesz) {
                        mask |= lane_bit;
                        member_rank++;
                    }
                    seen_lanes--;
                }
            }
            coalesced_group coalesced_tile = coalesced_group(mask);
            // Override parent with the size of this group
            coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
            return coalesced_tile;
        }
        else {
            // None in _CG_VERSION 1000
            details::abort();
        }

        return (coalesced_group(0));
    }

protected:
    // Construct a group over the lanes set in mask; size is derived from the
    // mask population count, meta-group fields default to a single partition.
    _CG_QUALIFIER coalesced_group(unsigned int mask) {
        _data.coalesced.mask = mask;
        _data.coalesced.size = __popc(mask);
        _data.coalesced.metaGroupRank = 0;
        _data.coalesced.metaGroupSize = 1;
        _data.coalesced.is_tiled = false;
    }

    // Lane mask of the member threads of this group.
    _CG_QUALIFIER unsigned int get_mask() const {
        return (_data.coalesced.mask);
    }

public:
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)

    // Number of member threads (popcount of the group mask).
    _CG_QUALIFIER unsigned int num_threads() const {
        return _data.coalesced.size;
    }

    _CG_QUALIFIER unsigned int size() const {
        return num_threads();
    }

    // Rank of the calling thread among the group members: the number of
    // member lanes strictly below the calling lane.
    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
    }

    // Rank of this group in the upper level of the hierarchy
    _CG_QUALIFIER unsigned int meta_group_rank() const {
        return _data.coalesced.metaGroupRank;
    }

    // Total num partitions created out of all CTAs when the group was created
    _CG_QUALIFIER unsigned int meta_group_size() const {
        return _data.coalesced.metaGroupSize;
    }

    // Warp-level barrier over exactly the member lanes.
    _CG_QUALIFIER void sync() const {
        __syncwarp(_data.coalesced.mask);
    }

#ifdef _CG_CPP11_FEATURES
    // Broadcast elem from the member with rank srcRank. The group rank is
    // translated to a physical lane id before dispatching the shuffle.
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
        unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
            (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }

    // Read elem from the member delta ranks above the caller; out-of-range
    // reads return the caller's own value (lane falls back to laneid()).
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
        if (size() == 32) {
            return details::tile::shuffle_dispatch<TyElem>::shfl_down(
                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
        }

        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);

        if (lane >= 32)
            lane = details::laneid();

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }

    // Read elem from the member delta ranks below the caller; out-of-range
    // reads return the caller's own value.
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
        if (size() == 32) {
            return details::tile::shuffle_dispatch<TyElem>::shfl_up(
                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
        }

        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
        if (lane >= 32)
            lane = details::laneid();

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }
#else
    // Pre-C++11 variants: restricted to arithmetic types, using the raw
    // __shfl_sync intrinsics with the same rank-to-lane translation.
    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
        }
        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
        if (lane >= 32) lane = details::laneid();
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
        }
        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
        if (lane >= 32) lane = details::laneid();
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }
#endif

    // True if the predicate is non-zero on any member.
    _CG_QUALIFIER int any(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
    }
    // True if the predicate is non-zero on every member.
    _CG_QUALIFIER int all(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
    }
    // Ballot expressed in group-rank bit positions (full-warp groups use the
    // raw warp ballot; otherwise the lane ballot is repacked via _packLanes).
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        if (size() == 32) {
            return (__ballot_sync(0xFFFFFFFF, predicate));
        }
        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
        return (_packLanes(lane_ballot));
    }

#ifdef _CG_HAS_MATCH_COLLECTIVE

    // Mask (in group-rank bit positions) of members holding the same val.
    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__match_any_sync(0xFFFFFFFF, val));
        }
        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
        return (_packLanes(lane_match));
    }

    // Match-all over the members; pred is set by the underlying intrinsic.
    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__match_all_sync(0xFFFFFFFF, val, &pred));
        }
        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
        return (_packLanes(lane_match));
    }

#endif /* !_CG_HAS_MATCH_COLLECTIVE */

};
941
+
942
// Build a coalesced_group over the currently converged threads of the warp,
// as reported by __activemask() at the call site.
_CG_QUALIFIER coalesced_group coalesced_threads()
{
    return (coalesced_group(__activemask()));
}
946
+
947
namespace details {
    // Compile-time check that a tile size is natively supported: only the
    // listed specializations (32/16/8/4/2/1) provide the nested OK typedef,
    // so any other Size fails to instantiate.
    template <unsigned int Size> struct verify_thread_block_tile_size;
    template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<1> { typedef void OK; };

#ifdef _CG_CPP11_FEATURES
    // C++11 trait forms of the same size constraints.
    template <unsigned int Size>
    using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;

    template <unsigned int Size>
    using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
    template <unsigned int Size>
    using _is_multi_warp =
        _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;

    template <unsigned int Size>
    using _is_valid_single_warp_tile =
        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
    template <unsigned int Size>
    using _is_valid_multi_warp_tile =
        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
#else
    // Without C++11, multi-warp tiles are never available.
    template <unsigned int Size>
    struct _is_multi_warp {
        static const bool value = false;
    };
#endif
}
979
+
980
// Common base for statically-sized tiles: stores the compile-time size and
// derives the thread's rank from its CTA rank (valid because Size is a
// power of two, so the rank is a bit-mask of the CTA rank).
template <unsigned int Size>
class __static_size_tile_base
{
protected:
    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;

public:
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)

    // Rank of thread within tile
    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
        return (details::cta::thread_rank() & (numThreads - 1));
    }

    // Number of threads within tile
    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
        return numThreads;
    }

    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
        return num_threads();
    }
};
1003
+
1004
// Single-warp statically-sized tile: all collectives operate on the lane
// mask computed by build_mask(), which covers exactly this tile's lanes.
template <unsigned int Size>
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
{
    friend class details::_coalesced_group_data_access;
    typedef details::tile::tile_helpers<Size> th;

#ifdef _CG_CPP11_FEATURES
    static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
#else
    typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
#endif
    using __static_size_tile_base<Size>::numThreads;
    _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;

protected:
    // Lane mask of the calling thread's tile: the full warp for Size == 32,
    // otherwise the tile's mask shifted to the partition the caller is in.
    _CG_STATIC_QUALIFIER unsigned int build_mask() {
        unsigned int mask = fullMask;
        if (numThreads != 32) {
            // [0,31] representing the current active thread in the warp
            unsigned int laneId = details::laneid();
            // shift mask according to the partition it belongs to
            mask = th::tileMask << (laneId & ~(th::laneMask));
        }
        return (mask);
    }

public:
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;

    // Warp-level barrier over the tile's lanes.
    _CG_STATIC_QUALIFIER void sync() {
        __syncwarp(build_mask());
    }

#ifdef _CG_CPP11_FEATURES
    // PTX supported collectives
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl_down(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl_up(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
    }
#else
    // Pre-C++11 variants restricted to arithmetic types, using the raw
    // *_sync shuffle intrinsics directly.
    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
#endif //_CG_CPP11_FEATURES

    // True if the predicate is non-zero on any tile member.
    _CG_QUALIFIER int any(int predicate) const {
        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
        return (lane_ballot != 0);
    }
    // True if the predicate is non-zero on every tile member.
    _CG_QUALIFIER int all(int predicate) const {
        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
        return (lane_ballot == build_mask());
    }
    // Ballot shifted so bit 0 corresponds to the tile's first lane.
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
        return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
    }

#ifdef _CG_HAS_MATCH_COLLECTIVE
    // Match results are shifted into tile-relative bit positions like ballot().
    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        unsigned int lane_match = __match_any_sync(build_mask(), val);
        return (lane_match >> (details::laneid() & (~(th::laneMask))));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (details::laneid() & (~(th::laneMask))));
    }
#endif

};
1118
+
1119
// Meta-group information derived statically from the parent group type:
// rank and count of Size-sized tiles within ParentT.
template <unsigned int Size, typename ParentT>
class __static_parent_thread_block_tile_base
{
public:
    // Rank of this group in the upper level of the hierarchy
    _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
        return ParentT::thread_rank() / Size;
    }

    // Total num partitions created out of all CTAs when the group was created
    // (ceiling division, so a partial trailing tile is counted).
    _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
        return (ParentT::size() + Size - 1) / Size;
    }
};
1133
+
1134
+ /**
1135
+ * class thread_block_tile<unsigned int Size, ParentT = void>
1136
+ *
1137
+ * Statically-sized group type, representing one tile of a thread block.
1138
+ * The only specializations currently supported are those with native
1139
+ * hardware support (1/2/4/8/16/32)
1140
+ *
1141
+ * This group exposes warp-synchronous builtins.
1142
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
1143
+ */
1144
+
1145
// Single-warp tile whose parent type is known statically: meta-group info
// comes from the static parent base, the mask from the static size base.
template <unsigned int Size, typename ParentT = void>
class __single_warp_thread_block_tile :
    public __static_size_thread_block_tile_base<Size>,
    public __static_parent_thread_block_tile_base<Size, ParentT>
{
    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
    friend class details::_coalesced_group_data_access;

protected:
    // No per-instance state; the (rank, size) constructor arguments are
    // ignored because meta-group info is computed statically from ParentT.
    _CG_QUALIFIER __single_warp_thread_block_tile() { };
    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };

    _CG_STATIC_QUALIFIER unsigned int get_mask() {
        return __static_size_thread_block_tile_base<Size>::build_mask();
    }
};
1161
+
1162
// Type-erased-parent specialization: meta-group rank/size are not known
// statically, so they are stored in the coalesced group data at construction.
template <unsigned int Size>
class __single_warp_thread_block_tile<Size, void> :
    public __static_size_thread_block_tile_base<Size>,
    public thread_group_base<details::coalesced_group_id>
{
    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;

    template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
    friend class details::_coalesced_group_data_access;

    typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;

protected:
    // Record the tile's mask and the caller-supplied meta-group position.
    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) {
        _data.coalesced.mask = staticSizeBaseT::build_mask();
        _data.coalesced.size = numThreads;
        _data.coalesced.metaGroupRank = meta_group_rank;
        _data.coalesced.metaGroupSize = meta_group_size;
        _data.coalesced.is_tiled = true;
    }

    _CG_QUALIFIER unsigned int get_mask() const {
        return (_data.coalesced.mask);
    }

public:
    using staticSizeBaseT::sync;
    using staticSizeBaseT::size;
    using staticSizeBaseT::num_threads;
    using staticSizeBaseT::thread_rank;

    // Rank of this tile among the partitions created from the parent.
    _CG_QUALIFIER unsigned int meta_group_rank() const {
        return _data.coalesced.metaGroupRank;
    }

    // Number of partitions created from the parent.
    _CG_QUALIFIER unsigned int meta_group_size() const {
        return _data.coalesced.metaGroupSize;
    }
};
1201
+
1202
+ /**
1203
+ * Outer level API calls
1204
+ * void sync(GroupT) - see <group_type>.sync()
1205
+ * void thread_rank(GroupT) - see <group_type>.thread_rank()
1206
+ * void group_size(GroupT) - see <group_type>.size()
1207
+ */
1208
// Free-function form of <group>.sync().
template <class GroupT>
_CG_QUALIFIER void sync(GroupT const &g)
{
    g.sync();
}
1213
+
1214
+ // TODO: Use a static dispatch to determine appropriate return type
1215
+ // C++03 is stuck with unsigned long long for now
1216
#ifdef _CG_CPP11_FEATURES
// Free-function form of <group>.thread_rank(); return type follows the group.
template <class GroupT>
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
    return g.thread_rank();
}


// Free-function form of <group>.num_threads(); return type follows the group.
template <class GroupT>
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
    return g.num_threads();
}
#else
// Pre-C++11 fallbacks: no decltype, so both widen to unsigned long long.
template <class GroupT>
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
    return static_cast<unsigned long long>(g.thread_rank());
}


template <class GroupT>
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
    return static_cast<unsigned long long>(g.num_threads());
}
#endif
1239
+
1240
+
1241
+ /**
1242
+ * tiled_partition
1243
+ *
1244
+ * The tiled_partition(parent, tilesz) method is a collective operation that
1245
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1246
+ *
1247
+ * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
1248
+ * be created where threads having identical k = (thread_rank(parent)/tilesz)
1249
+ * will be members of the same subgroup.
1250
+ *
1251
+ * The implementation may cause the calling thread to wait until all the members
1252
+ * of the parent group have invoked the operation before resuming execution.
1253
+ *
1254
+ * Functionality is limited to power-of-two sized subgroup instances of at most
1255
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
1256
+ * tiled_partition() in _CG_VERSION 1000.
1257
+ */
1258
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
    // Dispatch on the dynamic group id: coalesced groups tile through the
    // coalesced path; any other group is treated as a thread_block.
    if (parent.get_type() == details::coalesced_group_id) {
        const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
        return _cg->_get_tiled_threads(tilesz);
    }
    else {
        const thread_block *_tb = static_cast<const thread_block*>(&parent);
        return _tb->_get_tiled_threads(tilesz);
    }
}
1269
+
1270
+ // Thread block type overload: returns a basic thread_group for now (may be specialized later)
1271
// thread_block overload: delegates to the block's tiling helper.
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}
1275
+
1276
+ // Coalesced group type overload: retains its ability to stay coalesced
1277
// coalesced_group overload: the result stays a coalesced_group.
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}
1281
+
1282
namespace details {
    // Internal tile type with a protected-constructor bypass, built by
    // tiled_partition_internal() without going through the public API.
    template <unsigned int Size, typename ParentT>
    class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};

    template <unsigned int Size, typename ParentT>
    _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
        return internal_thread_block_tile<Size, ParentT>();
    }

    // Friend-access shims: these forward to private members of the multi-warp
    // group types so that non-member code can invoke them.
    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
    _CG_QUALIFIER TyVal multi_warp_collectives_helper(
            const GroupT& group,
            WarpLambda warp_lambda,
            InterWarpLambda inter_warp_lambda) {
        return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
    }

    template <typename T, typename GroupT>
    _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
        return group.template get_scratch_location<T>(warp_id);
    }

    template <typename GroupT>
    _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
        return group.get_sync_location();
    }

}
1310
+ /**
1311
+ * tiled_partition<tilesz>
1312
+ *
1313
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
1314
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1315
+ *
1316
+ * A total of (size(parent)/tilesz) subgroups will be created,
1317
+ * therefore the parent group size must be evenly divisible by the tilesz.
1318
+ * The allowed parent groups are thread_block or thread_block_tile<size>.
1319
+ *
1320
+ * The implementation may cause the calling thread to wait until all the members
1321
+ * of the parent group have invoked the operation before resuming execution.
1322
+ *
1323
+ * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
1324
+ * The size(parent) must be greater than the template Size parameter
1325
+ * otherwise the results are undefined.
1326
+ */
1327
+
1328
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1329
+ template <unsigned int Size>
1330
+ class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
1331
+ {
1332
+ static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
1333
+
1334
+ template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
1335
+ friend TyVal details::multi_warp_collectives_helper(
1336
+ const GroupT& group,
1337
+ WarpLambda warp_lambda,
1338
+ InterWarpLambda inter_warp_lambda);
1339
+ template <typename T, typename GroupT>
1340
+ friend T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
1341
+ template <typename GroupT>
1342
+ friend details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
1343
+ template <unsigned int OtherSize>
1344
+ friend class __static_size_multi_warp_tile_base;
1345
+ using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
1346
+ using ThisType = __static_size_multi_warp_tile_base<Size>;
1347
+ _CG_STATIC_CONST_DECL int numWarps = Size / 32;
1348
+ const unsigned short communication_size;
1349
+ const unsigned short max_block_size;
1350
+
1351
+ protected:
1352
+ details::multi_warp_scratch* const tile_memory;
1353
+
1354
+ template <typename GroupT>
1355
+ _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) :
1356
+ tile_memory(g.tile_memory), communication_size(g.communication_size), max_block_size(g.max_block_size) {}
1357
+
1358
+
1359
+ private:
1360
+ _CG_QUALIFIER details::barrier_t* get_sync_location() const {
1361
+ // Different group sizes use different barriers, all groups of a given size share one barrier.
1362
+ unsigned int sync_id = details::log2(Size / 64);
1363
+ return &(reinterpret_cast<details::barrier_t*>(tile_memory->memory)[sync_id]);
1364
+ }
1365
+
1366
+ template <typename T>
1367
+ _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
1368
+ unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
1369
+ unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
1370
+ return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
1371
+ }
1372
+
1373
+ template <typename T>
1374
+ _CG_QUALIFIER T* get_scratch_location() const {
1375
+ unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
1376
+ unsigned int scratch_id = details::cta::thread_rank() / 32;
1377
+ return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
1378
+ }
1379
+
1380
+ template <typename TyVal>
1381
+ _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
1382
+ unsigned int src_warp = src / 32;
1383
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1384
+ details::barrier_t* sync_location = get_sync_location();
1385
+
1386
+ // Get warp slot of the source threads warp.
1387
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
1388
+
1389
+ if (warp.meta_group_rank() == src_warp) {
1390
+ // Put shuffled value into my warp slot and let my warp arrive at the barrier.
1391
+ if (thread_rank() == src) {
1392
+ *warp_scratch_location = val;
1393
+ }
1394
+ details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
1395
+ TyVal result = *warp_scratch_location;
1396
+ details::sync_warps_wait(sync_location, details::cta::thread_rank());
1397
+ return result;
1398
+ }
1399
+ else {
1400
+ // Wait for the source warp to arrive on the barrier.
1401
+ details::sync_warps_wait_for_warps<details::wait_for_specific_warp>(
1402
+ (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp),
1403
+ sync_location, details::cta::thread_rank(),
1404
+ numWarps);
1405
+ TyVal result = *warp_scratch_location;
1406
+ details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
1407
+ return result;
1408
+ }
1409
+ }
1410
+
1411
+ template <typename TyVal>
1412
+ _CG_QUALIFIER TyVal shfl_iterative_impl(TyVal val, unsigned int src) const {
1413
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1414
+
1415
+ details::copy_channel<numWarps> broadcast_channel{
1416
+ get_scratch_location<char>(0),
1417
+ get_sync_location(),
1418
+ (size_t) communication_size * numWarps};
1419
+
1420
+ if (warp.meta_group_rank() == src / 32) {
1421
+ val = warp.shfl(val, src);
1422
+ broadcast_channel.template send_value<
1423
+ TyVal, 32, decltype(broadcast_channel)::send_many_to_many>(
1424
+ val, warp.thread_rank(), details::cta::thread_rank() / 32);
1425
+ }
1426
+ else {
1427
+ broadcast_channel.template receive_value<TyVal>(val, warp.thread_rank() == 0);
1428
+ }
1429
+ sync();
1430
+ return val;
1431
+ }
1432
+
1433
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1434
+ _CG_QUALIFIER TyVal collectives_scheme_impl(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
1435
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1436
+ details::barrier_t* sync_location = get_sync_location();
1437
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>();
1438
+
1439
+ warp_lambda(warp, warp_scratch_location);
1440
+
1441
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
1442
+ auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
1443
+ if (subwarp.meta_group_rank() == 0) {
1444
+ TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
1445
+ inter_warp_lambda(subwarp, thread_scratch_location);
1446
+ }
1447
+ warp.sync();
1448
+ details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
1449
+ }
1450
+ TyVal result = *warp_scratch_location;
1451
+ warp.sync(); // Added warpsync, if all collectives do sync before writing to reduce_location (they does right now),
1452
+ // we could delete it.
1453
+ return result;
1454
+ }
1455
+
1456
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1457
+ _CG_QUALIFIER TyVal collectives_scheme_iterative_impl(
1458
+ const WarpLambda& warp_lambda,
1459
+ const InterWarpLambda& inter_warp_lambda) const {
1460
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1461
+ details::barrier_t* sync_location = get_sync_location();
1462
+ details::copy_channel<numWarps> final_result_channel{
1463
+ get_scratch_location<char>(0),
1464
+ sync_location,
1465
+ (size_t) communication_size * numWarps};
1466
+
1467
+ TyVal warp_result;
1468
+ warp_lambda(warp, &warp_result);
1469
+
1470
+ if (warp.meta_group_rank() == 0) {
1471
+ auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
1472
+ details::copy_channel<numWarps> partial_results_channel{
1473
+ get_scratch_location<char>(subwarp.thread_rank()),
1474
+ sync_location,
1475
+ (size_t) communication_size};
1476
+
1477
+ // Thread 0 in subwarp set as inactive to not overwrite warp 0 warp_result.
1478
+ partial_results_channel.template receive_value<TyVal>(
1479
+ warp_result,
1480
+ warp.thread_rank() == 0,
1481
+ subwarp.thread_rank() != 0 && subwarp.meta_group_rank() == 0);
1482
+ if (subwarp.meta_group_rank() == 0) {
1483
+ inter_warp_lambda(subwarp, &warp_result);
1484
+ }
1485
+ warp_result = warp.shfl(warp_result, 0);
1486
+ final_result_channel.template send_value<TyVal, 32, decltype(final_result_channel)::send_many_to_many>(
1487
+ warp_result,
1488
+ warp.thread_rank(),
1489
+ details::cta::thread_rank() / 32);
1490
+ }
1491
+ else {
1492
+ details::copy_channel<numWarps> partial_results_channel{get_scratch_location<char>(), sync_location, (size_t) communication_size};
1493
+ partial_results_channel.template send_value<TyVal, 32, decltype(partial_results_channel)::send_many_to_one>(
1494
+ warp_result,
1495
+ warp.thread_rank(),
1496
+ (details::cta::thread_rank() - thread_rank()) / 32);
1497
+ final_result_channel.template receive_value<TyVal>(warp_result, warp.thread_rank() == 0);
1498
+ }
1499
+ sync();
1500
+ return warp_result;
1501
+ }
1502
+
1503
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1504
+ _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
1505
+ if (sizeof(TyVal) > communication_size) {
1506
+ return collectives_scheme_iterative_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
1507
+ }
1508
+ else {
1509
+ return collectives_scheme_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
1510
+ }
1511
+ }
1512
+
1513
+ public:
1514
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
1515
+
1516
+ using __static_size_tile_base<Size>::thread_rank;
1517
+
1518
+ template <typename TyVal>
1519
+ _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
1520
+ if (sizeof(TyVal) > communication_size) {
1521
+ return shfl_iterative_impl(val, src);
1522
+ }
1523
+ else {
1524
+ return shfl_impl(val, src);
1525
+ }
1526
+ }
1527
+
1528
+ _CG_QUALIFIER void sync() const {
1529
+ details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
1530
+ }
1531
+
1532
+ _CG_QUALIFIER int any(int predicate) const {
1533
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1534
+ *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
1535
+ };
1536
+ auto inter_warp_lambda =
1537
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1538
+ *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
1539
+ };
1540
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1541
+ }
1542
+
1543
+ _CG_QUALIFIER int all(int predicate) const {
1544
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1545
+ *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
1546
+ };
1547
+ auto inter_warp_lambda =
1548
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1549
+ *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
1550
+ };
1551
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1552
+ }
1553
+ };
1554
+
1555
+
1556
+ template <unsigned int Size, typename ParentT = void>
1557
+ class __multi_warp_thread_block_tile :
1558
+ public __static_size_multi_warp_tile_base<Size>,
1559
+ public __static_parent_thread_block_tile_base<Size, ParentT>
1560
+ {
1561
+ typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
1562
+ typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
1563
+ protected:
1564
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
1565
+ __static_size_multi_warp_tile_base<Size>(g) {}
1566
+ };
1567
+
1568
+ template <unsigned int Size>
1569
+ class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
1570
+ {
1571
+ const unsigned int metaGroupRank;
1572
+ const unsigned int metaGroupSize;
1573
+
1574
+ protected:
1575
+ template <unsigned int OtherSize, typename ParentT>
1576
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
1577
+ __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
1578
+
1579
+ public:
1580
+ _CG_QUALIFIER unsigned int meta_group_rank() const {
1581
+ return metaGroupRank;
1582
+ }
1583
+
1584
+ _CG_QUALIFIER unsigned int meta_group_size() const {
1585
+ return metaGroupSize;
1586
+ }
1587
+ };
1588
+ #endif
1589
+
1590
+ template <unsigned int Size, typename ParentT = void>
1591
+ class thread_block_tile;
1592
+
1593
+ namespace details {
1594
+ template <unsigned int Size, typename ParentT, bool IsMultiWarp>
1595
+ class thread_block_tile_impl;
1596
+
1597
+ template <unsigned int Size, typename ParentT>
1598
+ class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
1599
+ {
1600
+ protected:
1601
+ template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
1602
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
1603
+ __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
1604
+
1605
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
1606
+ __single_warp_thread_block_tile<Size, ParentT>() {}
1607
+ };
1608
+
1609
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1610
+ template <unsigned int Size, typename ParentT>
1611
+ class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
1612
+ {
1613
+ protected:
1614
+ template <typename GroupT>
1615
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
1616
+ __multi_warp_thread_block_tile<Size, ParentT>(g) {}
1617
+ };
1618
+ #else
1619
+ template <unsigned int Size, typename ParentT>
1620
+ class thread_block_tile_impl<Size, ParentT, true>
1621
+ {
1622
+ protected:
1623
+ template <typename GroupT>
1624
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
1625
+ };
1626
+ #endif
1627
+ }
1628
+
1629
+ template <unsigned int Size, typename ParentT>
1630
+ class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
1631
+ {
1632
+ friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
1633
+
1634
+ protected:
1635
+ _CG_QUALIFIER thread_block_tile(const ParentT& g) :
1636
+ details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
1637
+
1638
+ public:
1639
+ _CG_QUALIFIER operator thread_block_tile<Size, void>() const {
1640
+ return thread_block_tile<Size, void>(*this);
1641
+ }
1642
+ };
1643
+
1644
+ template <unsigned int Size>
1645
+ class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
1646
+ {
1647
+ template <unsigned int, typename ParentT>
1648
+ friend class thread_block_tile;
1649
+
1650
+ protected:
1651
+ template <unsigned int OtherSize, typename OtherParentT>
1652
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
1653
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1654
+
1655
+ public:
1656
+ template <typename ParentT>
1657
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
1658
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1659
+ };
1660
+
1661
+ namespace details {
1662
+ template <unsigned int Size, typename ParentT>
1663
+ struct tiled_partition_impl;
1664
+
1665
+ template <unsigned int Size>
1666
+ struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
1667
+ _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
1668
+ thread_block_tile<Size, thread_block>(g) {}
1669
+ };
1670
+
1671
+ // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
1672
+ template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
1673
+ struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
1674
+ public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
1675
+ #ifdef _CG_CPP11_FEATURES
1676
+ static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
1677
+ #endif
1678
+ _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
1679
+ thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
1680
+ };
1681
+
1682
+ }
1683
+
1684
+ namespace experimental {
1685
+ template <unsigned int Size, typename ParentT>
1686
+ _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
1687
+ {
1688
+ #if defined(_CG_CPP11_FEATURES) && !defined(_CG_ABI_EXPERIMENTAL)
1689
+ static_assert(details::_is_single_warp<Size>::value, "_CG_ABI_EXPERIMENTAL needs to be defined"
1690
+ " before cooperative_groups header is included to enable experimental features");
1691
+ #endif
1692
+ return details::tiled_partition_impl<Size, ParentT>(g);
1693
+ }
1694
+
1695
+ }
1696
+
1697
+ template <unsigned int Size, typename ParentT>
1698
+ _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
1699
+ {
1700
+ #ifdef _CG_CPP11_FEATURES
1701
+ static_assert(details::_is_single_warp<Size>::value, "Tiled partition with Size > 32 is supported only by"
1702
+ " cooperative_groups::experimental::tiled_partition available with experimental features enabled");
1703
+ #endif
1704
+ return details::tiled_partition_impl<Size, ParentT>(g);
1705
+ }
1706
+
1707
+ /**
1708
+ * thread_group this_thread()
1709
+ *
1710
+ * Constructs a generic thread_group containing only the calling thread
1711
+ */
1712
+ _CG_QUALIFIER thread_block_tile<1, void> this_thread()
1713
+ {
1714
+ // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
1715
+ // meta group rank and size set to 0 and 1 respectively.
1716
+ return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
1717
+ }
1718
+
1719
+ /**
1720
+ * <group_type>.sync()
1721
+ *
1722
+ * Executes a barrier across the group
1723
+ *
1724
+ * Implements both a compiler fence and an architectural fence to prevent,
1725
+ * memory reordering around the barrier.
1726
+ */
1727
+ _CG_QUALIFIER void thread_group::sync() const
1728
+ {
1729
+ switch (_data.group.type) {
1730
+ case details::coalesced_group_id:
1731
+ cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
1732
+ break;
1733
+ case details::thread_block_id:
1734
+ cooperative_groups::sync(*static_cast<const thread_block*>(this));
1735
+ break;
1736
+ case details::grid_group_id:
1737
+ cooperative_groups::sync(*static_cast<const grid_group*>(this));
1738
+ break;
1739
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1740
+ case details::multi_grid_group_id:
1741
+ cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
1742
+ break;
1743
+ #endif
1744
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1745
+ case details::cluster_group_id:
1746
+ cooperative_groups::sync(*static_cast<const cluster_group*>(this));
1747
+ break;
1748
+ #endif
1749
+ default:
1750
+ break;
1751
+ }
1752
+ }
1753
+
1754
+ /**
1755
+ * <group_type>.size()
1756
+ *
1757
+ * Returns the total number of threads in the group.
1758
+ */
1759
+ _CG_QUALIFIER unsigned long long thread_group::size() const
1760
+ {
1761
+ unsigned long long size = 0;
1762
+ switch (_data.group.type) {
1763
+ case details::coalesced_group_id:
1764
+ size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
1765
+ break;
1766
+ case details::thread_block_id:
1767
+ size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
1768
+ break;
1769
+ case details::grid_group_id:
1770
+ size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
1771
+ break;
1772
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1773
+ case details::multi_grid_group_id:
1774
+ size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
1775
+ break;
1776
+ #endif
1777
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1778
+ case details::cluster_group_id:
1779
+ size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
1780
+ break;
1781
+ #endif
1782
+ default:
1783
+ break;
1784
+ }
1785
+ return size;
1786
+ }
1787
+
1788
+ /**
1789
+ * <group_type>.thread_rank()
1790
+ *
1791
+ * Returns the linearized rank of the calling thread along the interval [0, size()).
1792
+ */
1793
+ _CG_QUALIFIER unsigned long long thread_group::thread_rank() const
1794
+ {
1795
+ unsigned long long rank = 0;
1796
+ switch (_data.group.type) {
1797
+ case details::coalesced_group_id:
1798
+ rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
1799
+ break;
1800
+ case details::thread_block_id:
1801
+ rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
1802
+ break;
1803
+ case details::grid_group_id:
1804
+ rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
1805
+ break;
1806
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1807
+ case details::multi_grid_group_id:
1808
+ rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
1809
+ break;
1810
+ #endif
1811
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1812
+ case details::cluster_group_id:
1813
+ rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
1814
+ break;
1815
+ #endif
1816
+ default:
1817
+ break;
1818
+ }
1819
+ return rank;
1820
+ }
1821
+
1822
+ _CG_END_NAMESPACE
1823
+
1824
+ #include <cooperative_groups/details/partitioning.h>
1825
+
1826
+ # endif /* ! (__cplusplus, __CUDACC__) */
1827
+
1828
+ #endif /* !_COOPERATIVE_GROUPS_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_COALESCED_REDUCE_H_
50
+ #define _CG_COALESCED_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "cooperative_groups.h"
55
+ #include "partitioning.h"
56
+ #include "coalesced_scan.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename TyVal, typename TyOp>
63
+ _CG_QUALIFIER auto coalesced_reduce_to_one(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
64
+ if (group.size() == 32) {
65
+ auto out = val;
66
+ for (int offset = group.size() >> 1; offset > 0; offset >>= 1) {
67
+ out = op(out, group.shfl_up(out, offset));
68
+ }
69
+ return out;
70
+ }
71
+ else {
72
+ auto scan_result =
73
+ inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
74
+ return scan_result;
75
+ }
76
+ }
77
+
78
+ template <typename TyVal, typename TyOp>
79
+ _CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
80
+ auto out = coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
81
+ if (group.size() == 32) {
82
+ return group.shfl(out, 31);
83
+ }
84
+ else {
85
+ unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
86
+ unsigned int last_thread_id = 31 - __clz(group_mask);
87
+ return details::tile::shuffle_dispatch<TyVal>::shfl(
88
+ _CG_STL_NAMESPACE::forward<TyVal>(out), group_mask, last_thread_id, 32);
89
+ }
90
+ }
91
+
92
+ template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
93
+ _CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
94
+ TyVal&& val,
95
+ TyOp&& op) -> decltype(op(val, val)) {
96
+ auto out = val;
97
+ for (int mask = TySize >> 1; mask > 0; mask >>= 1) {
98
+ out = op(out, group.shfl_xor(out, mask));
99
+ }
100
+
101
+ return out;
102
+ }
103
+
104
+ } // details
105
+
106
+ _CG_END_NAMESPACE
107
+
108
+ #endif // _CG_COALESCED_REDUCE_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_FUNCTIONAL_H
50
+ #define _CG_FUNCTIONAL_H
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ #ifdef _CG_USE_CUDA_STL
57
+ # include <cuda/std/functional>
58
+ #endif
59
+
60
+ _CG_BEGIN_NAMESPACE
61
+
62
+ namespace details {
63
+ #ifdef _CG_USE_CUDA_STL
64
+ using cuda::std::plus;
65
+ using cuda::std::bit_and;
66
+ using cuda::std::bit_xor;
67
+ using cuda::std::bit_or;
68
+ #else
69
+ template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
70
+ template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
71
+ template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
72
+ template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
73
+ #endif // _CG_USE_PLATFORM_STL
74
+ } // details
75
+
76
+ template <typename Ty>
77
+ struct plus : public details::plus<Ty> {};
78
+
79
+ template <typename Ty>
80
+ struct less {
81
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
82
+ return (arg2 < arg1) ? arg2 : arg1;
83
+ }
84
+ };
85
+
86
+ template <typename Ty>
87
+ struct greater {
88
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
89
+ return (arg1 < arg2) ? arg2 : arg1;
90
+ }
91
+ };
92
+
93
+ template <typename Ty>
94
+ struct bit_and : public details::bit_and<Ty> {};
95
+
96
+ template <typename Ty>
97
+ struct bit_xor : public details::bit_xor<Ty> {};
98
+
99
+ template <typename Ty>
100
+ struct bit_or : public details::bit_or<Ty> {};
101
+
102
+ #if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
103
+ namespace details {
104
+ template <class Ty>
105
+ using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
106
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
107
+
108
+ template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {};
109
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {};
110
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {};
111
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
112
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
113
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {};
114
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
115
+
116
+ template<typename TyAtomic, typename TyVal, typename TyOp>
117
+ _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
118
+ remove_qual<TyVal> old = atomic;
119
+ while(!atomic.compare_exchange_weak(old, op(old, val)));
120
+ return old;
121
+ }
122
+
123
+ template<typename TyOp>
124
+ struct op_picker;
125
+
126
+ template<typename TyVal>
127
+ struct op_picker<cooperative_groups::plus<TyVal>> {
128
+ template<typename TyAtomic>
129
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
130
+ return atomic.fetch_add(val);
131
+ }
132
+ };
133
+
134
+ template<typename TyVal>
135
+ struct op_picker<cooperative_groups::less<TyVal>> {
136
+ template<typename TyAtomic>
137
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
138
+ return atomic.fetch_min(val);
139
+ }
140
+ };
141
+
142
+ template<typename TyVal>
143
+ struct op_picker<cooperative_groups::greater<TyVal>> {
144
+ template<typename TyAtomic>
145
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
146
+ return atomic.fetch_max(val);
147
+ }
148
+ };
149
+
150
+ template<typename TyVal>
151
+ struct op_picker<cooperative_groups::bit_and<TyVal>> {
152
+ template<typename TyAtomic>
153
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
154
+ return atomic.fetch_and(val);
155
+ }
156
+ };
157
+
158
+ template<typename TyVal>
159
+ struct op_picker<cooperative_groups::bit_xor<TyVal>> {
160
+ template<typename TyAtomic>
161
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
162
+ return atomic.fetch_xor(val);
163
+ }
164
+ };
165
+
166
+ template<typename TyVal>
167
+ struct op_picker<cooperative_groups::bit_or<TyVal>> {
168
+ template<typename TyAtomic>
169
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
170
+ return atomic.fetch_or(val);
171
+ }
172
+ };
173
+
174
+ template<bool atomic_supported>
175
+ struct atomic_update_dispatch {};
176
+
177
+ template<>
178
+ struct atomic_update_dispatch<false> {
179
+ template<typename TyAtomic, typename TyVal, typename TyOp>
180
+ _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
181
+ return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
182
+ }
183
+ };
184
+
185
+ template<>
186
+ struct atomic_update_dispatch<true> {
187
+ template<typename TyAtomic, typename TyVal, typename TyOp>
188
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
189
+ using dispatch = op_picker<details::remove_qual<TyOp>>;
190
+
191
+ return dispatch::atomic_update(atomic, val);
192
+ }
193
+ };
194
+
195
+ template<typename TyAtomic, typename TyVal, typename TyOp>
196
+ _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
197
+ using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
198
+
199
+ return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
200
+ }
201
+ }
202
+ #endif
203
+
204
+ _CG_END_NAMESPACE
205
+
206
+ #endif
207
+ #endif //_CG_FUNCTIONAL_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h ADDED
@@ -0,0 +1,707 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_HELPERS_H_
50
+ # define _COOPERATIVE_GROUPS_HELPERS_H_
51
+
52
+ #include "info.h"
53
+ #include "sync.h"
54
+
55
+ _CG_BEGIN_NAMESPACE
56
+
57
+ namespace details {
58
#ifdef _CG_CPP11_FEATURES
// Trait: true for built-in floating-point types, and additionally for
// __half/__half2 when FP16 collectives are available (cuda_fp16.h was
// included before this header -- see _CG_HAS_FP16_COLLECTIVE in info.h).
template <typename Ty> struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};
# ifdef _CG_HAS_FP16_COLLECTIVE
template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {};
template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
# endif
template <typename Ty>
using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;

// Non-STL utility templates
// Strips the reference, then cv-qualifiers (a remove_cvref equivalent).
template <typename Ty>
using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;

// True when both operand types are the same after stripping cv/ref qualifiers.
template <typename TyLhs, typename TyRhs>
using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
>;
#endif

// Flattens a 3D index into a linear rank: z-major, then y, then x
// (x is the fastest-varying dimension). TyTrunc selects the width used for
// the promoted arithmetic and the return value.
template <typename TyTrunc>
_CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
    return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
           ((TyTrunc)index.y * nIndex.x) +
            (TyTrunc)index.x;
}
82
+
83
// Block-scope (CTA) queries and synchronization used by thread_block groups.
namespace cta {

// Barrier across the entire thread block (PTX barrier 0).
_CG_STATIC_QUALIFIER void sync()
{
    __barrier_sync(0);
}

// Total number of threads in the calling block.
_CG_STATIC_QUALIFIER unsigned int num_threads()
{
    return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
}

// Linear rank of the calling thread within its block (x fastest-varying).
_CG_STATIC_QUALIFIER unsigned int thread_rank()
{
    return vec3_to_linear<unsigned int>(threadIdx, blockDim);
}

// 3D index of this block within the grid.
_CG_STATIC_QUALIFIER dim3 group_index()
{
    return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
}

// 3D index of the calling thread within its block.
_CG_STATIC_QUALIFIER dim3 thread_index()
{
    return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
}

// Block dimensions as a dim3.
_CG_STATIC_QUALIFIER dim3 dim_threads()
{
    return dim3(blockDim.x, blockDim.y, blockDim.z);
}

// Legacy aliases
_CG_STATIC_QUALIFIER unsigned int size()
{
    return num_threads();
}

_CG_STATIC_QUALIFIER dim3 block_dim()
{
    return dim_threads();
}

};
127
+
128
// Friend-style accessor giving internal code access to coalesced/tiled group
// private state (lane masks and meta-group bookkeeping).
class _coalesced_group_data_access {
public:
    // Retrieve mask of coalesced groups
    template <typename TyGroup>
    _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
        return group.get_mask();
    }

    // Retrieve mask of tiles
    // NOTE(review): the spelling 'build_maks' must match the member function
    // declared on the tile type elsewhere in this library -- confirm before
    // any rename.
    template <template <typename, typename> typename TyGroup, typename Sz, typename Parent>
    _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup<Sz, Parent> &group) {
        return group.build_maks();
    }

    // Construct a group directly from a lane mask (bypasses normal creation).
    template <typename TyGroup>
    _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
        return TyGroup(mask);
    }

    // Overwrite the group's meta-group rank/size bookkeeping fields.
    template <typename TyGroup>
    _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
        group._data.coalesced.metaGroupRank = mgRank;
        group._data.coalesced.metaGroupSize = mgSize;
    }
};
153
+
154
+ namespace tile {
155
// Per-tile-size constants used by tiled partitioning of a 32-lane warp:
//   tileCount  - number of tiles of this size per warp
//   tileMask   - lane mask covering the first tile
//   laneMask   - mask extracting a lane's offset within its tile
//   shiftCount - log2 of the tile size
template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
struct _tile_helpers{
    _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
    _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
    _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
    _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
};

// Specializations for every supported power-of-two tile size.
template <unsigned int> struct tile_helpers;
template <> struct tile_helpers<32> : public _tile_helpers<1,  0xFFFFFFFF, 0x1F, 5> {};
template <> struct tile_helpers<16> : public _tile_helpers<2,  0x0000FFFF, 0x0F, 4> {};
template <> struct tile_helpers<8>  : public _tile_helpers<4,  0x000000FF, 0x07, 3> {};
template <> struct tile_helpers<4>  : public _tile_helpers<8,  0x0000000F, 0x03, 2> {};
template <> struct tile_helpers<2>  : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
template <> struct tile_helpers<1>  : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
170
+
171
#ifdef _CG_CPP11_FEATURES
namespace shfl {
    /***********************************************************************************
     * Recursively Sliced Shuffle
     * Purpose:
     *   Slices an input type a number of times into integral types so that shuffles
     *   are well defined
     * Expectations:
     *   This object *should not* be used from a reinterpret_cast pointer unless
     *   some alignment guarantees can be met. Use a memcpy to guarantee that loads
     *   from the integral types stored within are aligned and correct.
     **********************************************************************************/
    // Primary template; the bool parameter selects the base case once the
    // remaining byte count fits inside a single int.
    template <unsigned int count, bool intSized = (count <= sizeof(int))>
    struct recursive_sliced_shuffle_helper;

    // Base case: value fits in one int slice; apply the shuffle functor once.
    template <unsigned int count>
    struct recursive_sliced_shuffle_helper<count, true> {
        int val;

        template <typename TyFn>
        _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
            val = shfl(val);
        }
    };

    // Recursive case: shuffle this int slice, then recurse on the remainder.
    template <unsigned int count>
    struct recursive_sliced_shuffle_helper<count, false> {
        int val;
        recursive_sliced_shuffle_helper<count - sizeof(int)> next;

        template <typename TyFn>
        _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
            val = shfl(val);
            next.invoke_shuffle(shfl);
        }
    };
}
208
+
209
// Fallback backend for types larger than _MemoryShuffleCutoff. Deliberately
// unimplemented: instantiating _shfl_internal trips the static_assert below,
// and the lambdas are placeholders that return 0.
struct _memory_shuffle {
    template <typename TyElem, typename TyShflFn>
    _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
        // sizeof(TyElem) > 0 is always true; the assert exists purely to emit
        // this diagnostic when the in-memory path is instantiated.
        static_assert(sizeof(TyElem) > 0, "in memory shuffle is not yet implemented");
        return TyElem{};
    }

    // TyRet strips cv/ref so the value is returned by value, never as a
    // dangling reference.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }
};
252
+
253
/***********************************************************************************
 * Intrinsic Device Function Shuffle
 * Purpose:
 *   Uses a shuffle helper that has characteristics best suited for moving
 *   elements between threads
 * Expectations:
 *   Object given will be forced into an l-value type so that it can be used
 *   with a helper structure that reinterprets the data into intrinsic compatible
 *   types
 * Notes:
 *   !! TyRet is required so that objects are returned by value and not as
 *   dangling references depending on the value category of the passed object
 **********************************************************************************/
struct _intrinsic_compat_shuffle {
    template <unsigned int count>
    using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;

    // Copies elem into an int-sliced helper, shuffles each slice with fn,
    // then copies the result back. memcpy (not reinterpret_cast) guarantees
    // aligned, well-defined access to the slices.
    template <typename TyElem, typename TyShflFn>
    _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
        static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
        shfl_helper<sizeof(TyElem)> helper;
        memcpy(&helper, &elem, sizeof(TyElem));
        helper.invoke_shuffle(fn);
        memcpy(&elem, &helper, sizeof(TyElem));
        return elem;
    }

    // Broadcast from srcRank to every lane covered by gMask.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_sync(gMask, val, srcRank, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    // Read from the lane `delta` positions above the caller.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_down_sync(gMask, val, delta, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    // Read from the lane `delta` positions below the caller.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_up_sync(gMask, val, delta, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    // Read from the lane whose index is the caller's XOR lMask.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_xor_sync(gMask, val, lMask, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }
};
316
+
317
// Backend for scalar types the hardware shuffles directly: each method is a
// thin cast-wrapper around the corresponding *_sync warp intrinsic.
struct _native_shuffle {
    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl(
        TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
        return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
    }

    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_down(
        TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
    }

    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_up(
        TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
    }

    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_xor(
        TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
        return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
    }
};
342
+
343
// Almost all arithmetic types are supported by native shuffle
// Vector types are the exception
template <typename TyElem>
using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
    bool,
    _CG_STL_NAMESPACE::is_integral<
        remove_qual<TyElem>>::value ||
    details::is_float_or_half<
        remove_qual<TyElem>>::value
>;

// Types larger than this many bytes fall back to the in-memory path.
constexpr unsigned long long _MemoryShuffleCutoff = 32;

// Selects the shuffle backend by element type:
//   native    -> integral / supported floating-point scalars
//   intrinsic -> other trivially-copyable types up to the cutoff
//   memory    -> larger types (currently unimplemented; see _memory_shuffle)
template <typename TyElem,
          bool IsNative = use_native_shuffle<TyElem>::value,
          bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
struct shuffle_dispatch;

template <typename TyElem>
struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};

template <typename TyElem>
struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};

template <typename TyElem>
struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
369
+
370
+ #endif //_CG_CPP11_FEATURES
371
+ };
372
+
373
// Forward declaration of the multi-grid function table; defined further down
// when _CG_HAS_MULTI_GRID_GROUP is available.
namespace multi_grid {
    struct multi_grid_functions;
};

// Grid-scope queries and synchronization used by grid_group.
namespace grid {
    // Grid-wide barrier: one arrival expected per block, tracked in *bar.
    _CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
        unsigned int expected = gridDim.x * gridDim.y * gridDim.z;

        details::sync_grids(expected, bar);
    }

    _CG_STATIC_QUALIFIER unsigned long long num_blocks()
    {
        // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
        // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
        return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
    }

    // Total threads in the grid.
    _CG_STATIC_QUALIFIER unsigned long long num_threads()
    {
        return num_blocks() * cta::num_threads();
    }

    // Linear rank of the calling block within the grid.
    _CG_STATIC_QUALIFIER unsigned long long block_rank()
    {
        return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
    }

    // Linear rank of the calling thread within the whole grid.
    _CG_STATIC_QUALIFIER unsigned long long thread_rank()
    {
        return block_rank() * cta::num_threads() + cta::thread_rank();
    }

    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return dim3(gridDim.x, gridDim.y, gridDim.z);
    }

    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
    }

#if defined(_CG_HAS_CLUSTER_GROUP)
    // Cluster-related grid queries (SM90+ thread block clusters).
    _CG_STATIC_QUALIFIER dim3 dim_clusters() {
        return __clusterGridDimInClusters();
    }

    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
        const dim3 dimClusters = dim_clusters();
        return dimClusters.x * dimClusters.y * dimClusters.z;
    }

    _CG_STATIC_QUALIFIER dim3 cluster_index() {
        return __clusterIdx();
    }

    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
        return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
    }
#endif

    // Legacy aliases
    _CG_STATIC_QUALIFIER unsigned long long size()
    {
        return num_threads();
    }

    _CG_STATIC_QUALIFIER dim3 grid_dim()
    {
        return dim_blocks();
    }
};
446
+
447
+
448
#if defined(_CG_HAS_MULTI_GRID_GROUP)

// Multi-grid (multi-device launch) queries, implemented on the cudaCG* device
// runtime API. Each helper is keyed by the opaque intrinsic handle.
namespace multi_grid {
    _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
    {
        return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
    }

    _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
    {
        // NOTE(review): status is captured but never checked; sync failures
        // are silently ignored here -- confirm this is intentional.
        cudaError_t err = cudaCGSynchronize(handle, 0);
    }

    // Total thread count across all participating grids.
    _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
    {
        unsigned int numThreads = 0;
        cudaCGGetSize(&numThreads, NULL, handle);
        return numThreads;
    }

    // Rank of the calling thread across all participating grids.
    _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
    {
        unsigned int threadRank = 0;
        cudaCGGetRank(&threadRank, NULL, handle);
        return threadRank;
    }

    // Rank of the calling grid within the multi-grid launch.
    _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
    {
        unsigned int gridRank = 0;
        cudaCGGetRank(NULL, &gridRank, handle);
        return gridRank;
    }

    // Number of grids participating in the launch.
    _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
    {
        unsigned int numGrids = 0;
        cudaCGGetSize(NULL, &numGrids, handle);
        return numGrids;
    }

# ifdef _CG_CPP11_FEATURES
    // Function-pointer table over the helpers above, so callers can be
    // decoupled from direct references to the cudaCG* entry points.
    struct multi_grid_functions {
        decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
        decltype(multi_grid::sync) *sync;
        decltype(multi_grid::size) *size;
        decltype(multi_grid::thread_rank) *thread_rank;
        decltype(multi_grid::grid_rank) *grid_rank;
        decltype(multi_grid::num_grids) *num_grids;
    };

    // Returns a pointer to a statically-initialized table in __constant__
    // memory. Templated so the definition stays header-only.
    template <typename = void>
    _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
        __constant__ static const multi_grid_functions mgf {
            &multi_grid::get_intrinsic_handle,
            &multi_grid::sync,
            &multi_grid::size,
            &multi_grid::thread_rank,
            &multi_grid::grid_rank,
            &multi_grid::num_grids
        };

        return &mgf;
    }
# endif
};
#endif
515
+
516
#if defined(_CG_HAS_CLUSTER_GROUP)
// Thread-block-cluster queries and synchronization (SM90+), implemented on
// the __cluster* compiler intrinsics.
namespace cluster {

    // True when the launch specified an explicit cluster dimension.
    _CG_STATIC_QUALIFIER bool isReal()
    {
        return __clusterDimIsSpecified();
    }

    _CG_STATIC_QUALIFIER void barrier_arrive()
    {
        __cluster_barrier_arrive();
    }

    _CG_STATIC_QUALIFIER void barrier_wait()
    {
        __cluster_barrier_wait();
    }

    // Full cluster barrier: arrive then wait (split phases are exposed above
    // so callers can overlap work between them).
    _CG_STATIC_QUALIFIER void sync()
    {
        barrier_arrive();
        barrier_wait();
    }

    // Rank of the cluster block owning the given distributed-shared address.
    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
    {
        return __cluster_query_shared_rank(addr);
    }

    // Maps a shared-memory address into the address space of another block
    // (rank) in the cluster.
    template <typename T>
    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
    {
        return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
    }

    // 3D index of the calling block within its cluster.
    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return __clusterRelativeBlockIdx();
    }

    // Linear rank of the calling block within its cluster.
    _CG_STATIC_QUALIFIER unsigned int block_rank()
    {
        return __clusterRelativeBlockRank();
    }

    // Linear rank of the calling thread within the cluster.
    _CG_STATIC_QUALIFIER unsigned int thread_rank()
    {
        return block_rank() * cta::num_threads() + cta::thread_rank();
    }

    // Cluster dimensions measured in blocks.
    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return __clusterDim();
    }

    _CG_STATIC_QUALIFIER unsigned int num_blocks()
    {
        return __clusterSizeInBlocks();
    }

    // Cluster dimensions measured in threads (blocks * block dims, per axis).
    _CG_STATIC_QUALIFIER dim3 dim_threads()
    {
        const dim3 dimBlocks = dim_blocks();
        const unsigned int x = dimBlocks.x * blockDim.x;
        const unsigned int y = dimBlocks.y * blockDim.y;
        const unsigned int z = dimBlocks.z * blockDim.z;
        return dim3(x, y, z);
    }

    // Total threads in the cluster.
    _CG_STATIC_QUALIFIER unsigned int num_threads()
    {
        return num_blocks() * cta::num_threads();
    }

};
#endif
592
+
593
// Lane index (0-31) of the calling thread within its warp (PTX %laneid).
_CG_STATIC_QUALIFIER unsigned int laneid()
{
    unsigned int laneid;
    asm ("mov.u32 %0, %%laneid;" : "=r"(laneid));
    return laneid;
}

// Mask with exactly the calling lane's bit set (PTX %lanemask_eq).
_CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
{
    unsigned int lanemask32_eq;
    asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
    return (lanemask32_eq);
}

// Mask of all lanes whose index is lower than the caller's (PTX %lanemask_lt).
_CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
{
    unsigned int lanemask32_lt;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
    return (lanemask32_lt);
}
613
+
614
// Abort execution: asserts under _CG_DEBUG, otherwise traps (see _CG_ABORT
// in info.h).
_CG_STATIC_QUALIFIER void abort()
{
    _CG_ABORT();
}

// Compile-time guard (C++11 builds only): Ty must be an integral type or a
// supported floating-point/half type.
template <typename Ty>
_CG_QUALIFIER void assert_if_not_arithmetic() {
#ifdef _CG_CPP11_FEATURES
    static_assert(
        _CG_STL_NAMESPACE::is_integral<Ty>::value ||
        details::is_float_or_half<Ty>::value,
        "Error: Ty is neither integer or float"
    );
#endif
}
629
+
630
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
// Staging channel used to move a value between warps of a block in
// channel_size-byte chunks. channel_ptr is the staging buffer, sync_location
// the shared warp-barrier state for all numWarps warps.
// NOTE(review): the buffer's memory space (presumably shared memory) is
// established by the allocating caller, not visible here.
template <unsigned int numWarps>
struct copy_channel {
    char* channel_ptr;
    barrier_t* sync_location;
    size_t channel_size;

    // One warp sending to all other warps, it has to wait for all other warps.
    struct send_many_to_many {
        _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_all_other_warps;
        _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
            __syncwarp(0xFFFFFFFF);
            details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
        }
    };

    // One warp is receiving and all other warps are sending to that warp, they have to wait for that one warp.
    struct send_many_to_one {
        _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_specific_warp;
        _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
            // Wait for all warps to finish and let the last warp release all threads.
            if (details::sync_warps_last_releases(sync_location, cta::thread_rank(), numWarps)) {
                details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
            }
        }
    };

    // Streams ValSize bytes from val_ptr through the channel, one
    // channel_size chunk per iteration. Within a chunk each of the ThreadCnt
    // threads copies interleaved sizeof(int)-sized pieces.
    template <unsigned int ThreadCnt, size_t ValSize, typename SendDetails>
    _CG_QUALIFIER void _send_value_internal(char* val_ptr, unsigned int thread_idx, unsigned int warp_id) {
        size_t thread_offset = thread_idx * sizeof(int);

        for (size_t i = 0; i < ValSize; i += channel_size) {
            size_t bytes_left = ValSize - i;
            size_t copy_chunk = min(bytes_left, channel_size);

            // Wait until the receiver(s) are ready for this chunk.
            details::sync_warps_wait_for_warps<SendDetails::wait_kind>(warp_id, sync_location, cta::thread_rank(), numWarps);
            #pragma unroll 1
            for (size_t j = thread_offset; j < copy_chunk ; j += sizeof(int) * ThreadCnt) {
                size_t my_bytes_left = copy_chunk - j;
                memcpy(channel_ptr + j, val_ptr + i + j, min(my_bytes_left, sizeof(int)));
            }
            // Publish the chunk per the chosen send policy.
            SendDetails::post_iter_release(thread_idx, sync_location);
        }
    }


    // Type-safe wrapper over _send_value_internal.
    template <typename TyVal, unsigned int ThreadCnt, typename SendDetails>
    _CG_QUALIFIER void send_value(TyVal& val, unsigned int thread_idx, unsigned int warp_id) {
        _send_value_internal<ThreadCnt, sizeof(TyVal), SendDetails>(reinterpret_cast<char*>(&val), thread_idx, warp_id);
    }

    // Receives ValSize bytes from the channel chunk by chunk; threads with
    // active == false participate in synchronization but skip the copy.
    template <size_t ValSize>
    _CG_QUALIFIER void _receive_value_internal(char* val_ptr, bool warp_master, bool active) {
        for (size_t i = 0; i < ValSize; i += channel_size) {
            size_t bytes_left = ValSize - i;
            details::sync_warps_wait_for_release(sync_location, warp_master, cta::thread_rank(), numWarps);
            if (active) {
                memcpy(val_ptr + i, channel_ptr, min(bytes_left, channel_size));
            }
        }
    }

    // Type-safe wrapper over _receive_value_internal.
    template <typename TyVal>
    _CG_QUALIFIER void receive_value(TyVal& val, bool warp_master, bool active = true) {
        _receive_value_internal<sizeof(TyVal)>(reinterpret_cast<char*>(&val), warp_master, active);
    }
};

// Compile-time floor(log2(x)); recursion terminates only for x >= 1.
_CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
    return x == 1 ? 0 : 1 + log2(x / 2);
}
#endif //_CG_CPP11_FEATURES
702
+
703
+ }; // !Namespace internal
704
+
705
+ _CG_END_NAMESPACE
706
+
707
+ #endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+
50
+
51
+ #ifndef _CG_INFO_H_
52
+ #define _CG_INFO_H_
53
+ /*
54
+ ** Define: _CG_VERSION
55
+ */
56
+ #define _CG_VERSION 1000
57
+
58
+ /*
59
+ ** Define: _CG_ABI_VERSION
60
+ */
61
+ #ifndef _CG_ABI_VERSION
62
+ # define _CG_ABI_VERSION 1
63
+ #endif
64
+
65
+ /*
66
+ ** Define: _CG_ABI_EXPERIMENTAL
67
+ ** Desc: If enabled, sets all features enabled (ABI-breaking or experimental)
68
+ */
69
+ #if defined(_CG_ABI_EXPERIMENTAL)
70
+ #endif
71
+
72
+ #define _CG_CONCAT_INNER(x, y) x ## y
73
+ #define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
74
+ #define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
75
+
76
+ #define _CG_BEGIN_NAMESPACE \
77
+ namespace cooperative_groups { namespace _CG_NAMESPACE {
78
+ #define _CG_END_NAMESPACE \
79
+ }; using namespace _CG_NAMESPACE; };
80
+
81
+ #if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
82
+ # define _CG_CPP11_FEATURES
83
+ #endif
84
+
85
+ #if !defined(_CG_QUALIFIER)
86
+ # define _CG_QUALIFIER __forceinline__ __device__
87
+ #endif
88
+ #if !defined(_CG_STATIC_QUALIFIER)
89
+ # define _CG_STATIC_QUALIFIER static __forceinline__ __device__
90
+ #endif
91
+ #if !defined(_CG_CONSTEXPR_QUALIFIER)
92
+ # if defined(_CG_CPP11_FEATURES)
93
+ # define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
94
+ # else
95
+ # define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
96
+ # endif
97
+ #endif
98
+ #if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
99
+ # if defined(_CG_CPP11_FEATURES)
100
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
101
+ # else
102
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
103
+ # endif
104
+ #endif
105
+
106
+ #if defined(_MSC_VER)
107
+ # define _CG_DEPRECATED __declspec(deprecated)
108
+ #else
109
+ # define _CG_DEPRECATED __attribute__((deprecated))
110
+ #endif
111
+
112
+ #if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
113
+ # define _CG_HAS_GRID_GROUP
114
+ #endif
115
+ #if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
116
+ # define _CG_HAS_MULTI_GRID_GROUP
117
+ #endif
118
+ #if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
119
+ # define _CG_HAS_MATCH_COLLECTIVE
120
+ #endif
121
+ #if ((__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)) && (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE))
122
+ # define _CG_HAS_CLUSTER_GROUP
123
+ #endif
124
+ // Has __half and __half2
125
+ // Only usable if you include the cuda_fp16.h extension, and
126
+ // _before_ including cooperative_groups.h
127
+ #ifdef __CUDA_FP16_TYPES_EXIST__
128
+ # define _CG_HAS_FP16_COLLECTIVE
129
+ #endif
130
+
131
+ #if (__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__) && (defined(__NVCC__) || defined(__CUDACC_RTC__))
132
+ # define _CG_HAS_OP_REDUX
133
+ #endif
134
+
135
+ // Include libcu++ where supported.
136
+ #if defined(_CG_CPP11_FEATURES) && !defined(__QNX__) && !defined(__ibmxl__) && \
137
+ (defined(__NVCC__) || defined(__CUDACC_RTC__)) && \
138
+ (defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__)|| defined(_M_X64) || defined(_M_ARM64)) && \
139
+ (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__))
140
+ # define _CG_USE_CUDA_STL
141
+ #else
142
+ # define _CG_USE_OWN_TRAITS
143
+ #endif
144
+
145
+ #if defined(_CG_USE_CUDA_STL) && (!defined(__CUDA_ARCH__) || \
146
+ ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
147
+ # define _CG_HAS_STL_ATOMICS
148
+ #endif
149
+
150
+ #ifdef _CG_CPP11_FEATURES
151
+ // Use cuda::std:: for type_traits
152
+ # if defined(_CG_USE_CUDA_STL)
153
+ # define _CG_STL_NAMESPACE cuda::std
154
+ # include <cuda/std/type_traits>
155
+ // Use CG's implementation of type traits
156
+ # else
157
+ # define _CG_STL_NAMESPACE cooperative_groups::details::templates
158
+ # endif
159
+ #endif
160
+
161
+ #ifdef _CG_CPP11_FEATURES
162
+ # define _CG_STATIC_CONST_DECL static constexpr
163
+ # define _CG_CONST_DECL constexpr
164
+ #else
165
+ # define _CG_STATIC_CONST_DECL static const
166
+ # define _CG_CONST_DECL const
167
+ #endif
168
+
169
+ #if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
170
+ # define _CG_ASM_PTR_CONSTRAINT "r"
171
+ #else
172
+ # define _CG_ASM_PTR_CONSTRAINT "l"
173
+ #endif
174
+
175
+ /*
176
+ ** Define: CG_DEBUG
177
+ ** What: Enables various runtime safety checks
178
+ */
179
+ #if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
180
+ # define _CG_DEBUG
181
+ #endif
182
+
183
+ #if defined(_CG_DEBUG)
184
+ # include <assert.h>
185
+ # define _CG_ASSERT(x) assert((x));
186
+ # define _CG_ABORT() assert(0);
187
+ #else
188
+ # define _CG_ASSERT(x)
189
+ # define _CG_ABORT() __trap();
190
+ #endif
191
+
192
+ #if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
193
+ _CG_BEGIN_NAMESPACE
194
+
195
+ namespace details {
196
+ namespace templates {
197
+
198
+ /**
199
+ * Integral constants
200
+ **/
201
+ template <typename Ty, Ty Val>
202
+ struct integral_constant {
203
+ static constexpr Ty value = Val;
204
+ typedef Ty type;
205
+
206
+ _CG_QUALIFIER constexpr operator type() const noexcept { return value; }
207
+ _CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
208
+ };
209
+
210
+ typedef integral_constant<bool, true> true_type;
211
+ typedef integral_constant<bool, false> false_type;
212
+
213
+ /**
214
+ * CV Qualifiers
215
+ **/
216
+ template <class Ty> struct is_lvalue_reference : public details::templates::false_type {};
217
+ template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {};
218
+
219
+ template <class Ty> struct remove_reference {typedef Ty type;};
220
+ template <class Ty> struct remove_reference<Ty&> {typedef Ty type;};
221
+ template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
222
+
223
+ template <class Ty>
224
+ using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
225
+
226
+ template <class Ty> struct remove_const {typedef Ty type;};
227
+ template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
228
+
229
+ template <class Ty> struct remove_volatile {typedef Ty type;};
230
+ template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
231
+
232
+ template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
233
+
234
+ template <class Ty>
235
+ using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
236
+
237
+ template <class Ty>
238
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept {
239
+ return static_cast<Ty&&>(t);
240
+ }
241
+
242
+ template <class Ty>
243
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept {
244
+ static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
245
+ return static_cast<Ty&&>(t);
246
+ }
247
+
248
+ /**
249
+ * is_integral
250
+ **/
251
+ template <class Ty> struct _is_integral : public details::templates::false_type {};
252
+ template <> struct _is_integral<bool> : public details::templates::true_type {};
253
+ template <> struct _is_integral<char> : public details::templates::true_type {};
254
+ template <> struct _is_integral<unsigned char> : public details::templates::true_type {};
255
+ template <> struct _is_integral<short> : public details::templates::true_type {};
256
+ template <> struct _is_integral<unsigned short> : public details::templates::true_type {};
257
+ template <> struct _is_integral<int> : public details::templates::true_type {};
258
+ template <> struct _is_integral<unsigned int> : public details::templates::true_type {};
259
+ template <> struct _is_integral<long> : public details::templates::true_type {};
260
+ template <> struct _is_integral<long long> : public details::templates::true_type {};
261
+ template <> struct _is_integral<unsigned long> : public details::templates::true_type {};
262
+ template <> struct _is_integral<unsigned long long> : public details::templates::true_type {};
263
+ //Vector type support?
264
+
265
+ template <typename Ty>
266
+ struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
267
+
268
+ /**
269
+ * is_floating_point
270
+ **/
271
+ template <class Ty> struct _is_floating_point : public details::templates::false_type {};
272
+ template <> struct _is_floating_point<float> : public details::templates::true_type {};
273
+ template <> struct _is_floating_point<double> : public details::templates::true_type {};
274
+ template <> struct _is_floating_point<long double> : public details::templates::true_type {};
275
+ # ifdef __CUDA_FP16_TYPES_EXIST__
276
+ template <> struct _is_floating_point<__half> : public details::templates::true_type {};
277
+ template <> struct _is_floating_point<__half2> : public details::templates::true_type {};
278
+ # endif
279
+ //Vector type support?
280
+
281
+ template <typename Ty>
282
+ struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
283
+
284
+ template <class T>
285
+ struct is_arithmetic : details::templates::integral_constant<
286
+ bool,
287
+ details::templates::is_integral<T>::value ||
288
+ details::templates::is_floating_point<T>::value> {};
289
+
290
+ template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
291
+ struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {};
292
+
293
+ template <typename Ty>
294
+ struct _is_unsigned<Ty,false> : details::templates::false_type {};
295
+
296
+ template <typename Ty>
297
+ struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
298
+
299
+ /**
300
+ * programmatic type traits
301
+ **/
302
+ template<bool B, class Ty = void>
303
+ struct enable_if {};
304
+
305
+ template<class Ty>
306
+ struct enable_if<true, Ty> { typedef Ty type; };
307
+
308
+ template<bool Cond, typename Ty = void>
309
+ using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
310
+
311
+ template<class Ty1, class Ty2>
312
+ struct is_same : details::templates::false_type {};
313
+
314
+ template<class Ty>
315
+ struct is_same<Ty, Ty> : details::templates::true_type {};
316
+
317
+ } // templates
318
+ } // details
319
+ _CG_END_NAMESPACE
320
+
321
+ #endif // _CG_CPP11_FEATURES
322
+
323
+ #endif // _CG_INFO_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CG_PARTITIONING_H
51
+ #define _CG_PARTITIONING_H
52
+
53
+ #include "info.h"
54
+ #include "helpers.h"
55
+
56
+ _CG_BEGIN_NAMESPACE
57
+
58
+ namespace details {
59
+
60
+ template <typename TyGroup>
61
+ _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
62
+ const unsigned int fullMask = ~0u;
63
+
64
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
65
+ unsigned int predMask = pred ? 0 : fullMask;
66
+ unsigned int setMask = __ballot_sync(thisMask, pred);
67
+
68
+ if (setMask == thisMask || setMask == 0) {
69
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
70
+ _coalesced_group_data_access::modify_meta_group(subTile, 0, 1);
71
+ return subTile;
72
+ }
73
+ else {
74
+ unsigned int subMask = thisMask & (setMask ^ predMask);
75
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
76
+ _coalesced_group_data_access::modify_meta_group(subTile, pred, 2);
77
+ return subTile;
78
+ }
79
+ }
80
+
81
+ #ifdef _CG_HAS_MATCH_COLLECTIVE
82
+ template <typename TyGroup, typename TyPredicate>
83
+ _CG_STATIC_QUALIFIER coalesced_group _labeled_partition(const TyGroup &tile, TyPredicate pred) {
84
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
85
+ unsigned int thisBias = __ffs(thisMask) - 1; // Subtract 1 to index properly from [1-32]
86
+ unsigned int subMask = __match_any_sync(thisMask, pred);
87
+
88
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
89
+
90
+ int leaderLaneId = subTile.shfl(details::laneid(), 0);
91
+
92
+ bool isLeader = !subTile.thread_rank();
93
+ unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
94
+ unsigned int tileRank = __fns(leaderMask, leaderLaneId, 0) - thisBias;
95
+
96
+ _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
97
+
98
+ return subTile;
99
+ }
100
+ #endif
101
+ }; // namespace details
102
+
103
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
104
+ return details::_binary_partition(tile, pred);
105
+ }
106
+
107
+ template <unsigned int Size, typename ParentT>
108
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
109
+ #ifdef _CG_CPP11_FEATURES
110
+ static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
111
+ #endif
112
+ return details::_binary_partition(tile, pred);
113
+ }
114
+
115
+
116
+ #if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
117
+ template <typename TyPredicate>
118
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
119
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
120
+ return details::_labeled_partition(tile, pred);
121
+ }
122
+
123
+ template <typename TyPredicate, unsigned int Size, typename ParentT>
124
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
125
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
126
+ static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
127
+ return details::_labeled_partition(tile, pred);
128
+ }
129
+ #endif
130
+
131
+ _CG_END_NAMESPACE
132
+
133
+ #endif // _CG_PARTITIONING_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_REDUCE_H_
50
+ #define _CG_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "coalesced_reduce.h"
55
+ #include "functional.h"
56
+ #include "cooperative_groups.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <class Ty>
63
+ using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
64
+ bool,
65
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
66
+
67
+ template <class Ty>
68
+ using redux_is_add_supported = _redux_is_add_supported<Ty>;
69
+
70
+ // A specialization for 64 bit logical operations is possible
71
+ // but for now only accelerate 32 bit bitwise ops
72
+ template <class Ty>
73
+ using redux_is_logical_supported = redux_is_add_supported<Ty>;
74
+
75
+ // Base operator support case
76
+ template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
77
+ #ifdef _CG_HAS_OP_REDUX
78
+ template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
79
+ template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
80
+ template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
81
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
82
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
83
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
84
+ #endif
85
+
86
+ template <class Ty, template <class> class TyOp>
87
+ using redux_op_supported = _redux_op_supported<
88
+ typename details::remove_qual<TyOp<Ty>>,
89
+ Ty>;
90
+
91
+ // Groups smaller than 16 actually have worse performance characteristics when used with redux
92
+ // tiles of size 16 and 32 perform the same or better and have better code generation profiles
93
+ template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
94
+
95
+ template <unsigned int Sz, typename TyPar>
96
+ struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
97
+ bool,
98
+ (Sz >= 16)> {};
99
+ template <unsigned int Sz, typename TyPar>
100
+ struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
101
+ bool,
102
+ (Sz >= 16)> {};
103
+ template <>
104
+ struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
105
+
106
+ template <typename TyGroup>
107
+ using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
108
+
109
+ template <template <class> class TyOp>
110
+ _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
111
+ template <template <class> class TyOp>
112
+ _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
113
+
114
+ #ifdef _CG_HAS_OP_REDUX
115
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
116
+ return __reduce_add_sync(mask, val);
117
+ }
118
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
119
+ return __reduce_min_sync(mask, val);
120
+ }
121
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
122
+ return __reduce_max_sync(mask, val);
123
+ }
124
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
125
+ return __reduce_and_sync(mask, val);
126
+ }
127
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
128
+ return __reduce_xor_sync(mask, val);
129
+ }
130
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
131
+ return __reduce_or_sync(mask, val);
132
+ }
133
+
134
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
135
+ return __reduce_add_sync(mask, val);
136
+ }
137
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
138
+ return __reduce_min_sync(mask, val);
139
+ }
140
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
141
+ return __reduce_max_sync(mask, val);
142
+ }
143
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
144
+ return __reduce_and_sync(mask, val);
145
+ }
146
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
147
+ return __reduce_xor_sync(mask, val);
148
+ }
149
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
150
+ return __reduce_or_sync(mask, val);
151
+ }
152
+ #endif
153
+
154
+
155
+ template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
156
+ struct _accelerated_op;
157
+
158
+ // Signed type redux intrinsic dispatch
159
+ template <typename TyVal>
160
+ struct _accelerated_op<TyVal, false> {
161
+ template <template <class> class TyOp>
162
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
163
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
164
+ }
165
+ };
166
+
167
+ // Unsigned type redux intrinsic dispatch
168
+ template <typename TyVal>
169
+ struct _accelerated_op<TyVal, true> {
170
+ template <template <class> class TyOp>
171
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
172
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
173
+ }
174
+ };
175
+
176
+ template <typename TyVal>
177
+ using accelerated_op = _accelerated_op<TyVal>;
178
+
179
+
180
+ template <typename TyVal, typename TyFnInput, typename TyGroup>
181
+ class _redux_dispatch {
182
+ template <class Ty, template <class> class TyOp>
183
+ using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
184
+ redux_op_supported<Ty, TyOp>::value &&
185
+ redux_group_optimized<TyGroup>::value>;
186
+
187
+ template <class Ty, template <class> class TyOp>
188
+ using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
189
+
190
+ template <class Ty, template <class> class TyOp>
191
+ using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
192
+
193
+ public:
194
+ // Dispatch to redux if the combination of op and args are supported
195
+ template<
196
+ template <class> class TyOp,
197
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
198
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
199
+ // Retrieve the mask for the group and dispatch to redux
200
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
201
+ }
202
+
203
+ template<
204
+ template <class> class TyOp,
205
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
206
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
207
+ // Retrieve the mask for the group and dispatch to redux
208
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
209
+ }
210
+
211
+ // Fallback shuffle sync reduction
212
+ template <
213
+ template <class> class TyOp,
214
+ redux_is_not_usable<TyFnInput, TyOp> = nullptr>
215
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
216
+ //Dispatch to fallback shuffle sync accelerated reduction
217
+ return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
218
+ }
219
+
220
+ };
221
+
222
+ // Group support for reduce.
223
+ template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
224
+
225
+ template <unsigned int Sz, typename TyPar>
226
+ struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
227
+ template <unsigned int Sz, typename TyPar>
228
+ struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
229
+ template <>
230
+ struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
231
+
232
+ template <typename TyGroup>
233
+ using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
234
+
235
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
236
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
237
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
238
+
239
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
240
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
241
+ }
242
+
243
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
244
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
245
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
246
+
247
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
248
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
249
+ }
250
+
251
+
252
+ template <typename TyVal, typename TyOp, typename TyGroup>
253
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
254
+ return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
255
+ }
256
+
257
+ template <unsigned int GroupId>
258
+ struct tile_reduce_dispatch;
259
+
260
+ template <>
261
+ struct tile_reduce_dispatch<details::coalesced_group_id> {
262
+ template <typename TyGroup, typename TyVal, typename TyFn>
263
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
264
+ return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
265
+ }
266
+ };
267
+
268
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
269
+ template <>
270
+ struct tile_reduce_dispatch<details::multi_tile_group_id> {
271
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
272
+ _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
273
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
274
+ using TyRet = details::remove_qual<TyVal>;
275
+ const unsigned int num_warps = Size / 32;
276
+
277
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
278
+ *warp_scratch_location =
279
+ details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
280
+ };
281
+ auto inter_warp_lambda =
282
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
283
+ *thread_scratch_location =
284
+ details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
285
+ };
286
+ return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
287
+ }
288
+ };
289
+
290
+ enum class AsyncReduceType { store, update };
291
+
292
+ template <AsyncReduceType TyAsyncReduce>
293
+ struct async_reduce_result_handler;
294
+
295
+ template<>
296
+ struct async_reduce_result_handler<AsyncReduceType::store> {
297
+ template<typename TyDst, typename TyVal, typename TyOp>
298
+ _CG_STATIC_QUALIFIER void handleResult(TyDst *dst, TyVal& result, TyOp&& op) {
299
+ *dst = result;
300
+ }
301
+ };
302
+
303
+ #if defined(_CG_HAS_STL_ATOMICS)
304
+ template<>
305
+ struct async_reduce_result_handler<AsyncReduceType::update> {
306
+ template<typename TyDst, typename TyVal, typename TyOp>
307
+ _CG_STATIC_QUALIFIER void handleResult(TyDst& dst, TyVal& result, TyOp&& op) {
308
+ atomic_update(dst, result, _CG_STL_NAMESPACE::forward<TyOp>(op));
309
+ }
310
+ };
311
+ #endif
312
+
313
+ template <unsigned int GroupId, AsyncReduceType TyAsyncReduce>
314
+ struct tile_async_reduce_dispatch;
315
+
316
+ template <AsyncReduceType TyAsyncReduce>
317
+ struct tile_async_reduce_dispatch<details::coalesced_group_id, TyAsyncReduce> {
318
+ template <unsigned int TySize, typename ParentT, typename TyDst, typename TyVal, typename TyFn>
319
+ _CG_STATIC_QUALIFIER void reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyVal&& val, TyFn&& op) {
320
+ // Do regular, in group reduction
321
+ auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
322
+
323
+ // One thread stores/updates the destination
324
+ if (group.thread_rank() == 0) {
325
+ async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
326
+ }
327
+ }
328
+
329
+ template <typename TyDst, typename TyVal, typename TyFn>
330
+ _CG_STATIC_QUALIFIER void reduce(const coalesced_group& group, TyDst& dst, TyVal&& val, TyFn&& op) {
331
+ // Do in group reduction to the last thread
332
+ auto result = details::coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
333
+
334
+ // One thread stores/updates the destination
335
+ if (group.thread_rank() == group.size() - 1) {
336
+ async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
337
+ }
338
+ }
339
+ };
340
+
341
+ template <AsyncReduceType TyAsyncReduce>
342
+ struct tile_async_reduce_dispatch<details::multi_tile_group_id, TyAsyncReduce> {
343
+ template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn>
344
+ _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op) {
345
+ using TyVal = remove_qual<TyInputVal>;
346
+ const unsigned int num_warps = TySize / 32;
347
+ details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
348
+ auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
349
+
350
+ // Do in warp reduce
351
+ auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
352
+ *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
353
+
354
+ // Tile of size num_warps from the last warp to arrive does final reduction step
355
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
356
+ auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
357
+ if (subwarp.meta_group_rank() == 0) {
358
+ auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
359
+ auto thread_val = *thread_scratch_location;
360
+ // Release other warps, we read their contribution already.
361
+ subwarp.sync();
362
+ details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
363
+ TyVal result = details::reduce(subwarp, thread_val, op);
364
+ // One thread stores the result or updates the atomic
365
+ if (subwarp.thread_rank() == 0) {
366
+ async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
367
+ }
368
+ }
369
+ warp.sync();
370
+ }
371
+ }
372
+ };
373
+ #endif
374
+
375
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
376
+ _CG_QUALIFIER void check_reduce_params() {
377
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
378
+ static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
379
+ };
380
+
381
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
382
+ _CG_QUALIFIER void check_async_reduce_params() {
383
+ check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
384
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
385
+ }
386
+ } // details
387
+
388
+ template <typename TyGroup, typename TyVal, typename TyFn>
389
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
390
+ details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
391
+
392
+ using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
393
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
394
+ }
395
+
396
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
397
+ namespace experimental {
398
+
399
+ #if defined(_CG_HAS_STL_ATOMICS)
400
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
401
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
402
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
403
+
404
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
405
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
406
+ }
407
+
408
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
409
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
410
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
411
+
412
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
413
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
414
+ }
415
+ #endif
416
+
417
+ template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
418
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
419
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
420
+
421
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::store>;
422
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
423
+ }
424
+
425
+ }
426
+ #endif
427
+
428
+ _CG_END_NAMESPACE
429
+
430
+ #endif // _CG_REDUCE_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_SCAN_H_
50
+ #define _CG_SCAN_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "functional.h"
55
+ #include "coalesced_scan.h"
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+
61
+ // Group support for scan.
62
+ template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
66
+ template <unsigned int Sz, typename TyPar>
67
+ struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
68
+ template <>
69
+ struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
70
+
71
+ template <typename TyGroup>
72
+ using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
73
+
74
+ template <bool IsIntegralPlus>
75
+ struct integral_optimized_scan;
76
+
77
+ enum class ScanType { exclusive, inclusive };
78
+
79
+ template <unsigned int GroupId, ScanType TyScan>
80
+ struct scan_dispatch;
81
+
82
+ template <ScanType TyScan>
83
+ struct scan_dispatch<details::coalesced_group_id, TyScan> {
84
+ template <typename TyGroup, typename TyVal, typename TyFn>
85
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
86
+ auto scan_result = coalesced_inclusive_scan(group, val, op);
87
+ if (TyScan == ScanType::exclusive) {
88
+ scan_result = convert_inclusive_to_exclusive(group,
89
+ scan_result,
90
+ _CG_STL_NAMESPACE::forward<TyVal>(val),
91
+ _CG_STL_NAMESPACE::forward<TyFn>(op));
92
+ }
93
+ return scan_result;
94
+ }
95
+ };
96
+
97
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
98
+ template <ScanType TyScan>
99
+ struct scan_dispatch<details::multi_tile_group_id, TyScan> {
100
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
101
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
102
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
103
+ using TyRet = details::remove_qual<TyVal>;
104
+ const unsigned int num_warps = Size / 32;
105
+ // In warp scan result, calculated in warp_lambda
106
+ TyRet warp_scan;
107
+
108
+ // In warp scan, put sum in the warp_scratch_location
109
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
110
+ warp_scan =
111
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
112
+ if (warp.thread_rank() + 1 == warp.size()) {
113
+ *warp_scratch_location = warp_scan;
114
+ }
115
+ if (TyScan == ScanType::exclusive) {
116
+ warp_scan = warp.shfl_up(warp_scan, 1);
117
+ }
118
+ };
119
+
120
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
121
+ // to its in-warp scan result
122
+ auto inter_warp_lambda =
123
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
124
+ auto thread_val = *thread_scratch_location;
125
+ auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
126
+ *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
127
+ };
128
+
129
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
130
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
131
+ return previous_warps_sum;
132
+ }
133
+ if (warpType::meta_group_rank() == 0) {
134
+ return warp_scan;
135
+ }
136
+ else {
137
+ return op(warp_scan, previous_warps_sum);
138
+ }
139
+ }
140
+ };
141
+
142
+ #if defined(_CG_HAS_STL_ATOMICS)
143
+ template <unsigned int GroupId, ScanType TyScan>
144
+ struct scan_update_dispatch;
145
+
146
+ template <ScanType TyScan>
147
+ struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
148
+ template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
149
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
150
+ details::remove_qual<TyVal> old;
151
+
152
+ // Do regular in group scan
153
+ auto scan_result = details::coalesced_inclusive_scan(group, val, op);
154
+
155
+ // Last thread updates the atomic and distributes its old value to other threads
156
+ if (group.thread_rank() == group.size() - 1) {
157
+ old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
158
+ }
159
+ old = group.shfl(old, group.size() - 1);
160
+ if (TyScan == ScanType::exclusive) {
161
+ scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
162
+ }
163
+ scan_result = op(old, scan_result);
164
+ return scan_result;
165
+ }
166
+ };
167
+
168
+ template <ScanType TyScan>
169
+ struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
170
+ template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
171
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
172
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
173
+ using TyRet = details::remove_qual<TyVal>;
174
+ const unsigned int num_warps = Size / 32;
175
+ // In warp scan result, calculated in warp_lambda
176
+ TyRet warp_scan;
177
+
178
+ // In warp scan, put sum in the warp_scratch_location
179
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
180
+ warp_scan =
181
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
182
+ if (warp.thread_rank() + 1 == warp.size()) {
183
+ *warp_scratch_location = warp_scan;
184
+ }
185
+ if (TyScan == ScanType::exclusive) {
186
+ warp_scan = warp.shfl_up(warp_scan, 1);
187
+ }
188
+ };
189
+
190
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
191
+ // to its in-warp scan result
192
+ auto inter_warp_lambda =
193
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
194
+ auto thread_val = *thread_scratch_location;
195
+ auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
196
+ TyRet offset;
197
+ // Single thread does the atomic update with sum of all contributions and reads the old value.
198
+ if (subwarp.thread_rank() == subwarp.size() - 1) {
199
+ offset = details::atomic_update(dst, scan_result, op);
200
+ }
201
+ offset = subwarp.shfl(offset, subwarp.size() - 1);
202
+ scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
203
+ // Add offset read from the atomic to the scanned warp sum.
204
+ // Skipping first thread, since it got defautly constructed value from the conversion,
205
+ // it should just return the offset received from the thread that did the atomic update.
206
+ if (subwarp.thread_rank() != 0) {
207
+ offset = op(scan_result, offset);
208
+ }
209
+ *thread_scratch_location = offset;
210
+ };
211
+
212
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
213
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
214
+ return previous_warps_sum;
215
+ }
216
+ return op(warp_scan, previous_warps_sum);
217
+ }
218
+ };
219
+ #endif
220
+ #endif
221
+
222
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
223
+ _CG_QUALIFIER void check_scan_params() {
224
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
225
+ static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
226
+ }
227
+
228
+ #if defined(_CG_HAS_STL_ATOMICS)
229
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
230
+ _CG_QUALIFIER void check_scan_update_params() {
231
+ check_scan_params<TyGroup, TyInputVal, TyRetVal>();
232
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
233
+ }
234
+ #endif
235
+
236
+ } // details
237
+
238
+ template <typename TyGroup, typename TyVal, typename TyFn>
239
+ _CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
240
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
241
+
242
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
243
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
244
+ }
245
+
246
+ template <typename TyGroup, typename TyVal>
247
+ _CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
248
+ return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
249
+ }
250
+
251
+ template <typename TyGroup, typename TyVal, typename TyFn>
252
+ _CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
253
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
254
+
255
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
256
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
257
+ }
258
+
259
+ template <typename TyGroup, typename TyVal>
260
+ _CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
261
+ return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
262
+ }
263
+
264
+ #if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
265
+
266
+ namespace experimental {
267
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
268
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
269
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
270
+
271
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
272
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
273
+ }
274
+
275
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
276
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
277
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
278
+ }
279
+
280
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
281
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
282
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
283
+
284
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
285
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
286
+ }
287
+
288
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
289
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
290
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
291
+ }
292
+
293
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
294
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
295
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
296
+
297
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
298
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
299
+ }
300
+
301
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
302
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
303
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
304
+ }
305
+
306
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
307
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
308
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
309
+
310
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
311
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
312
+ }
313
+
314
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
315
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
316
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
317
+ }
318
+ }
319
+
320
+ #endif
321
+
322
+ _CG_END_NAMESPACE
323
+
324
+ #endif // _CG_SCAN_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
50
+ #define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ # include "details/async.h"
57
+ #else
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif
61
+
62
+ #endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_REDUCE_H
50
+ #define _COOPERATIVE_GROUPS_REDUCE_H
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ # include "details/reduce.h"
57
+ #else
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif
61
+
62
+
63
+ #endif //_COOPERATIVE_GROUPS_REDUCE_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAEGLTYPEDEFS_H
51
+ #define CUDAEGLTYPEDEFS_H
52
+
53
+ #include <cudaEGL.h>
54
+
55
+ #ifdef __cplusplus
56
+ extern "C" {
57
+ #endif // __cplusplus
58
+
59
+ /*
60
+ * Macros for the latest version for each driver function in cudaEGL.h
61
+ */
62
+ #define PFN_cuGraphicsEGLRegisterImage PFN_cuGraphicsEGLRegisterImage_v7000
63
+ #define PFN_cuEGLStreamConsumerConnect PFN_cuEGLStreamConsumerConnect_v7000
64
+ #define PFN_cuEGLStreamConsumerConnectWithFlags PFN_cuEGLStreamConsumerConnectWithFlags_v8000
65
+ #define PFN_cuEGLStreamConsumerDisconnect PFN_cuEGLStreamConsumerDisconnect_v7000
66
+ #define PFN_cuEGLStreamConsumerAcquireFrame PFN_cuEGLStreamConsumerAcquireFrame_v7000
67
+ #define PFN_cuEGLStreamConsumerReleaseFrame PFN_cuEGLStreamConsumerReleaseFrame_v7000
68
+ #define PFN_cuEGLStreamProducerConnect PFN_cuEGLStreamProducerConnect_v7000
69
+ #define PFN_cuEGLStreamProducerDisconnect PFN_cuEGLStreamProducerDisconnect_v7000
70
+ #define PFN_cuEGLStreamProducerPresentFrame PFN_cuEGLStreamProducerPresentFrame_v7000
71
+ #define PFN_cuEGLStreamProducerReturnFrame PFN_cuEGLStreamProducerReturnFrame_v7000
72
+ #define PFN_cuGraphicsResourceGetMappedEglFrame PFN_cuGraphicsResourceGetMappedEglFrame_v7000
73
+ #define PFN_cuEventCreateFromEGLSync PFN_cuEventCreateFromEGLSync_v9000
74
+
75
+
76
+ /**
77
+ * Type definitions for functions defined in cudaEGL.h
78
+ */
79
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsEGLRegisterImage_v7000)(CUgraphicsResource CUDAAPI *pCudaResource, EGLImageKHR image, unsigned int flags);
80
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream);
81
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnectWithFlags_v8000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, unsigned int flags);
82
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
83
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerAcquireFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource CUDAAPI *pCudaResource, CUstream CUDAAPI *pStream, unsigned int timeout);
84
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerReleaseFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource pCudaResource, CUstream CUDAAPI *pStream);
85
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, EGLint width, EGLint height);
86
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
87
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerPresentFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 eglframe, CUstream CUDAAPI *pStream);
88
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerReturnFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 CUDAAPI *eglframe, CUstream CUDAAPI *pStream);
89
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedEglFrame_v7000)(CUeglFrame_v1 CUDAAPI *eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
90
+ typedef CUresult (CUDAAPI *PFN_cuEventCreateFromEGLSync_v9000)(CUevent CUDAAPI *phEvent, EGLSyncKHR eglSync, unsigned int flags);
91
+
92
+ #ifdef __cplusplus
93
+ }
94
+ #endif // __cplusplus
95
+
96
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAGLTYPEDEFS_H
51
+ #define CUDAGLTYPEDEFS_H
52
+
53
+ // Dependent includes for cudagl.h
54
+ #include <GL/gl.h>
55
+
56
+ #include <cudaGL.h>
57
+
58
+ #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
59
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
60
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
61
+ #else
62
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
63
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
64
+ #endif
65
+
66
+ #ifdef __cplusplus
67
+ extern "C" {
68
+ #endif // __cplusplus
69
+
70
+ /*
71
+ * Macros for the latest version for each driver function in cudaGL.h
72
+ */
73
+ #define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
74
+ #define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
75
+ #define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
76
+ #define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
77
+ #define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
78
+ #define PFN_cuGLInit PFN_cuGLInit_v2000
79
+ #define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
80
+ #define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
81
+ #define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
82
+ #define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
83
+ #define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
84
+ #define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
85
+ #define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
86
+
87
+
88
+ /**
89
+ * Type definitions for functions defined in cudaGL.h
90
+ */
91
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
92
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
93
+ #ifdef _WIN32
94
+ typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
95
+ #endif
96
+ typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
97
+ typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
98
+ typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
99
+ typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
100
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
101
+ typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
102
+ typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
103
+ typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
104
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
105
+ typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
106
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
107
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
108
+
109
+ /*
110
+ * Type definitions for older versioned functions in cuda.h
111
+ */
112
+ #if defined(__CUDA_API_VERSION_INTERNAL)
113
+ typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
114
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
115
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
116
+ typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
117
+ #endif
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif // __cplusplus
122
+
123
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h ADDED
@@ -0,0 +1,959 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDATYPEDEFS_H
51
+ #define CUDATYPEDEFS_H
52
+
53
+ #include <cuda.h>
54
+
55
+ #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
56
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
57
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
58
+ #else
59
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
60
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
61
+ #endif
62
+
63
+ #ifdef __cplusplus
64
+ extern "C" {
65
+ #endif // __cplusplus
66
+
67
+ /*
68
+ * Macros for the latest version for each driver function in cuda.h
69
+ */
70
+ #define PFN_cuGetErrorString PFN_cuGetErrorString_v6000
71
+ #define PFN_cuGetErrorName PFN_cuGetErrorName_v6000
72
+ #define PFN_cuInit PFN_cuInit_v2000
73
+ #define PFN_cuDriverGetVersion PFN_cuDriverGetVersion_v2020
74
+ #define PFN_cuDeviceGet PFN_cuDeviceGet_v2000
75
+ #define PFN_cuDeviceGetCount PFN_cuDeviceGetCount_v2000
76
+ #define PFN_cuDeviceGetName PFN_cuDeviceGetName_v2000
77
+ #define PFN_cuDeviceGetUuid PFN_cuDeviceGetUuid_v11040
78
+ #define PFN_cuDeviceGetLuid PFN_cuDeviceGetLuid_v10000
79
+ #define PFN_cuDeviceTotalMem PFN_cuDeviceTotalMem_v3020
80
+ #define PFN_cuDeviceGetTexture1DLinearMaxWidth PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
81
+ #define PFN_cuDeviceGetAttribute PFN_cuDeviceGetAttribute_v2000
82
+ #define PFN_cuDeviceGetNvSciSyncAttributes PFN_cuDeviceGetNvSciSyncAttributes_v10020
83
+ #define PFN_cuDeviceSetMemPool PFN_cuDeviceSetMemPool_v11020
84
+ #define PFN_cuDeviceGetMemPool PFN_cuDeviceGetMemPool_v11020
85
+ #define PFN_cuDeviceGetDefaultMemPool PFN_cuDeviceGetDefaultMemPool_v11020
86
+ #define PFN_cuDeviceGetProperties PFN_cuDeviceGetProperties_v2000
87
+ #define PFN_cuDeviceComputeCapability PFN_cuDeviceComputeCapability_v2000
88
+ #define PFN_cuDevicePrimaryCtxRetain PFN_cuDevicePrimaryCtxRetain_v7000
89
+ #define PFN_cuDevicePrimaryCtxRelease PFN_cuDevicePrimaryCtxRelease_v11000
90
+ #define PFN_cuDevicePrimaryCtxSetFlags PFN_cuDevicePrimaryCtxSetFlags_v11000
91
+ #define PFN_cuDevicePrimaryCtxGetState PFN_cuDevicePrimaryCtxGetState_v7000
92
+ #define PFN_cuDevicePrimaryCtxReset PFN_cuDevicePrimaryCtxReset_v11000
93
+ #define PFN_cuDeviceGetExecAffinitySupport PFN_cuDeviceGetExecAffinitySupport_v11040
94
+ #define PFN_cuCtxCreate PFN_cuCtxCreate_v11040
95
+ #define PFN_cuCtxDestroy PFN_cuCtxDestroy_v4000
96
+ #define PFN_cuCtxPushCurrent PFN_cuCtxPushCurrent_v4000
97
+ #define PFN_cuCtxPopCurrent PFN_cuCtxPopCurrent_v4000
98
+ #define PFN_cuCtxSetCurrent PFN_cuCtxSetCurrent_v4000
99
+ #define PFN_cuCtxGetCurrent PFN_cuCtxGetCurrent_v4000
100
+ #define PFN_cuCtxGetDevice PFN_cuCtxGetDevice_v2000
101
+ #define PFN_cuCtxGetFlags PFN_cuCtxGetFlags_v7000
102
+ #define PFN_cuCtxSynchronize PFN_cuCtxSynchronize_v2000
103
+ #define PFN_cuCtxSetLimit PFN_cuCtxSetLimit_v3010
104
+ #define PFN_cuCtxGetLimit PFN_cuCtxGetLimit_v3010
105
+ #define PFN_cuCtxGetCacheConfig PFN_cuCtxGetCacheConfig_v3020
106
+ #define PFN_cuCtxSetCacheConfig PFN_cuCtxSetCacheConfig_v3020
107
+ #define PFN_cuCtxGetSharedMemConfig PFN_cuCtxGetSharedMemConfig_v4020
108
+ #define PFN_cuCtxSetSharedMemConfig PFN_cuCtxSetSharedMemConfig_v4020
109
+ #define PFN_cuCtxGetApiVersion PFN_cuCtxGetApiVersion_v3020
110
+ #define PFN_cuCtxGetStreamPriorityRange PFN_cuCtxGetStreamPriorityRange_v5050
111
+ #define PFN_cuCtxResetPersistingL2Cache PFN_cuCtxResetPersistingL2Cache_v11000
112
+ #define PFN_cuCtxAttach PFN_cuCtxAttach_v2000
113
+ #define PFN_cuCtxDetach PFN_cuCtxDetach_v2000
114
+ #define PFN_cuCtxGetExecAffinity PFN_cuCtxGetExecAffinity_v11040
115
+ #define PFN_cuModuleLoad PFN_cuModuleLoad_v2000
116
+ #define PFN_cuModuleLoadData PFN_cuModuleLoadData_v2000
117
+ #define PFN_cuModuleLoadDataEx PFN_cuModuleLoadDataEx_v2010
118
+ #define PFN_cuModuleLoadFatBinary PFN_cuModuleLoadFatBinary_v2000
119
+ #define PFN_cuModuleUnload PFN_cuModuleUnload_v2000
120
+ #define PFN_cuModuleGetFunction PFN_cuModuleGetFunction_v2000
121
+ #define PFN_cuModuleGetGlobal PFN_cuModuleGetGlobal_v3020
122
+ #define PFN_cuModuleGetTexRef PFN_cuModuleGetTexRef_v2000
123
+ #define PFN_cuModuleGetSurfRef PFN_cuModuleGetSurfRef_v3000
124
+ #define PFN_cuLinkCreate PFN_cuLinkCreate_v6050
125
+ #define PFN_cuLinkAddData PFN_cuLinkAddData_v6050
126
+ #define PFN_cuLinkAddFile PFN_cuLinkAddFile_v6050
127
+ #define PFN_cuLinkComplete PFN_cuLinkComplete_v5050
128
+ #define PFN_cuLinkDestroy PFN_cuLinkDestroy_v5050
129
+ #define PFN_cuMemGetInfo PFN_cuMemGetInfo_v3020
130
+ #define PFN_cuMemAlloc PFN_cuMemAlloc_v3020
131
+ #define PFN_cuMemAllocPitch PFN_cuMemAllocPitch_v3020
132
+ #define PFN_cuMemFree PFN_cuMemFree_v3020
133
+ #define PFN_cuMemGetAddressRange PFN_cuMemGetAddressRange_v3020
134
+ #define PFN_cuMemAllocHost PFN_cuMemAllocHost_v3020
135
+ #define PFN_cuMemFreeHost PFN_cuMemFreeHost_v2000
136
+ #define PFN_cuMemHostAlloc PFN_cuMemHostAlloc_v2020
137
+ #define PFN_cuMemHostGetDevicePointer PFN_cuMemHostGetDevicePointer_v3020
138
+ #define PFN_cuMemHostGetFlags PFN_cuMemHostGetFlags_v2030
139
+ #define PFN_cuMemAllocManaged PFN_cuMemAllocManaged_v6000
140
+ #define PFN_cuDeviceGetByPCIBusId PFN_cuDeviceGetByPCIBusId_v4010
141
+ #define PFN_cuDeviceGetPCIBusId PFN_cuDeviceGetPCIBusId_v4010
142
+ #define PFN_cuIpcGetEventHandle PFN_cuIpcGetEventHandle_v4010
143
+ #define PFN_cuIpcOpenEventHandle PFN_cuIpcOpenEventHandle_v4010
144
+ #define PFN_cuIpcGetMemHandle PFN_cuIpcGetMemHandle_v4010
145
+ #define PFN_cuIpcOpenMemHandle PFN_cuIpcOpenMemHandle_v11000
146
+ #define PFN_cuIpcCloseMemHandle PFN_cuIpcCloseMemHandle_v4010
147
+ #define PFN_cuMemHostRegister PFN_cuMemHostRegister_v6050
148
+ #define PFN_cuMemHostUnregister PFN_cuMemHostUnregister_v4000
149
+ #define PFN_cuMemcpy __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
150
+ #define PFN_cuMemcpyPeer __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
151
+ #define PFN_cuMemcpyHtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
152
+ #define PFN_cuMemcpyDtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
153
+ #define PFN_cuMemcpyDtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
154
+ #define PFN_cuMemcpyDtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
155
+ #define PFN_cuMemcpyAtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
156
+ #define PFN_cuMemcpyHtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
157
+ #define PFN_cuMemcpyAtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
158
+ #define PFN_cuMemcpyAtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
159
+ #define PFN_cuMemcpy2D __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
160
+ #define PFN_cuMemcpy2DUnaligned __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
161
+ #define PFN_cuMemcpy3D __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
162
+ #define PFN_cuMemcpy3DPeer __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
163
+ #define PFN_cuMemcpyAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
164
+ #define PFN_cuMemcpyPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
165
+ #define PFN_cuMemcpyHtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
166
+ #define PFN_cuMemcpyDtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
167
+ #define PFN_cuMemcpyDtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
168
+ #define PFN_cuMemcpyHtoAAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
169
+ #define PFN_cuMemcpyAtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
170
+ #define PFN_cuMemcpy2DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
171
+ #define PFN_cuMemcpy3DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
172
+ #define PFN_cuMemcpy3DPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
173
+ #define PFN_cuMemsetD8 __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
174
+ #define PFN_cuMemsetD16 __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
175
+ #define PFN_cuMemsetD32 __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
176
+ #define PFN_cuMemsetD2D8 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
177
+ #define PFN_cuMemsetD2D16 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
178
+ #define PFN_cuMemsetD2D32 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
179
+ #define PFN_cuMemsetD8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
180
+ #define PFN_cuMemsetD16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
181
+ #define PFN_cuMemsetD32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
182
+ #define PFN_cuMemsetD2D8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
183
+ #define PFN_cuMemsetD2D16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
184
+ #define PFN_cuMemsetD2D32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
185
+ #define PFN_cuArrayCreate PFN_cuArrayCreate_v3020
186
+ #define PFN_cuArrayGetDescriptor PFN_cuArrayGetDescriptor_v3020
187
+ #define PFN_cuArrayGetSparseProperties PFN_cuArrayGetSparseProperties_v11010
188
+ #define PFN_cuMipmappedArrayGetSparseProperties PFN_cuMipmappedArrayGetSparseProperties_v11010
189
+ #define PFN_cuArrayGetMemoryRequirements PFN_cuArrayGetMemoryRequirements_v11060
190
+ #define PFN_cuMipmappedArrayGetMemoryRequirements PFN_cuMipmappedArrayGetMemoryRequirements_v11060
191
+ #define PFN_cuArrayGetPlane PFN_cuArrayGetPlane_v11020
192
+ #define PFN_cuArrayDestroy PFN_cuArrayDestroy_v2000
193
+ #define PFN_cuArray3DCreate PFN_cuArray3DCreate_v3020
194
+ #define PFN_cuArray3DGetDescriptor PFN_cuArray3DGetDescriptor_v3020
195
+ #define PFN_cuMipmappedArrayCreate PFN_cuMipmappedArrayCreate_v5000
196
+ #define PFN_cuMipmappedArrayGetLevel PFN_cuMipmappedArrayGetLevel_v5000
197
+ #define PFN_cuMipmappedArrayDestroy PFN_cuMipmappedArrayDestroy_v5000
198
+ #define PFN_cuMemAddressReserve PFN_cuMemAddressReserve_v10020
199
+ #define PFN_cuMemAddressFree PFN_cuMemAddressFree_v10020
200
+ #define PFN_cuMemCreate PFN_cuMemCreate_v10020
201
+ #define PFN_cuMemRelease PFN_cuMemRelease_v10020
202
+ #define PFN_cuMemMap PFN_cuMemMap_v10020
203
+ #define PFN_cuMemMapArrayAsync __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
204
+ #define PFN_cuMemUnmap PFN_cuMemUnmap_v10020
205
+ #define PFN_cuMemSetAccess PFN_cuMemSetAccess_v10020
206
+ #define PFN_cuMemGetAccess PFN_cuMemGetAccess_v10020
207
+ #define PFN_cuMemExportToShareableHandle PFN_cuMemExportToShareableHandle_v10020
208
+ #define PFN_cuMemImportFromShareableHandle PFN_cuMemImportFromShareableHandle_v10020
209
+ #define PFN_cuMemGetAllocationGranularity PFN_cuMemGetAllocationGranularity_v10020
210
+ #define PFN_cuMemGetAllocationPropertiesFromHandle PFN_cuMemGetAllocationPropertiesFromHandle_v10020
211
+ #define PFN_cuMemRetainAllocationHandle PFN_cuMemRetainAllocationHandle_v11000
212
+ #define PFN_cuMemFreeAsync __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
213
+ #define PFN_cuMemAllocAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
214
+ #define PFN_cuMemPoolTrimTo PFN_cuMemPoolTrimTo_v11020
215
+ #define PFN_cuMemPoolSetAttribute PFN_cuMemPoolSetAttribute_v11020
216
+ #define PFN_cuMemPoolGetAttribute PFN_cuMemPoolGetAttribute_v11020
217
+ #define PFN_cuMemPoolSetAccess PFN_cuMemPoolSetAccess_v11020
218
+ #define PFN_cuMemPoolGetAccess PFN_cuMemPoolGetAccess_v11020
219
+ #define PFN_cuMemPoolCreate PFN_cuMemPoolCreate_v11020
220
+ #define PFN_cuMemPoolDestroy PFN_cuMemPoolDestroy_v11020
221
+ #define PFN_cuMemAllocFromPoolAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
222
+ #define PFN_cuMemPoolExportToShareableHandle PFN_cuMemPoolExportToShareableHandle_v11020
223
+ #define PFN_cuMemPoolImportFromShareableHandle PFN_cuMemPoolImportFromShareableHandle_v11020
224
+ #define PFN_cuMemPoolExportPointer PFN_cuMemPoolExportPointer_v11020
225
+ #define PFN_cuMemPoolImportPointer PFN_cuMemPoolImportPointer_v11020
226
+ #define PFN_cuPointerGetAttribute PFN_cuPointerGetAttribute_v4000
227
+ #define PFN_cuMemPrefetchAsync __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
228
+ #define PFN_cuMemAdvise PFN_cuMemAdvise_v8000
229
+ #define PFN_cuMemRangeGetAttribute PFN_cuMemRangeGetAttribute_v8000
230
+ #define PFN_cuMemRangeGetAttributes PFN_cuMemRangeGetAttributes_v8000
231
+ #define PFN_cuPointerSetAttribute PFN_cuPointerSetAttribute_v6000
232
+ #define PFN_cuPointerGetAttributes PFN_cuPointerGetAttributes_v7000
233
+ #define PFN_cuStreamCreate PFN_cuStreamCreate_v2000
234
+ #define PFN_cuStreamCreateWithPriority PFN_cuStreamCreateWithPriority_v5050
235
+ #define PFN_cuStreamGetPriority __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
236
+ #define PFN_cuStreamGetFlags __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
237
+ #define PFN_cuStreamGetCtx __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
238
+ #define PFN_cuStreamWaitEvent __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
239
+ #define PFN_cuStreamAddCallback __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
240
+ #define PFN_cuStreamBeginCapture __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
241
+ #define PFN_cuThreadExchangeStreamCaptureMode PFN_cuThreadExchangeStreamCaptureMode_v10010
242
+ #define PFN_cuStreamEndCapture __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
243
+ #define PFN_cuStreamIsCapturing __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
244
+ #define PFN_cuStreamGetCaptureInfo __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
245
+ #define PFN_cuStreamGetCaptureInfo_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
246
+ #define PFN_cuStreamUpdateCaptureDependencies __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
247
+ #define PFN_cuStreamAttachMemAsync __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
248
+ #define PFN_cuStreamQuery __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
249
+ #define PFN_cuStreamSynchronize __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
250
+ #define PFN_cuStreamDestroy PFN_cuStreamDestroy_v4000
251
+ #define PFN_cuStreamCopyAttributes __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
252
+ #define PFN_cuStreamGetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
253
+ #define PFN_cuStreamSetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
254
+ #define PFN_cuEventCreate PFN_cuEventCreate_v2000
255
+ #define PFN_cuEventRecord __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
256
+ #define PFN_cuEventRecordWithFlags __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
257
+ #define PFN_cuEventQuery PFN_cuEventQuery_v2000
258
+ #define PFN_cuEventSynchronize PFN_cuEventSynchronize_v2000
259
+ #define PFN_cuEventDestroy PFN_cuEventDestroy_v4000
260
+ #define PFN_cuEventElapsedTime PFN_cuEventElapsedTime_v2000
261
+ #define PFN_cuImportExternalMemory PFN_cuImportExternalMemory_v10000
262
+ #define PFN_cuExternalMemoryGetMappedBuffer PFN_cuExternalMemoryGetMappedBuffer_v10000
263
+ #define PFN_cuExternalMemoryGetMappedMipmappedArray PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
264
+ #define PFN_cuDestroyExternalMemory PFN_cuDestroyExternalMemory_v10000
265
+ #define PFN_cuImportExternalSemaphore PFN_cuImportExternalSemaphore_v10000
266
+ #define PFN_cuSignalExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
267
+ #define PFN_cuWaitExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
268
+ #define PFN_cuDestroyExternalSemaphore PFN_cuDestroyExternalSemaphore_v10000
269
+ #define PFN_cuStreamWaitValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
270
+ #define PFN_cuStreamWaitValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
271
+ #define PFN_cuStreamWriteValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
272
+ #define PFN_cuStreamWriteValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
273
+ #define PFN_cuStreamBatchMemOp __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
274
+ #define PFN_cuStreamWaitValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070)
275
+ #define PFN_cuStreamWaitValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070)
276
+ #define PFN_cuStreamWriteValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070)
277
+ #define PFN_cuStreamWriteValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070)
278
+ #define PFN_cuStreamBatchMemOp_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070)
279
+ #define PFN_cuFuncGetAttribute PFN_cuFuncGetAttribute_v2020
280
+ #define PFN_cuFuncSetAttribute PFN_cuFuncSetAttribute_v9000
281
+ #define PFN_cuFuncSetCacheConfig PFN_cuFuncSetCacheConfig_v3000
282
+ #define PFN_cuFuncSetSharedMemConfig PFN_cuFuncSetSharedMemConfig_v4020
283
+ #define PFN_cuLaunchKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
284
+ #define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060)
285
+ #define PFN_cuLaunchCooperativeKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
286
+ #define PFN_cuLaunchCooperativeKernelMultiDevice PFN_cuLaunchCooperativeKernelMultiDevice_v9000
287
+ #define PFN_cuLaunchHostFunc __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
288
+ #define PFN_cuFuncSetBlockShape PFN_cuFuncSetBlockShape_v2000
289
+ #define PFN_cuFuncSetSharedSize PFN_cuFuncSetSharedSize_v2000
290
+ #define PFN_cuParamSetSize PFN_cuParamSetSize_v2000
291
+ #define PFN_cuParamSeti PFN_cuParamSeti_v2000
292
+ #define PFN_cuParamSetf PFN_cuParamSetf_v2000
293
+ #define PFN_cuParamSetv PFN_cuParamSetv_v2000
294
+ #define PFN_cuLaunch PFN_cuLaunch_v2000
295
+ #define PFN_cuLaunchGrid PFN_cuLaunchGrid_v2000
296
+ #define PFN_cuLaunchGridAsync PFN_cuLaunchGridAsync_v2000
297
+ #define PFN_cuParamSetTexRef PFN_cuParamSetTexRef_v2000
298
+ #define PFN_cuGraphCreate PFN_cuGraphCreate_v10000
299
+ #define PFN_cuGraphAddKernelNode PFN_cuGraphAddKernelNode_v10000
300
+ #define PFN_cuGraphKernelNodeGetParams PFN_cuGraphKernelNodeGetParams_v10000
301
+ #define PFN_cuGraphKernelNodeSetParams PFN_cuGraphKernelNodeSetParams_v10000
302
+ #define PFN_cuGraphAddMemcpyNode PFN_cuGraphAddMemcpyNode_v10000
303
+ #define PFN_cuGraphMemcpyNodeGetParams PFN_cuGraphMemcpyNodeGetParams_v10000
304
+ #define PFN_cuGraphMemcpyNodeSetParams PFN_cuGraphMemcpyNodeSetParams_v10000
305
+ #define PFN_cuGraphAddMemsetNode PFN_cuGraphAddMemsetNode_v10000
306
+ #define PFN_cuGraphMemsetNodeGetParams PFN_cuGraphMemsetNodeGetParams_v10000
307
+ #define PFN_cuGraphMemsetNodeSetParams PFN_cuGraphMemsetNodeSetParams_v10000
308
+ #define PFN_cuGraphAddHostNode PFN_cuGraphAddHostNode_v10000
309
+ #define PFN_cuGraphHostNodeGetParams PFN_cuGraphHostNodeGetParams_v10000
310
+ #define PFN_cuGraphHostNodeSetParams PFN_cuGraphHostNodeSetParams_v10000
311
+ #define PFN_cuGraphAddChildGraphNode PFN_cuGraphAddChildGraphNode_v10000
312
+ #define PFN_cuGraphChildGraphNodeGetGraph PFN_cuGraphChildGraphNodeGetGraph_v10000
313
+ #define PFN_cuGraphAddEmptyNode PFN_cuGraphAddEmptyNode_v10000
314
+ #define PFN_cuGraphAddEventRecordNode PFN_cuGraphAddEventRecordNode_v11010
315
+ #define PFN_cuGraphEventRecordNodeGetEvent PFN_cuGraphEventRecordNodeGetEvent_v11010
316
+ #define PFN_cuGraphEventRecordNodeSetEvent PFN_cuGraphEventRecordNodeSetEvent_v11010
317
+ #define PFN_cuGraphAddEventWaitNode PFN_cuGraphAddEventWaitNode_v11010
318
+ #define PFN_cuGraphEventWaitNodeGetEvent PFN_cuGraphEventWaitNodeGetEvent_v11010
319
+ #define PFN_cuGraphEventWaitNodeSetEvent PFN_cuGraphEventWaitNodeSetEvent_v11010
320
+ #define PFN_cuGraphAddExternalSemaphoresSignalNode PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
321
+ #define PFN_cuGraphExternalSemaphoresSignalNodeGetParams PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
322
+ #define PFN_cuGraphExternalSemaphoresSignalNodeSetParams PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
323
+ #define PFN_cuGraphAddExternalSemaphoresWaitNode PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
324
+ #define PFN_cuGraphExternalSemaphoresWaitNodeGetParams PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
325
+ #define PFN_cuGraphExternalSemaphoresWaitNodeSetParams PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
326
+ #define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070
327
+ #define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070
328
+ #define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070
329
+ #define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070
330
+ #define PFN_cuGraphClone PFN_cuGraphClone_v10000
331
+ #define PFN_cuGraphNodeFindInClone PFN_cuGraphNodeFindInClone_v10000
332
+ #define PFN_cuGraphNodeGetType PFN_cuGraphNodeGetType_v10000
333
+ #define PFN_cuGraphGetNodes PFN_cuGraphGetNodes_v10000
334
+ #define PFN_cuGraphGetRootNodes PFN_cuGraphGetRootNodes_v10000
335
+ #define PFN_cuGraphGetEdges PFN_cuGraphGetEdges_v10000
336
+ #define PFN_cuGraphNodeGetDependencies PFN_cuGraphNodeGetDependencies_v10000
337
+ #define PFN_cuGraphNodeGetDependentNodes PFN_cuGraphNodeGetDependentNodes_v10000
338
+ #define PFN_cuGraphAddDependencies PFN_cuGraphAddDependencies_v10000
339
+ #define PFN_cuGraphRemoveDependencies PFN_cuGraphRemoveDependencies_v10000
340
+ #define PFN_cuGraphDestroyNode PFN_cuGraphDestroyNode_v10000
341
+ #define PFN_cuGraphInstantiate PFN_cuGraphInstantiate_v11000
342
+ #define PFN_cuGraphInstantiateWithFlags PFN_cuGraphInstantiateWithFlags_v11040
343
+ #define PFN_cuGraphExecKernelNodeSetParams PFN_cuGraphExecKernelNodeSetParams_v10010
344
+ #define PFN_cuGraphExecMemcpyNodeSetParams PFN_cuGraphExecMemcpyNodeSetParams_v10020
345
+ #define PFN_cuGraphExecMemsetNodeSetParams PFN_cuGraphExecMemsetNodeSetParams_v10020
346
+ #define PFN_cuGraphExecHostNodeSetParams PFN_cuGraphExecHostNodeSetParams_v10020
347
+ #define PFN_cuGraphExecChildGraphNodeSetParams PFN_cuGraphExecChildGraphNodeSetParams_v11010
348
+ #define PFN_cuGraphExecEventRecordNodeSetEvent PFN_cuGraphExecEventRecordNodeSetEvent_v11010
349
+ #define PFN_cuGraphExecEventWaitNodeSetEvent PFN_cuGraphExecEventWaitNodeSetEvent_v11010
350
+ #define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
351
+ #define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
352
+ #define PFN_cuGraphUpload __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
353
+ #define PFN_cuGraphLaunch __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
354
+ #define PFN_cuGraphExecDestroy PFN_cuGraphExecDestroy_v10000
355
+ #define PFN_cuGraphDestroy PFN_cuGraphDestroy_v10000
356
+ #define PFN_cuGraphExecUpdate PFN_cuGraphExecUpdate_v10020
357
+ #define PFN_cuGraphKernelNodeCopyAttributes PFN_cuGraphKernelNodeCopyAttributes_v11000
358
+ #define PFN_cuGraphKernelNodeGetAttribute PFN_cuGraphKernelNodeGetAttribute_v11000
359
+ #define PFN_cuGraphKernelNodeSetAttribute PFN_cuGraphKernelNodeSetAttribute_v11000
360
+ #define PFN_cuGraphDebugDotPrint PFN_cuGraphDebugDotPrint_v11030
361
+ #define PFN_cuGraphAddMemAllocNode PFN_cuGraphAddMemAllocNode_v11040
362
+ #define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
363
+ #define PFN_cuGraphAddMemFreeNode PFN_cuGraphAddMemFreeNode_v11040
364
+ #define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
365
+ #define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
366
+ #define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
367
+ #define PFN_cuDeviceGraphMemTrim PFN_cuDeviceGraphMemTrim_v11040
368
+ #define PFN_cuDeviceGetGraphMemAttribute PFN_cuDeviceGetGraphMemAttribute_v11040
369
+ #define PFN_cuDeviceSetGraphMemAttribute PFN_cuDeviceSetGraphMemAttribute_v11040
370
+ #define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
371
+ #define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
372
+ #define PFN_cuOccupancyMaxPotentialBlockSize PFN_cuOccupancyMaxPotentialBlockSize_v6050
373
+ #define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
374
+ #define PFN_cuOccupancyAvailableDynamicSMemPerBlock PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
375
+ #define PFN_cuOccupancyMaxPotentialClusterSize PFN_cuOccupancyMaxPotentialClusterSize_v11070
376
+ #define PFN_cuOccupancyMaxActiveClusters PFN_cuOccupancyMaxActiveClusters_v11070
377
+ #define PFN_cuTexRefSetArray PFN_cuTexRefSetArray_v2000
378
+ #define PFN_cuTexRefSetMipmappedArray PFN_cuTexRefSetMipmappedArray_v5000
379
+ #define PFN_cuTexRefSetAddress PFN_cuTexRefSetAddress_v3020
380
+ #define PFN_cuTexRefSetAddress2D PFN_cuTexRefSetAddress2D_v4010
381
+ #define PFN_cuTexRefSetFormat PFN_cuTexRefSetFormat_v2000
382
+ #define PFN_cuTexRefSetAddressMode PFN_cuTexRefSetAddressMode_v2000
383
+ #define PFN_cuTexRefSetFilterMode PFN_cuTexRefSetFilterMode_v2000
384
+ #define PFN_cuTexRefSetMipmapFilterMode PFN_cuTexRefSetMipmapFilterMode_v5000
385
+ #define PFN_cuTexRefSetMipmapLevelBias PFN_cuTexRefSetMipmapLevelBias_v5000
386
+ #define PFN_cuTexRefSetMipmapLevelClamp PFN_cuTexRefSetMipmapLevelClamp_v5000
387
+ #define PFN_cuTexRefSetMaxAnisotropy PFN_cuTexRefSetMaxAnisotropy_v5000
388
+ #define PFN_cuTexRefSetBorderColor PFN_cuTexRefSetBorderColor_v8000
389
+ #define PFN_cuTexRefSetFlags PFN_cuTexRefSetFlags_v2000
390
+ #define PFN_cuTexRefGetAddress PFN_cuTexRefGetAddress_v3020
391
+ #define PFN_cuTexRefGetArray PFN_cuTexRefGetArray_v2000
392
+ #define PFN_cuTexRefGetMipmappedArray PFN_cuTexRefGetMipmappedArray_v5000
393
+ #define PFN_cuTexRefGetAddressMode PFN_cuTexRefGetAddressMode_v2000
394
+ #define PFN_cuTexRefGetFilterMode PFN_cuTexRefGetFilterMode_v2000
395
+ #define PFN_cuTexRefGetFormat PFN_cuTexRefGetFormat_v2000
396
+ #define PFN_cuTexRefGetMipmapFilterMode PFN_cuTexRefGetMipmapFilterMode_v5000
397
+ #define PFN_cuTexRefGetMipmapLevelBias PFN_cuTexRefGetMipmapLevelBias_v5000
398
+ #define PFN_cuTexRefGetMipmapLevelClamp PFN_cuTexRefGetMipmapLevelClamp_v5000
399
+ #define PFN_cuTexRefGetMaxAnisotropy PFN_cuTexRefGetMaxAnisotropy_v5000
400
+ #define PFN_cuTexRefGetBorderColor PFN_cuTexRefGetBorderColor_v8000
401
+ #define PFN_cuTexRefGetFlags PFN_cuTexRefGetFlags_v2000
402
+ #define PFN_cuTexRefCreate PFN_cuTexRefCreate_v2000
403
+ #define PFN_cuTexRefDestroy PFN_cuTexRefDestroy_v2000
404
+ #define PFN_cuSurfRefSetArray PFN_cuSurfRefSetArray_v3000
405
+ #define PFN_cuSurfRefGetArray PFN_cuSurfRefGetArray_v3000
406
+ #define PFN_cuTexObjectCreate PFN_cuTexObjectCreate_v5000
407
+ #define PFN_cuTexObjectDestroy PFN_cuTexObjectDestroy_v5000
408
+ #define PFN_cuTexObjectGetResourceDesc PFN_cuTexObjectGetResourceDesc_v5000
409
+ #define PFN_cuTexObjectGetTextureDesc PFN_cuTexObjectGetTextureDesc_v5000
410
+ #define PFN_cuTexObjectGetResourceViewDesc PFN_cuTexObjectGetResourceViewDesc_v5000
411
+ #define PFN_cuSurfObjectCreate PFN_cuSurfObjectCreate_v5000
412
+ #define PFN_cuSurfObjectDestroy PFN_cuSurfObjectDestroy_v5000
413
+ #define PFN_cuSurfObjectGetResourceDesc PFN_cuSurfObjectGetResourceDesc_v5000
414
+ #define PFN_cuDeviceCanAccessPeer PFN_cuDeviceCanAccessPeer_v4000
415
+ #define PFN_cuCtxEnablePeerAccess PFN_cuCtxEnablePeerAccess_v4000
416
+ #define PFN_cuCtxDisablePeerAccess PFN_cuCtxDisablePeerAccess_v4000
417
+ #define PFN_cuDeviceGetP2PAttribute PFN_cuDeviceGetP2PAttribute_v8000
418
+ #define PFN_cuGraphicsUnregisterResource PFN_cuGraphicsUnregisterResource_v3000
419
+ #define PFN_cuGraphicsSubResourceGetMappedArray PFN_cuGraphicsSubResourceGetMappedArray_v3000
420
+ #define PFN_cuGraphicsResourceGetMappedMipmappedArray PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
421
+ #define PFN_cuGraphicsResourceGetMappedPointer PFN_cuGraphicsResourceGetMappedPointer_v3020
422
+ #define PFN_cuGraphicsResourceSetMapFlags PFN_cuGraphicsResourceSetMapFlags_v6050
423
+ #define PFN_cuGraphicsMapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000)
424
+ #define PFN_cuGraphicsUnmapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
425
+ #define PFN_cuGetExportTable PFN_cuGetExportTable_v3000
426
+ #define PFN_cuFuncGetModule PFN_cuFuncGetModule_v11000
427
+ #define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
428
+ #define PFN_cuGetProcAddress PFN_cuGetProcAddress_v11030
429
+ #define PFN_cuUserObjectCreate PFN_cuUserObjectCreate_v11030
430
+ #define PFN_cuUserObjectRetain PFN_cuUserObjectRetain_v11030
431
+ #define PFN_cuUserObjectRelease PFN_cuUserObjectRelease_v11030
432
+ #define PFN_cuGraphRetainUserObject PFN_cuGraphRetainUserObject_v11030
433
+ #define PFN_cuGraphReleaseUserObject PFN_cuGraphReleaseUserObject_v11030
434
+ #define PFN_cuModuleGetLoadingMode PFN_cuModuleGetLoadingMode_v11070
435
+ #define PFN_cuMemGetHandleForAddressRange PFN_cuMemGetHandleForAddressRange_v11070
436
+
437
+ /*
438
+ * Type definitions for functions defined in cuda.h
439
+ */
440
+ typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr);
441
+ typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
442
+ typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
443
+ typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
444
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
445
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
446
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
447
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
448
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev);
449
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
450
+ typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
451
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
452
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
453
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
454
+ typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
455
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
456
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
457
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
458
+ typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
459
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
460
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
461
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
462
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
463
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
464
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
465
+ typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
466
+ typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
467
+ typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
468
+ typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
469
+ typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
470
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
471
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
472
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
473
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
474
+ typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
475
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
476
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
477
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
478
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
479
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
480
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
481
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
482
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
483
+ typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
484
+ typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
485
+ typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
486
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
487
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname);
488
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
489
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
490
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
491
+ typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
492
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
493
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
494
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
495
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
496
+ typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
497
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
498
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
499
+ typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
500
+ typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
501
+ typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total);
502
+ typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
503
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
504
+ typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
505
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
506
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
507
+ typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
508
+ typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
509
+ typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
510
+ typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
511
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
512
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
513
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
514
+ typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
515
+ typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
516
+ typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
517
+ typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
518
+ typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
519
+ typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
520
+ typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
521
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
522
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
523
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
524
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
525
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
526
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
527
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
528
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
529
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
530
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
531
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
532
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
533
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
534
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
535
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
536
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
537
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
538
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
539
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
540
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
541
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
542
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
543
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
544
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
545
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
546
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
547
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
548
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
549
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
550
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
551
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
552
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
553
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
554
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
555
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
556
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
557
+ typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray);
558
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
559
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
560
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
561
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
562
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
563
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
564
+ typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
565
+ typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
566
+ typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
567
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
568
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
569
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
570
+ typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags);
571
+ typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
572
+ typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
573
+ typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
574
+ typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
575
+ typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
576
+ typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
577
+ typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
578
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
579
+ typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
580
+ typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
581
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
582
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
583
+ typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
584
+ typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream);
585
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
586
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
587
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
588
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
589
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
590
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
591
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
592
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
593
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
594
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
595
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
596
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
597
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
598
+ typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
599
+ typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
600
+ typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
601
+ typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
602
+ typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
603
+ typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
604
+ typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
605
+ typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags);
606
+ typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
607
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
608
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
609
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
610
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
611
+ typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
612
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
613
+ typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
614
+ typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
615
+ typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
616
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
617
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
618
+ typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
619
+ typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
620
+ typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
621
+ typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
622
+ typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
623
+ typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
624
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
625
+ typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
626
+ typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
627
+ typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
628
+ typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
629
+ typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
630
+ typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
631
+ typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
632
+ typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
633
+ typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
634
+ typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
635
+ typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
636
+ typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
637
+ typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
638
+ typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
639
+ typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
640
+ typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
641
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
642
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
643
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
644
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
645
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
646
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
647
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
648
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
649
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
650
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
651
+ typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
652
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
653
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
654
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
655
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
656
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
657
+ typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
658
+ typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags);
659
+ typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
660
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z);
661
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
662
+ typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
663
+ typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value);
664
+ typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
665
+ typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
666
+ typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f);
667
+ typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
668
+ typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream);
669
+ typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
670
+ typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags);
671
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
672
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
673
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
674
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
675
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
676
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
677
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
678
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
679
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
680
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
681
+ typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
682
+ typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
683
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
684
+ typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
685
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
686
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
687
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
688
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
689
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
690
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
691
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
692
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
693
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
694
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
695
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
696
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
697
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
698
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
699
+ typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
700
+ typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
701
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
702
+ typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph);
703
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
704
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
705
+ typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
706
+ typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
707
+ typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
708
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
709
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
710
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
711
+ typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
712
+ typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
713
+ typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
714
+ typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
715
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
716
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
717
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
718
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
719
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
720
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
721
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
722
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
723
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
724
+ typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
725
+ typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
726
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
727
+ typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
728
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
729
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src);
730
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
731
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
732
+ typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
733
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
734
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
735
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
736
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
737
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
738
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
739
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
740
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
741
+ typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
742
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
743
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
744
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
745
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
746
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
747
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
748
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config);
749
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
750
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
751
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes);
752
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
753
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
754
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
755
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
756
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
757
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
758
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
759
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
760
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor);
761
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
762
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
763
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
764
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
765
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
766
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
767
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
768
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
769
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
770
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
771
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
772
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
773
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
774
+ typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
775
+ typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
776
+ typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
777
+ typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
778
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc);
779
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
780
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
781
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
782
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
783
+ typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc);
784
+ typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
785
+ typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
786
+ typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev);
787
+ typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
788
+ typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
789
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
790
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource);
791
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
792
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
793
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
794
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
795
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
796
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
797
+ typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId);
798
+ typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
799
+ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
800
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
801
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
802
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
803
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
804
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
805
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
806
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
807
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
808
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
809
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
810
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
811
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
812
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
813
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
814
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
815
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
816
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
817
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
818
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
819
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
820
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
821
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
822
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
823
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
824
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
825
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
826
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
827
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
828
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
829
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
830
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
831
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
832
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
833
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
834
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
835
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
836
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority);
837
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
838
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
839
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
840
+ typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
841
+ typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
842
+ typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
843
+ typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
844
+ typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
845
+ typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags);
846
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
847
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
848
+ typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
849
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
850
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
851
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
852
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
853
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
854
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
855
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
856
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
857
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
858
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
859
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
860
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
861
+ typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
862
+ typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
863
+ typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
864
+ typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
865
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
866
+ typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
867
+ typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
868
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
869
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
870
+ typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
871
+ typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
872
+ typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
873
+ typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
874
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
875
+ typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
876
+ typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
877
+ typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
878
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
879
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
880
+ typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
881
+ typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
882
+ typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
883
+ typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
884
+ typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
885
+ typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
886
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode);
887
+ typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
888
+
889
+ /*
890
+ * Type definitions for older versioned functions in cuda.h
891
+ */
892
+ #if defined(__CUDA_API_VERSION_INTERNAL)
893
+ typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
894
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
895
+ typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
896
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
897
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
898
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
899
+ typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
900
+ typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
901
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
902
+ typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
903
+ typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
904
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
905
+ typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
906
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
907
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
908
+ typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
909
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
910
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
911
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
912
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
913
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
914
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
915
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
916
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
917
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
918
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
919
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
920
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
921
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
922
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
923
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
924
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
925
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
926
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
927
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
928
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
929
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
930
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
931
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
932
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
933
+ typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
934
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
935
+ typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
936
+ typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
937
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
938
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
939
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
940
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
941
+ typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
942
+ typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
943
+ typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
944
+ typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
945
+ typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
946
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
947
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
948
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
949
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
950
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
951
+ typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
952
+ typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
953
+ #endif
954
+
955
+ #ifdef __cplusplus
956
+ }
957
+ #endif // __cplusplus
958
+
959
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAVDPAUTYPEDEFS_H
51
+ #define CUDAVDPAUTYPEDEFS_H
52
+
53
+ // Dependent includes for cudavdpau.h
54
+ #include <vdpau/vdpau.h>
55
+
56
+ #include <cudaVDPAU.h>
57
+
58
+ #ifdef __cplusplus
59
+ extern "C" {
60
+ #endif // __cplusplus
61
+
62
+ /*
63
+ * Macros for the latest version for each driver function in cudaVDPAU.h
64
+ */
65
+ #define PFN_cuVDPAUGetDevice PFN_cuVDPAUGetDevice_v3010
66
+ #define PFN_cuVDPAUCtxCreate PFN_cuVDPAUCtxCreate_v3020
67
+ #define PFN_cuGraphicsVDPAURegisterVideoSurface PFN_cuGraphicsVDPAURegisterVideoSurface_v3010
68
+ #define PFN_cuGraphicsVDPAURegisterOutputSurface PFN_cuGraphicsVDPAURegisterOutputSurface_v3010
69
+
70
+
71
+ /**
72
+ * Type definitions for functions defined in cudaVDPAU.h
73
+ */
74
+ typedef CUresult (CUDAAPI *PFN_cuVDPAUGetDevice_v3010)(CUdevice_v1 *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
75
+ typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3020)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
76
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterVideoSurface_v3010)(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
77
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterOutputSurface_v3010)(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
78
+
79
+ /*
80
+ * Type definitions for older versioned functions in cudaVDPAU.h
81
+ */
82
+ #if defined(__CUDA_API_VERSION_INTERNAL)
83
+ typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3010)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
84
+ #endif
85
+
86
+ #ifdef __cplusplus
87
+ }
88
+ #endif // __cplusplus
89
+
90
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp ADDED
@@ -0,0 +1,2614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_FP16_HPP__)
51
+ #define __CUDA_FP16_HPP__
52
+
53
+ #if !defined(__CUDA_FP16_H__)
54
+ #error "Do not include this file directly. Instead, include cuda_fp16.h."
55
+ #endif
56
+
57
+ #if !defined(_MSC_VER) && __cplusplus >= 201103L
58
+ # define __CPP_VERSION_AT_LEAST_11_FP16
59
+ #elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
60
+ # define __CPP_VERSION_AT_LEAST_11_FP16
61
+ #endif
62
+
63
+ /* C++11 header for std::move.
64
+ * In RTC mode, std::move is provided implicitly; don't include the header
65
+ */
66
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
67
+ #include <utility>
68
+ #endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
69
+
70
+ /* C++ header for std::memcpy (used for type punning in host-side implementations).
71
+ * When compiling as a CUDA source file memcpy is provided implicitly.
72
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
73
+ */
74
+ #if defined(__cplusplus) && !defined(__CUDACC__)
75
+ #include <cstring>
76
+ #endif /* defined(__cplusplus) && !defined(__CUDACC__) */
77
+
78
+
79
+ /* Set up function decorations */
80
+ #if defined(__CUDACC__)
81
+ #define __CUDA_FP16_DECL__ static __device__ __inline__
82
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
83
+ #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
84
+ #define __CUDA_HOSTDEVICE__ __host__ __device__
85
+ #else /* !defined(__CUDACC__) */
86
+ #if defined(__GNUC__)
87
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
88
+ #else
89
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static
90
+ #endif /* defined(__GNUC__) */
91
+ #define __CUDA_HOSTDEVICE__
92
+ #endif /* defined(__CUDACC_) */
93
+
94
+ /* Set up structure-alignment attribute */
95
+ #if defined(__CUDACC__)
96
+ #define __CUDA_ALIGN__(align) __align__(align)
97
+ #else
98
+ /* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
99
+ #if __cplusplus >= 201103L
100
+ #define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
101
+ #else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
102
+ #if defined(__GNUC__)
103
+ #define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
104
+ #elif defined(_MSC_VER)
105
+ #define __CUDA_ALIGN__(n) __declspec(align(n))
106
+ #else
107
+ #define __CUDA_ALIGN__(n)
108
+ #endif /* defined(__GNUC__) */
109
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
110
+ #endif /* defined(__CUDACC__) */
111
+
112
+ /* Macros to allow half & half2 to be used by inline assembly */
113
+ #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
114
+ #define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
115
+ #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
116
+ #define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
117
+
118
+ /* Macros for half & half2 binary arithmetic */
119
+ #define __BINARY_OP_HALF_MACRO(name) /* do */ {\
120
+ __half val; \
121
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
122
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
123
+ return val; \
124
+ } /* while(0) */
125
+ #define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
126
+ __half2 val; \
127
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
128
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
129
+ return val; \
130
+ } /* while(0) */
131
+ #define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
132
+ __half val; \
133
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
134
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
135
+ return val; \
136
+ } /* while(0) */
137
+ #define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
138
+ __half2 val; \
139
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
140
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
141
+ return val; \
142
+ } /* while(0) */
143
+
144
+ /**
145
+ * Types which allow static initialization of "half" and "half2" until
146
+ * these become an actual builtin. Note this initialization is as a
147
+ * bitfield representation of "half", and not a conversion from short->half.
148
+ * Such a representation will be deprecated in a future version of CUDA.
149
+ * (Note these are visible to non-nvcc compilers, including C-only compilation)
150
+ */
151
+ typedef struct __CUDA_ALIGN__(2) {
152
+ unsigned short x;
153
+ } __half_raw;
154
+
155
+ typedef struct __CUDA_ALIGN__(4) {
156
+ unsigned short x;
157
+ unsigned short y;
158
+ } __half2_raw;
159
+
160
+ /* All other definitions in this file are only visible to C++ compilers */
161
+ #if defined(__cplusplus)
162
+
163
+ /* Hide GCC member initialization list warnings because of host/device in-function init requirement */
164
+ #if defined(__GNUC__)
165
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
166
+ #pragma GCC diagnostic push
167
+ #pragma GCC diagnostic ignored "-Wstrict-aliasing"
168
+ #pragma GCC diagnostic ignored "-Weffc++"
169
+ #endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
170
+ #endif /* defined(__GNUC__) */
171
+
172
+ /* class' : multiple assignment operators specified
173
+ The class has multiple assignment operators of a single type. This warning is informational */
174
+ #if defined(_MSC_VER) && _MSC_VER >= 1500
175
+ #pragma warning( push )
176
+ #pragma warning( disable:4522 )
177
+ #endif /* defined(__GNUC__) */
178
+
179
+ struct __CUDA_ALIGN__(2) __half {
180
+ protected:
181
+ unsigned short __x;
182
+
183
+ public:
184
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
185
+ __half() = default;
186
+ #else
187
+ __CUDA_HOSTDEVICE__ __half() { }
188
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
189
+
190
+ /* Convert to/from __half_raw */
191
+ __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
192
+ __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
193
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
194
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
195
+ __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
196
+ __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
197
+
198
+ #if !defined(__CUDA_NO_HALF_CONVERSIONS__)
199
+
200
+ /* Construct from float/double */
201
+ __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
202
+ __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
203
+
204
+ __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
205
+ __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
206
+
207
+ /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
208
+ __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; }
209
+
210
+ /* Member functions only available to nvcc compilation so far */
211
+ #if defined(__CUDACC__)
212
+ /* Allow automatic construction from types supported natively in hardware */
213
+ /* Note we do avoid constructor init-list because of special host/device compilation rules */
214
+ __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; }
215
+ __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; }
216
+ __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; }
217
+ __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; }
218
+ __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; }
219
+ __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; }
220
+
221
+ /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
222
+ __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); }
223
+ __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = __short2half_rn(val).__x; return *this; }
224
+
225
+ __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); }
226
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
227
+
228
+ __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); }
229
+ __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
230
+
231
+ __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); }
232
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
233
+
234
+ __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); }
235
+ __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
236
+
237
+ __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); }
238
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
239
+
240
+ /* Boolean conversion - note both 0 and -0 must return false */
241
+ __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
242
+ #endif /* defined(__CUDACC__) */
243
+ #endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
244
+ };
245
+
246
+ /* Global-space operator functions are only available to nvcc compilation */
247
+ #if defined(__CUDACC__)
248
+
249
+ /* Arithmetic FP16 operations only supported on arch >= 5.3 */
250
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
251
+ #if !defined(__CUDA_NO_HALF_OPERATORS__)
252
+ /* Some basic arithmetic operations expected of a builtin */
253
+ __device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
254
+ __device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
255
+ __device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
256
+ __device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
257
+
258
+ __device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; }
259
+ __device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
260
+ __device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
261
+ __device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
262
+
263
+ /* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
264
+ __device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; }
265
+ __device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
266
+ __device__ __forceinline__ __half operator++(__half &h, const int ignored)
267
+ {
268
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
269
+ static_cast<void>(ignored);
270
+
271
+ const __half ret = h;
272
+ __half_raw one;
273
+ one.x = 0x3C00U;
274
+ h += one;
275
+ return ret;
276
+ }
277
+ __device__ __forceinline__ __half operator--(__half &h, const int ignored)
278
+ {
279
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
280
+ static_cast<void>(ignored);
281
+
282
+ const __half ret = h;
283
+ __half_raw one;
284
+ one.x = 0x3C00U;
285
+ h -= one;
286
+ return ret;
287
+ }
288
+
289
+ /* Unary plus and inverse operators */
290
+ __device__ __forceinline__ __half operator+(const __half &h) { return h; }
291
+ __device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
292
+
293
+ /* Some basic comparison operations to make it look like a builtin */
294
+ __device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
295
+ __device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); }
296
+ __device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
297
+ __device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
298
+ __device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
299
+ __device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
300
+ #endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
301
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
302
+ #endif /* defined(__CUDACC__) */
303
+
304
+ /* __half2 is visible to non-nvcc host compilers */
305
+ struct __CUDA_ALIGN__(4) __half2 {
306
+ __half x;
307
+ __half y;
308
+
309
+ // All construct/copy/assign/move
310
+ public:
311
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
312
+ __half2() = default;
313
+ __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); }
314
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; }
315
+ #else
316
+ __CUDA_HOSTDEVICE__ __half2() { }
317
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
318
+ __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
319
+ __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); }
320
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; }
321
+
322
+ /* Convert to/from __half2_raw */
323
+ __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); }
324
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; }
325
+ __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; }
326
+ };
327
+
328
+ /* Global-space operator functions are only available to nvcc compilation */
329
+ #if defined(__CUDACC__)
330
+
331
+ /* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
332
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
333
+
334
+ __device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
335
+ __device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
336
+ __device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); }
337
+ __device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); }
338
+
339
+ __device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; }
340
+ __device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
341
+ __device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
342
+ __device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
343
+
344
+ __device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
345
+ __device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
346
+ __device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
347
+ {
348
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
349
+ static_cast<void>(ignored);
350
+
351
+ const __half2 ret = h;
352
+ __half2_raw one;
353
+ one.x = 0x3C00U;
354
+ one.y = 0x3C00U;
355
+ h = __hadd2(h, one);
356
+ return ret;
357
+ }
358
+ __device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored)
359
+ {
360
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
361
+ static_cast<void>(ignored);
362
+
363
+ const __half2 ret = h;
364
+ __half2_raw one;
365
+ one.x = 0x3C00U;
366
+ one.y = 0x3C00U;
367
+ h = __hsub2(h, one);
368
+ return ret;
369
+ }
370
+
371
+ __device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
372
+ __device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
373
+
374
+ __device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
375
+ __device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
376
+ __device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
377
+ __device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
378
+ __device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
379
+ __device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
380
+
381
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
382
+ #endif /* defined(__CUDACC__) */
383
+
384
+ /* Restore warning for multiple assignment operators */
385
+ #if defined(_MSC_VER) && _MSC_VER >= 1500
386
+ #pragma warning( pop )
387
+ #endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
388
+
389
+ /* Restore -Weffc++ warnings from here on */
390
+ #if defined(__GNUC__)
391
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
392
+ #pragma GCC diagnostic pop
393
+ #endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
394
+ #endif /* defined(__GNUC__) */
395
+
396
+ #undef __CUDA_HOSTDEVICE__
397
+ #undef __CUDA_ALIGN__
398
+
399
+ #ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
400
+ static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
401
+ {
402
+ unsigned int x;
403
+ unsigned int u;
404
+ unsigned int result;
405
+ #if defined(__CUDACC__)
406
+ (void)memcpy(&x, &f, sizeof(f));
407
+ #else
408
+ (void)std::memcpy(&x, &f, sizeof(f));
409
+ #endif
410
+ u = (x & 0x7fffffffU);
411
+ sign = ((x >> 16U) & 0x8000U);
412
+ // NaN/+Inf/-Inf
413
+ if (u >= 0x7f800000U) {
414
+ remainder = 0U;
415
+ result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
416
+ } else if (u > 0x477fefffU) { // Overflows
417
+ remainder = 0x80000000U;
418
+ result = (sign | 0x7bffU);
419
+ } else if (u >= 0x38800000U) { // Normal numbers
420
+ remainder = u << 19U;
421
+ u -= 0x38000000U;
422
+ result = (sign | (u >> 13U));
423
+ } else if (u < 0x33000001U) { // +0/-0
424
+ remainder = u;
425
+ result = sign;
426
+ } else { // Denormal numbers
427
+ const unsigned int exponent = u >> 23U;
428
+ const unsigned int shift = 0x7eU - exponent;
429
+ unsigned int mantissa = (u & 0x7fffffU);
430
+ mantissa |= 0x800000U;
431
+ remainder = mantissa << (32U - shift);
432
+ result = (sign | (mantissa >> shift));
433
+ result &= 0x0000FFFFU;
434
+ }
435
+ return static_cast<unsigned short>(result);
436
+ }
437
+ #endif /* #if !defined(__CUDACC_RTC__) */
438
+
439
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
440
+ {
441
+ #if defined(__CUDA_ARCH__)
442
+ __half val;
443
+ asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
444
+ return val;
445
+ #else
446
+ __half result;
447
+ /*
448
+ // Perform rounding to 11 bits of precision, convert value
449
+ // to float and call existing float to half conversion.
450
+ // By pre-rounding to 11 bits we avoid additional rounding
451
+ // in float to half conversion.
452
+ */
453
+ unsigned long long int absa;
454
+ unsigned long long int ua;
455
+ #if defined(__CUDACC__)
456
+ (void)memcpy(&ua, &a, sizeof(a));
457
+ #else
458
+ (void)std::memcpy(&ua, &a, sizeof(a));
459
+ #endif
460
+ absa = (ua & 0x7fffffffffffffffULL);
461
+ if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
462
+ {
463
+ /*
464
+ // |a| >= 2^16 or NaN or |a| <= 2^(-25)
465
+ // double-rounding is not a problem
466
+ */
467
+ result = __float2half(static_cast<float>(a));
468
+ }
469
+ else
470
+ {
471
+ /*
472
+ // here 2^(-25) < |a| < 2^16
473
+ // prepare shifter value such that a + shifter
474
+ // done in double precision performs round-to-nearest-even
475
+ // and (a + shifter) - shifter results in a rounded to
476
+ // 11 bits of precision. Shifter needs to have exponent of
477
+ // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
478
+ // against negative values.
479
+ // So need to have |a| capped to avoid overflow in exponent.
480
+ // For inputs that are smaller than half precision minnorm
481
+ // we prepare fixed shifter exponent.
482
+ */
483
+ unsigned long long shifterBits;
484
+ if (absa >= 0x3f10000000000000ULL)
485
+ {
486
+ /*
487
+ // Here if |a| >= 2^(-14)
488
+ // add 42 to exponent bits
489
+ */
490
+ shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
491
+ }
492
+ else
493
+ {
494
+ /*
495
+ // 2^(-25) < |a| < 2^(-14), potentially results in denormal
496
+ // set exponent bits to 42 - 14 + bias
497
+ */
498
+ shifterBits = 0x41B0000000000000ULL;
499
+ }
500
+ // set leading mantissa bit to protect against negative inputs
501
+ shifterBits |= 0x0008000000000000ULL;
502
+ double shifter;
503
+ #if defined(__CUDACC__)
504
+ (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
505
+ #else
506
+ (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
507
+ #endif
508
+ double aShiftRound = a + shifter;
509
+
510
+ /*
511
+ // Prevent the compiler from optimizing away a + shifter - shifter
512
+ // by doing intermediate memcopy and harmless bitwize operation
513
+ */
514
+ unsigned long long int aShiftRoundBits;
515
+ #if defined(__CUDACC__)
516
+ (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
517
+ #else
518
+ (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
519
+ #endif
520
+
521
+ // the value is positive, so this operation doesn't change anything
522
+ aShiftRoundBits &= 0x7fffffffffffffffULL;
523
+
524
+ #if defined(__CUDACC__)
525
+ (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
526
+ #else
527
+ (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
528
+ #endif
529
+
530
+ result = __float2half(static_cast<float>(aShiftRound - shifter));
531
+ }
532
+
533
+ return result;
534
+ #endif
535
+ }
536
+
537
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
538
+ {
539
+ __half val;
540
+ #if defined(__CUDA_ARCH__)
541
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
542
+ #else
543
+ __half_raw r;
544
+ unsigned int sign = 0U;
545
+ unsigned int remainder = 0U;
546
+ r.x = __internal_float2half(a, sign, remainder);
547
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
548
+ r.x++;
549
+ }
550
+ val = r;
551
+ #endif
552
+ return val;
553
+ }
554
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
555
+ {
556
+ __half val;
557
+ #if defined(__CUDA_ARCH__)
558
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
559
+ #else
560
+ __half_raw r;
561
+ unsigned int sign = 0U;
562
+ unsigned int remainder = 0U;
563
+ r.x = __internal_float2half(a, sign, remainder);
564
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
565
+ r.x++;
566
+ }
567
+ val = r;
568
+ #endif
569
+ return val;
570
+ }
571
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
572
+ {
573
+ __half val;
574
+ #if defined(__CUDA_ARCH__)
575
+ asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
576
+ #else
577
+ __half_raw r;
578
+ unsigned int sign = 0U;
579
+ unsigned int remainder = 0U;
580
+ r.x = __internal_float2half(a, sign, remainder);
581
+ val = r;
582
+ #endif
583
+ return val;
584
+ }
585
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
586
+ {
587
+ __half val;
588
+ #if defined(__CUDA_ARCH__)
589
+ asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
590
+ #else
591
+ __half_raw r;
592
+ unsigned int sign = 0U;
593
+ unsigned int remainder = 0U;
594
+ r.x = __internal_float2half(a, sign, remainder);
595
+ if ((remainder != 0U) && (sign != 0U)) {
596
+ r.x++;
597
+ }
598
+ val = r;
599
+ #endif
600
+ return val;
601
+ }
602
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
603
+ {
604
+ __half val;
605
+ #if defined(__CUDA_ARCH__)
606
+ asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
607
+ #else
608
+ __half_raw r;
609
+ unsigned int sign = 0U;
610
+ unsigned int remainder = 0U;
611
+ r.x = __internal_float2half(a, sign, remainder);
612
+ if ((remainder != 0U) && (sign == 0U)) {
613
+ r.x++;
614
+ }
615
+ val = r;
616
+ #endif
617
+ return val;
618
+ }
619
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
620
+ {
621
+ __half2 val;
622
+ #if defined(__CUDA_ARCH__)
623
+ asm("{.reg .f16 low;\n"
624
+ " cvt.rn.f16.f32 low, %1;\n"
625
+ " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
626
+ #else
627
+ val = __half2(__float2half_rn(a), __float2half_rn(a));
628
+ #endif
629
+ return val;
630
+ }
631
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
632
+ {
633
+ __half2 val;
634
+ #if defined(__CUDA_ARCH__)
635
+ #if (__CUDA_ARCH__ >= 800)
636
+ asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
637
+ : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
638
+ #else
639
+ asm("{.reg .f16 low,high;\n"
640
+ " cvt.rn.f16.f32 low, %1;\n"
641
+ " cvt.rn.f16.f32 high, %2;\n"
642
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
643
+ #endif
644
+ #else
645
+ val = __half2(__float2half_rn(a), __float2half_rn(b));
646
+ #endif
647
+ return val;
648
+ }
649
+
650
+ #ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
651
+ static inline float __internal_half2float(const unsigned short h)
652
+ {
653
+ unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
654
+ unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
655
+ unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
656
+ float f;
657
+ if (exponent == 0x1fU) { /* NaN or Inf */
658
+ /* discard sign of a NaN */
659
+ sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
660
+ mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
661
+ exponent = 0xffU;
662
+ } else if (exponent == 0U) { /* Denorm or Zero */
663
+ if (mantissa != 0U) {
664
+ unsigned int msb;
665
+ exponent = 0x71U;
666
+ do {
667
+ msb = (mantissa & 0x400000U);
668
+ mantissa <<= 1U; /* normalize */
669
+ --exponent;
670
+ } while (msb == 0U);
671
+ mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
672
+ }
673
+ } else {
674
+ exponent += 0x70U;
675
+ }
676
+ const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
677
+ #if defined(__CUDACC__)
678
+ (void)memcpy(&f, &u, sizeof(u));
679
+ #else
680
+ (void)std::memcpy(&f, &u, sizeof(u));
681
+ #endif
682
+ return f;
683
+ }
684
+ #endif /* !defined(__CUDACC_RTC__) */
685
+
686
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
687
+ {
688
+ float val;
689
+ #if defined(__CUDA_ARCH__)
690
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
691
+ #else
692
+ val = __internal_half2float(static_cast<__half_raw>(a).x);
693
+ #endif
694
+ return val;
695
+ }
696
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
697
+ {
698
+ float val;
699
+ #if defined(__CUDA_ARCH__)
700
+ asm("{.reg .f16 low,high;\n"
701
+ " mov.b32 {low,high},%1;\n"
702
+ " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
703
+ #else
704
+ val = __internal_half2float(static_cast<__half2_raw>(a).x);
705
+ #endif
706
+ return val;
707
+ }
708
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
709
+ {
710
+ float val;
711
+ #if defined(__CUDA_ARCH__)
712
+ asm("{.reg .f16 low,high;\n"
713
+ " mov.b32 {low,high},%1;\n"
714
+ " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
715
+ #else
716
+ val = __internal_half2float(static_cast<__half2_raw>(a).y);
717
+ #endif
718
+ return val;
719
+ }
720
+ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
721
+ {
722
+ short int i;
723
+ #if defined __CUDA_ARCH__
724
+ asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
725
+ #else
726
+ const float f = __half2float(h);
727
+ const short int max_val = (short int)0x7fffU;
728
+ const short int min_val = (short int)0x8000U;
729
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
730
+ // saturation fixup
731
+ if (bits > (unsigned short)0xF800U) {
732
+ // NaN
733
+ i = 0;
734
+ } else if (f > static_cast<float>(max_val)) {
735
+ // saturate maximum
736
+ i = max_val;
737
+ } else if (f < static_cast<float>(min_val)) {
738
+ // saturate minimum
739
+ i = min_val;
740
+ } else {
741
+ // normal value, conversion is well-defined
742
+ i = static_cast<short int>(f);
743
+ }
744
+ #endif
745
+ return i;
746
+ }
747
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
748
+ {
749
+ unsigned short int i;
750
+ #if defined __CUDA_ARCH__
751
+ asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
752
+ #else
753
+ const float f = __half2float(h);
754
+ const unsigned short int max_val = 0xffffU;
755
+ const unsigned short int min_val = 0U;
756
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
757
+ // saturation fixup
758
+ if (bits > (unsigned short)0xF800U) {
759
+ // NaN
760
+ i = 0U;
761
+ } else if (f > static_cast<float>(max_val)) {
762
+ // saturate maximum
763
+ i = max_val;
764
+ } else if (f < static_cast<float>(min_val)) {
765
+ // saturate minimum
766
+ i = min_val;
767
+ } else {
768
+ // normal value, conversion is well-defined
769
+ i = static_cast<unsigned short int>(f);
770
+ }
771
+ #endif
772
+ return i;
773
+ }
774
+ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
775
+ {
776
+ int i;
777
+ #if defined __CUDA_ARCH__
778
+ asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
779
+ #else
780
+ const float f = __half2float(h);
781
+ const int max_val = (int)0x7fffffffU;
782
+ const int min_val = (int)0x80000000U;
783
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
784
+ // saturation fixup
785
+ if (bits > (unsigned short)0xF800U) {
786
+ // NaN
787
+ i = 0;
788
+ } else if (f > static_cast<float>(max_val)) {
789
+ // saturate maximum
790
+ i = max_val;
791
+ } else if (f < static_cast<float>(min_val)) {
792
+ // saturate minimum
793
+ i = min_val;
794
+ } else {
795
+ // normal value, conversion is well-defined
796
+ i = static_cast<int>(f);
797
+ }
798
+ #endif
799
+ return i;
800
+ }
801
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
802
+ {
803
+ unsigned int i;
804
+ #if defined __CUDA_ARCH__
805
+ asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
806
+ #else
807
+ const float f = __half2float(h);
808
+ const unsigned int max_val = 0xffffffffU;
809
+ const unsigned int min_val = 0U;
810
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
811
+ // saturation fixup
812
+ if (bits > (unsigned short)0xF800U) {
813
+ // NaN
814
+ i = 0U;
815
+ } else if (f > static_cast<float>(max_val)) {
816
+ // saturate maximum
817
+ i = max_val;
818
+ } else if (f < static_cast<float>(min_val)) {
819
+ // saturate minimum
820
+ i = min_val;
821
+ } else {
822
+ // normal value, conversion is well-defined
823
+ i = static_cast<unsigned int>(f);
824
+ }
825
+ #endif
826
+ return i;
827
+ }
828
+ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
829
+ {
830
+ long long int i;
831
+ #if defined __CUDA_ARCH__
832
+ asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
833
+ #else
834
+ const float f = __half2float(h);
835
+ const long long int max_val = (long long int)0x7fffffffffffffffULL;
836
+ const long long int min_val = (long long int)0x8000000000000000ULL;
837
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
838
+ // saturation fixup
839
+ if (bits > (unsigned short)0xF800U) {
840
+ // NaN
841
+ i = min_val;
842
+ } else if (f > static_cast<float>(max_val)) {
843
+ // saturate maximum
844
+ i = max_val;
845
+ } else if (f < static_cast<float>(min_val)) {
846
+ // saturate minimum
847
+ i = min_val;
848
+ } else {
849
+ // normal value, conversion is well-defined
850
+ i = static_cast<long long int>(f);
851
+ }
852
+ #endif
853
+ return i;
854
+ }
855
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
856
+ {
857
+ unsigned long long int i;
858
+ #if defined __CUDA_ARCH__
859
+ asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
860
+ #else
861
+ const float f = __half2float(h);
862
+ const unsigned long long int max_val = 0xffffffffffffffffULL;
863
+ const unsigned long long int min_val = 0ULL;
864
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
865
+ // saturation fixup
866
+ if (bits > (unsigned short)0xF800U) {
867
+ // NaN
868
+ i = 0x8000000000000000ULL;
869
+ } else if (f > static_cast<float>(max_val)) {
870
+ // saturate maximum
871
+ i = max_val;
872
+ } else if (f < static_cast<float>(min_val)) {
873
+ // saturate minimum
874
+ i = min_val;
875
+ } else {
876
+ // normal value, conversion is well-defined
877
+ i = static_cast<unsigned long long int>(f);
878
+ }
879
+ #endif
880
+ return i;
881
+ }
882
+
883
+ /* Intrinsic functions only available to nvcc compilers */
884
+ #if defined(__CUDACC__)
885
+
886
+ /* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
887
+ __VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y)
888
+ {
889
+ __half2 t; t.x = x; t.y = y; return t;
890
+ }
891
+ #undef __VECTOR_FUNCTIONS_DECL__
892
+
893
+
894
+ /* Definitions of intrinsics */
895
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
896
+ {
897
+ const __half2 val = __floats2half2_rn(a.x, a.y);
898
+ return val;
899
+ }
900
+ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
901
+ {
902
+ float hi_float;
903
+ float lo_float;
904
+ #if defined(__CUDA_ARCH__)
905
+ asm("{.reg .f16 low,high;\n"
906
+ " mov.b32 {low,high},%1;\n"
907
+ " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));
908
+
909
+ asm("{.reg .f16 low,high;\n"
910
+ " mov.b32 {low,high},%1;\n"
911
+ " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));
912
+ #else
913
+ lo_float = __internal_half2float(((__half2_raw)a).x);
914
+ hi_float = __internal_half2float(((__half2_raw)a).y);
915
+ #endif
916
+ return make_float2(lo_float, hi_float);
917
+ }
918
+ __CUDA_FP16_DECL__ int __half2int_rn(const __half h)
919
+ {
920
+ int i;
921
+ asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
922
+ return i;
923
+ }
924
+ __CUDA_FP16_DECL__ int __half2int_rd(const __half h)
925
+ {
926
+ int i;
927
+ asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
928
+ return i;
929
+ }
930
+ __CUDA_FP16_DECL__ int __half2int_ru(const __half h)
931
+ {
932
+ int i;
933
+ asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
934
+ return i;
935
+ }
936
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
937
+ {
938
+ __half h;
939
+ #if defined(__CUDA_ARCH__)
940
+ asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
941
+ #else
942
+ // double-rounding is not a problem here: if integer
943
+ // has more than 24 bits, it is already too large to
944
+ // be represented in half precision, and result will
945
+ // be infinity.
946
+ const float f = static_cast<float>(i);
947
+ h = __float2half_rn(f);
948
+ #endif
949
+ return h;
950
+ }
951
+ __CUDA_FP16_DECL__ __half __int2half_rz(const int i)
952
+ {
953
+ __half h;
954
+ asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
955
+ return h;
956
+ }
957
+ __CUDA_FP16_DECL__ __half __int2half_rd(const int i)
958
+ {
959
+ __half h;
960
+ asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
961
+ return h;
962
+ }
963
+ __CUDA_FP16_DECL__ __half __int2half_ru(const int i)
964
+ {
965
+ __half h;
966
+ asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
967
+ return h;
968
+ }
969
+
970
// __half -> signed 16-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
{
    short int i;
    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 16-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
{
    short int i;
    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 16-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
{
    short int i;
    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Signed 16-bit int -> __half, round-to-nearest-even.
// Host/device: device path uses the hardware cvt; host path converts via
// float, which is exact for all 16-bit integers (float has 24 mantissa bits).
__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
{
    __half h;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
#else
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Signed 16-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __short2half_rz(const short int i)
{
    __half h;
    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Signed 16-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __short2half_rd(const short int i)
{
    __half h;
    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Signed 16-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __short2half_ru(const short int i)
{
    __half h;
    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
1017
+
1018
// __half -> unsigned 32-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
{
    unsigned int i;
    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 32-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
{
    unsigned int i;
    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 32-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
{
    unsigned int i;
    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Unsigned 32-bit int -> __half, round-to-nearest-even.
// Host path goes through float; see the double-rounding note below.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
{
    __half h;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
#else
    // double-rounding is not a problem here: if integer
    // has more than 24 bits, it is already too large to
    // be represented in half precision, and result will
    // be infinity.
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Unsigned 32-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
{
    __half h;
    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
    return h;
}
// Unsigned 32-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
{
    __half h;
    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
    return h;
}
// Unsigned 32-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
{
    __half h;
    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
    return h;
}
1069
+
1070
// __half -> unsigned 16-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
{
    unsigned short int i;
    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 16-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
{
    unsigned short int i;
    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 16-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
{
    unsigned short int i;
    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Unsigned 16-bit int -> __half, round-to-nearest-even.
// Host path converts via float, which is exact for all 16-bit integers.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
{
    __half h;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
#else
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Unsigned 16-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
{
    __half h;
    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Unsigned 16-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
{
    __half h;
    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Unsigned 16-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
{
    __half h;
    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
1117
+
1118
// __half -> unsigned 64-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
{
    unsigned long long int i;
    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 64-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
{
    unsigned long long int i;
    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 64-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
{
    unsigned long long int i;
    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Unsigned 64-bit int -> __half, round-to-nearest-even.
// Host path goes through float; see the double-rounding note below.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
{
    __half h;
#if defined(__CUDA_ARCH__)
    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
#else
    // double-rounding is not a problem here: if integer
    // has more than 24 bits, it is already too large to
    // be represented in half precision, and result will
    // be infinity.
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Unsigned 64-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
{
    __half h;
    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Unsigned 64-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
{
    __half h;
    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Unsigned 64-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
{
    __half h;
    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
1169
+
1170
// __half -> signed 64-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
{
    long long int i;
    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 64-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
{
    long long int i;
    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 64-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
{
    long long int i;
    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Signed 64-bit int -> __half, round-to-nearest-even.
// Host path goes through float; see the double-rounding note below.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
{
    __half h;
#if defined(__CUDA_ARCH__)
    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
#else
    // double-rounding is not a problem here: if integer
    // has more than 24 bits, it is already too large to
    // be represented in half precision, and result will
    // be infinity.
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Signed 64-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i)
{
    __half h;
    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Signed 64-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i)
{
    __half h;
    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Signed 64-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i)
{
    __half h;
    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
1221
+
1222
// Round to integral value toward zero (truncate), result stays in fp16.
__CUDA_FP16_DECL__ __half htrunc(const __half h)
{
    __half r;
    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
// Round to integral value toward +inf (ceiling), result stays in fp16.
__CUDA_FP16_DECL__ __half hceil(const __half h)
{
    __half r;
    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
// Round to integral value toward -inf (floor), result stays in fp16.
__CUDA_FP16_DECL__ __half hfloor(const __half h)
{
    __half r;
    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
// Round to nearest integral value, ties-to-even, result stays in fp16.
__CUDA_FP16_DECL__ __half hrint(const __half h)
{
    __half r;
    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
1246
+
1247
// Vector (per-lane) truncate of both halves of a __half2: unpacks the 32-bit
// register into two f16 values, applies cvt .rzi to each, and repacks.
__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rzi.f16.f16 low, low;\n"
        "  cvt.rzi.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
// Per-lane ceiling (round toward +inf) of both halves of a __half2.
__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rpi.f16.f16 low, low;\n"
        "  cvt.rpi.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
// Per-lane floor (round toward -inf) of both halves of a __half2.
__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rmi.f16.f16 low, low;\n"
        "  cvt.rmi.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
// Per-lane round-to-nearest-even integral value of both halves of a __half2.
__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rni.f16.f16 low, low;\n"
        "  cvt.rni.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
1287
// Pack the low halves of a and b into a new __half2: {a.low, b.low}.
__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b)
{
    __half2 val;
    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
        "  mov.b32 {alow,ahigh}, %1;\n"
        "  mov.b32 {blow,bhigh}, %2;\n"
        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
    return val;
}
// Pack the high halves of a and b into a new __half2: {a.high, b.high}.
__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b)
{
    __half2 val;
    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
        "  mov.b32 {alow,ahigh}, %1;\n"
        "  mov.b32 {blow,bhigh}, %2;\n"
        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
    return val;
}
// Extract the low 16 bits of a __half2 as a scalar __half.
__CUDA_FP16_DECL__ __half __low2half(const __half2 a)
{
    __half ret;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
    return ret;
}
1313
// Classify infinities by fp16 bit pattern: returns -1 for negative infinity
// (0xFC00), +1 for positive infinity (0x7C00), and 0 for any other value.
__CUDA_FP16_DECL__ int __hisinf(const __half a)
{
    const unsigned short bits = __HALF_TO_CUS(a);
    int retval = 0;
    if (bits == 0xFC00U) {
        retval = -1;
    }
    if (bits == 0x7C00U) {
        retval = 1;
    }
    return retval;
}
1325
// Broadcast the low half of a into both lanes: {a.low, a.low}.
__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
    return val;
}
// Broadcast the high half of a into both lanes: {a.high, a.high}.
__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
    return val;
}
// Extract the high 16 bits of a __half2 as a scalar __half.
__CUDA_FP16_DECL__ __half __high2half(const __half2 a)
{
    __half ret;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
    return ret;
}
// Pack two scalar halves into a __half2: a in the low lane, b in the high lane.
__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b)
{
    __half2 val;
    asm("{  mov.b32 %0, {%1,%2};}\n"
        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b)));
    return val;
}
// Broadcast one scalar half into both lanes of a __half2.
__CUDA_FP16_DECL__ __half2 __half2half2(const __half a)
{
    __half2 val;
    asm("{  mov.b32 %0, {%1,%1};}\n"
        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)));
    return val;
}
// Swap the low and high lanes of a __half2.
__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
    return val;
}
// Reinterpret the 16 raw bits of a __half as a signed short (no conversion).
__CUDA_FP16_DECL__ short int __half_as_short(const __half h)
{
    return static_cast<short int>(__HALF_TO_CUS(h));
}
// Reinterpret the 16 raw bits of a __half as an unsigned short (no conversion).
__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
{
    return __HALF_TO_CUS(h);
}
// Reinterpret a signed short's bits as a __half (no conversion).
__CUDA_FP16_DECL__ __half __short_as_half(const short int i)
{
    __half h;
    __HALF_TO_US(h) = static_cast<unsigned short int>(i);
    return h;
}
// Reinterpret an unsigned short's bits as a __half (no conversion).
__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
{
    __half h;
    __HALF_TO_US(h) = i;
    return h;
}
1391
+
1392
/******************************************************************************
*                           __half arithmetic                                 *
******************************************************************************/
// Maximum of two halves. On SM80+ (and host) uses the native f16 max;
// on older devices falls back to f32 max on widened operands.
__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF_MACRO(max)
#else
    const float fa = __half2float(a);
    const float fb = __half2float(b);
    float fr;
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fr) : "f"(fa), "f"(fb));
    const __half hr = __float2half(fr);
    return hr;
#endif
}
// Minimum of two halves; same SM80 native / pre-SM80 f32-fallback split.
__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF_MACRO(min)
#else
    const float fa = __half2float(a);
    const float fb = __half2float(b);
    float fr;
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fr) : "f"(fa), "f"(fb));
    const __half hr = __float2half(fr);
    return hr;
#endif
}
1423
+
1424
/******************************************************************************
*                          __half2 arithmetic                                 *
******************************************************************************/
// Per-lane maximum of two __half2 values. SM80+ uses the native f16x2 max;
// older devices widen each lane to f32, take max per lane, and repack.
__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF2_MACRO(max)
#else
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    float2 fr;
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
    const __half2 hr = __float22half2_rn(fr);
    return hr;
#endif
}
// Per-lane minimum of two __half2 values; same split as __hmax2.
__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF2_MACRO(min)
#else
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    float2 fr;
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
    const __half2 hr = __float22half2_rn(fr);
    return hr;
#endif
}
1459
+
1460
+
1461
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
1462
+ /******************************************************************************
1463
+ * __half, __half2 warp shuffle *
1464
+ ******************************************************************************/
1465
+ #define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
1466
+ __half2 r; \
1467
+ asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
1468
+ :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
1469
+ return r; \
1470
+ } /* while(0) */
1471
+
1472
+ #define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
1473
+ __half2 r; \
1474
+ asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
1475
+ :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
1476
+ return r; \
1477
+ } /* while(0) */
1478
+
1479
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1480
+
1481
+ __CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
1482
+ {
1483
+ unsigned int warp_size;
1484
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1485
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1486
+ __SHUFFLE_HALF2_MACRO(shfl.idx.b32)
1487
+ }
1488
+ __CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
1489
+ {
1490
+ unsigned int warp_size;
1491
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1492
+ const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
1493
+ __SHUFFLE_HALF2_MACRO(shfl.up.b32)
1494
+ }
1495
+ __CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
1496
+ {
1497
+ unsigned int warp_size;
1498
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1499
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1500
+ __SHUFFLE_HALF2_MACRO(shfl.down.b32)
1501
+ }
1502
+ __CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
1503
+ {
1504
+ unsigned int warp_size;
1505
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1506
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1507
+ __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)
1508
+ }
1509
+
1510
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
1511
+
1512
+ __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width)
1513
+ {
1514
+ unsigned int warp_size;
1515
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1516
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1517
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32)
1518
+ }
1519
+ __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
1520
+ {
1521
+ unsigned int warp_size;
1522
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1523
+ const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
1524
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32)
1525
+ }
1526
+ __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
1527
+ {
1528
+ unsigned int warp_size;
1529
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1530
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1531
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32)
1532
+ }
1533
+ __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width)
1534
+ {
1535
+ unsigned int warp_size;
1536
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1537
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1538
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32)
1539
+ }
1540
+
1541
+ #undef __SHUFFLE_HALF2_MACRO
1542
+ #undef __SHUFFLE_SYNC_HALF2_MACRO
1543
+
1544
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1545
+
1546
+ __CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
1547
+ {
1548
+ const __half2 temp1 = __halves2half2(var, var);
1549
+ const __half2 temp2 = __shfl(temp1, delta, width);
1550
+ return __low2half(temp2);
1551
+ }
1552
+ __CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
1553
+ {
1554
+ const __half2 temp1 = __halves2half2(var, var);
1555
+ const __half2 temp2 = __shfl_up(temp1, delta, width);
1556
+ return __low2half(temp2);
1557
+ }
1558
+ __CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
1559
+ {
1560
+ const __half2 temp1 = __halves2half2(var, var);
1561
+ const __half2 temp2 = __shfl_down(temp1, delta, width);
1562
+ return __low2half(temp2);
1563
+ }
1564
+ __CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
1565
+ {
1566
+ const __half2 temp1 = __halves2half2(var, var);
1567
+ const __half2 temp2 = __shfl_xor(temp1, delta, width);
1568
+ return __low2half(temp2);
1569
+ }
1570
+
1571
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
1572
+
1573
+ __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width)
1574
+ {
1575
+ const __half2 temp1 = __halves2half2(var, var);
1576
+ const __half2 temp2 = __shfl_sync(mask, temp1, delta, width);
1577
+ return __low2half(temp2);
1578
+ }
1579
+ __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
1580
+ {
1581
+ const __half2 temp1 = __halves2half2(var, var);
1582
+ const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width);
1583
+ return __low2half(temp2);
1584
+ }
1585
+ __CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
1586
+ {
1587
+ const __half2 temp1 = __halves2half2(var, var);
1588
+ const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width);
1589
+ return __low2half(temp2);
1590
+ }
1591
+ __CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width)
1592
+ {
1593
+ const __half2 temp1 = __halves2half2(var, var);
1594
+ const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width);
1595
+ return __low2half(temp2);
1596
+ }
1597
+
1598
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/
1599
/******************************************************************************
*               __half and __half2 __ldg,__ldcg,__ldca,__ldcs                 *
******************************************************************************/

#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))
// Pointer operand constraint for inline PTX: 64-bit "l" on LP64/Win64/NVRTC,
// 32-bit "r" otherwise.
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
#define __LDG_PTR   "l"
#else
#define __LDG_PTR   "r"
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
// Read-only (non-coherent) cache load, ld.global.nc.
__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Cache-global load (cache at L2, not L1), ld.global.cg.
__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Cache-all load (cache at all levels), ld.global.ca.
__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Cache-streaming load (evict-first), ld.global.cs.
__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Last-use load, ld.global.lu; "memory" clobber because the cached line may
// be invalidated after the read.
__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
// Volatile (don't-cache) load, ld.global.cv.
__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
// Write-back store, st.global.wb.
__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value)
{
    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
// Cache-global store, st.global.cg.
__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value)
{
    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
// Cache-streaming store, st.global.cs.
__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value)
{
    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
// Write-through store, st.global.wt.
__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
{
    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
#undef __LDG_PTR
#endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/
1715
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
/******************************************************************************
*                             __half2 comparison                              *
******************************************************************************/
// Per-lane comparison returning a __half2 mask; PTX set.<op>.f16x2.f16x2
// yields 1.0 in a lane where the predicate holds, 0.0 otherwise.
#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
   __half2 val; \
   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
   return val; \
} /* while(0) */
// Ordered comparisons (false if either operand is NaN).
__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.eq)
}
__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.ne)
}
__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.le)
}
__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.ge)
}
__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.lt)
}
__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.gt)
}
// Unordered comparisons (PTX ".xxu" ops: true if either operand is NaN).
__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.equ)
}
__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.neu)
}
__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.leu)
}
__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.geu)
}
__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.ltu)
}
__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.gtu)
}
#undef __COMPARISON_OP_HALF2_MACRO
1774
// Per-lane comparison collapsed to a single bool: true only when BOTH lanes
// compare true (the f16x2 set result equals {1.0, 1.0} == 0x3C003C00).
#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
   __half2 val; \
   bool retval; \
   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
   if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\
      retval = true; \
   } else { \
      retval = false; \
   }\
   return retval;\
} /* while(0) */
// Ordered both-lane comparisons.
__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq)
}
__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne)
}
__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.le)
}
__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge)
}
__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt)
}
__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt)
}
// Unordered both-lane comparisons.
__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ)
}
__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu)
}
__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu)
}
__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu)
}
__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu)
}
__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu)
}
#undef __BOOL_COMPARISON_OP_HALF2_MACRO
1835
/******************************************************************************
*                             __half comparison                               *
******************************************************************************/
// Scalar f16 comparison via a predicate register: setp sets the predicate,
// selp materializes it as 1 or 0, and the result is returned as bool.
#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
   unsigned short val; \
   asm( "{ .reg .pred __$temp3;\n" \
        "  setp." __CUDA_FP16_STRINGIFY(name) ".f16  __$temp3, %1, %2;\n" \
        "  selp.u16 %0, 1, 0, __$temp3;}" \
        : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
   return (val != 0U) ? true : false; \
} /* while(0) */
// Ordered comparisons (false if either operand is NaN).
__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(eq)
}
__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(ne)
}
__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(le)
}
__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(ge)
}
__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(lt)
}
__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(gt)
}
// Unordered comparisons (true if either operand is NaN).
__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(equ)
}
__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(neu)
}
__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(leu)
}
__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(geu)
}
__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(ltu)
}
__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(gtu)
}
#undef __COMPARISON_OP_HALF_MACRO
1895
+ /******************************************************************************
1896
+ * __half2 arithmetic *
1897
+ ******************************************************************************/
1898
+ __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
1899
+ {
1900
+ __BINARY_OP_HALF2_MACRO(add)
1901
+ }
1902
+ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b)
1903
+ {
1904
+ __BINARY_OP_HALF2_MACRO(sub)
1905
+ }
1906
+ __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b)
1907
+ {
1908
+ __BINARY_OP_HALF2_MACRO(mul)
1909
+ }
1910
+ __CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b)
1911
+ {
1912
+ __BINARY_OP_HALF2_MACRO(add.sat)
1913
+ }
1914
+ __CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b)
1915
+ {
1916
+ __BINARY_OP_HALF2_MACRO(sub.sat)
1917
+ }
1918
+ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
1919
+ {
1920
+ __BINARY_OP_HALF2_MACRO(mul.sat)
1921
+ }
1922
+ __CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b)
1923
+ {
1924
+ __BINARY_OP_HALF2_MACRO(add.rn)
1925
+ }
1926
+ __CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b)
1927
+ {
1928
+ __BINARY_OP_HALF2_MACRO(sub.rn)
1929
+ }
1930
+ __CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b)
1931
+ {
1932
+ __BINARY_OP_HALF2_MACRO(mul.rn)
1933
+ }
1934
+ __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
1935
+ {
1936
+ __TERNARY_OP_HALF2_MACRO(fma.rn)
1937
+ }
1938
+ __CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c)
1939
+ {
1940
+ __TERNARY_OP_HALF2_MACRO(fma.rn.sat)
1941
+ }
1942
+ __CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) {
1943
+ __half ha = __low2half(a);
1944
+ __half hb = __low2half(b);
1945
+
1946
+ const __half v1 = __hdiv(ha, hb);
1947
+
1948
+ ha = __high2half(a);
1949
+ hb = __high2half(b);
1950
+
1951
+ const __half v2 = __hdiv(ha, hb);
1952
+
1953
+ return __halves2half2(v1, v2);
1954
+ }
1955
+ /******************************************************************************
1956
+ * __half arithmetic *
1957
+ ******************************************************************************/
1958
+ __CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b)
1959
+ {
1960
+ __BINARY_OP_HALF_MACRO(add)
1961
+ }
1962
+ __CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b)
1963
+ {
1964
+ __BINARY_OP_HALF_MACRO(sub)
1965
+ }
1966
+ __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b)
1967
+ {
1968
+ __BINARY_OP_HALF_MACRO(mul)
1969
+ }
1970
+ __CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b)
1971
+ {
1972
+ __BINARY_OP_HALF_MACRO(add.sat)
1973
+ }
1974
+ __CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b)
1975
+ {
1976
+ __BINARY_OP_HALF_MACRO(sub.sat)
1977
+ }
1978
+ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
1979
+ {
1980
+ __BINARY_OP_HALF_MACRO(mul.sat)
1981
+ }
1982
+ __CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b)
1983
+ {
1984
+ __BINARY_OP_HALF_MACRO(add.rn)
1985
+ }
1986
+ __CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b)
1987
+ {
1988
+ __BINARY_OP_HALF_MACRO(sub.rn)
1989
+ }
1990
+ __CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b)
1991
+ {
1992
+ __BINARY_OP_HALF_MACRO(mul.rn)
1993
+ }
1994
+ __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
1995
+ {
1996
+ __TERNARY_OP_HALF_MACRO(fma.rn)
1997
+ }
1998
+ __CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c)
1999
+ {
2000
+ __TERNARY_OP_HALF_MACRO(fma.rn.sat)
2001
+ }
2002
+ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
2003
+ __half v;
2004
+ __half abs;
2005
+ __half den;
2006
+ __HALF_TO_US(den) = 0x008FU;
2007
+
2008
+ float rcp;
2009
+ const float fa = __half2float(a);
2010
+ const float fb = __half2float(b);
2011
+
2012
+ asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb));
2013
+
2014
+ float fv = rcp * fa;
2015
+
2016
+ v = __float2half(fv);
2017
+ __HALF_TO_US(abs) = static_cast<unsigned short>(static_cast<unsigned int>(__HALF_TO_CUS(v)) & 0x00007FFFU);
2018
+ if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000U))) {
2019
+ const float err = __fmaf_rn(-fb, fv, fa);
2020
+ fv = __fmaf_rn(rcp, err, fv);
2021
+ v = __float2half(fv);
2022
+ }
2023
+ return v;
2024
+ }
2025
+
2026
+ /******************************************************************************
2027
+ * __half2 functions *
2028
+ ******************************************************************************/
2029
+ #define __SPEC_CASE2(i,r, spc, ulp) \
2030
+ "{.reg.b32 spc, ulp, p;\n"\
2031
+ " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
2032
+ " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
2033
+ " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
2034
+ " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
2035
+ #define __SPEC_CASE(i,r, spc, ulp) \
2036
+ "{.reg.b16 spc, ulp, p;\n"\
2037
+ " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
2038
+ " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
2039
+ " set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
2040
+ " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
2041
+ #define __APPROX_FCAST(fun) /* do */ {\
2042
+ __half val;\
2043
+ asm("{.reg.b32 f; \n"\
2044
+ " .reg.b16 r; \n"\
2045
+ " mov.b16 r,%1; \n"\
2046
+ " cvt.f32.f16 f,r; \n"\
2047
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\
2048
+ " cvt.rn.f16.f32 r,f; \n"\
2049
+ " mov.b16 %0,r; \n"\
2050
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
2051
+ return val;\
2052
+ } /* while(0) */
2053
+ #define __APPROX_FCAST2(fun) /* do */ {\
2054
+ __half2 val;\
2055
+ asm("{.reg.b16 hl, hu; \n"\
2056
+ " .reg.b32 fl, fu; \n"\
2057
+ " mov.b32 {hl, hu}, %1; \n"\
2058
+ " cvt.f32.f16 fl, hl; \n"\
2059
+ " cvt.f32.f16 fu, hu; \n"\
2060
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\
2061
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\
2062
+ " cvt.rn.f16.f32 hl, fl; \n"\
2063
+ " cvt.rn.f16.f32 hu, fu; \n"\
2064
+ " mov.b32 %0, {hl, hu}; \n"\
2065
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \
2066
+ return val;\
2067
+ } /* while(0) */
2068
+ static __device__ __forceinline__ float __float_simpl_sinf(float a);
2069
+ static __device__ __forceinline__ float __float_simpl_cosf(float a);
2070
+ __CUDA_FP16_DECL__ __half hsin(const __half a) {
2071
+ const float sl = __float_simpl_sinf(__half2float(a));
2072
+ __half r = __float2half_rn(sl);
2073
+ asm("{\n\t"
2074
+ " .reg.b16 i,r,t; \n\t"
2075
+ " mov.b16 r, %0; \n\t"
2076
+ " mov.b16 i, %1; \n\t"
2077
+ " and.b16 t, r, 0x8000U; \n\t"
2078
+ " abs.f16 r, r; \n\t"
2079
+ " abs.f16 i, i; \n\t"
2080
+ __SPEC_CASE(i, r, 0X32B3U, 0x0800U)
2081
+ __SPEC_CASE(i, r, 0X5CB0U, 0x9000U)
2082
+ " or.b16 r,r,t; \n\t"
2083
+ " mov.b16 %0, r; \n"
2084
+ "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2085
+ return r;
2086
+ }
2087
+ __CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) {
2088
+ const float sl = __float_simpl_sinf(__half2float(a.x));
2089
+ const float sh = __float_simpl_sinf(__half2float(a.y));
2090
+ __half2 r = __floats2half2_rn(sl, sh);
2091
+ asm("{\n\t"
2092
+ " .reg.b32 i,r,t; \n\t"
2093
+ " mov.b32 r, %0; \n\t"
2094
+ " mov.b32 i, %1; \n\t"
2095
+ " and.b32 t, r, 0x80008000U; \n\t"
2096
+ " abs.f16x2 r, r; \n\t"
2097
+ " abs.f16x2 i, i; \n\t"
2098
+ __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U)
2099
+ __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U)
2100
+ " or.b32 r, r, t; \n\t"
2101
+ " mov.b32 %0, r; \n"
2102
+ "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2103
+ return r;
2104
+ }
2105
+ __CUDA_FP16_DECL__ __half hcos(const __half a) {
2106
+ const float cl = __float_simpl_cosf(__half2float(a));
2107
+ __half r = __float2half_rn(cl);
2108
+ asm("{\n\t"
2109
+ " .reg.b16 i,r; \n\t"
2110
+ " mov.b16 r, %0; \n\t"
2111
+ " mov.b16 i, %1; \n\t"
2112
+ " abs.f16 i, i; \n\t"
2113
+ __SPEC_CASE(i, r, 0X2B7CU, 0x1000U)
2114
+ " mov.b16 %0, r; \n"
2115
+ "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2116
+ return r;
2117
+ }
2118
+ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) {
2119
+ const float cl = __float_simpl_cosf(__half2float(a.x));
2120
+ const float ch = __float_simpl_cosf(__half2float(a.y));
2121
+ __half2 r = __floats2half2_rn(cl, ch);
2122
+ asm("{\n\t"
2123
+ " .reg.b32 i,r; \n\t"
2124
+ " mov.b32 r, %0; \n\t"
2125
+ " mov.b32 i, %1; \n\t"
2126
+ " abs.f16x2 i, i; \n\t"
2127
+ __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U)
2128
+ " mov.b32 %0, r; \n"
2129
+ "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2130
+ return r;
2131
+ }
2132
+ static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
2133
+ {
2134
+ const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F);
2135
+ const unsigned q = __float_as_uint(ar);
2136
+ const float j = __fsub_rn(ar, 12582912.0F);
2137
+ float t = __fmaf_rn(j, -1.5707962512969971e+000F, a);
2138
+ t = __fmaf_rn(j, -7.5497894158615964e-008F, t);
2139
+ *quadrant = q;
2140
+ return t;
2141
+ }
2142
+ static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
2143
+ {
2144
+ float z;
2145
+ const float x2 = x*x;
2146
+ float a8;
2147
+ float a6;
2148
+ float a4;
2149
+ float a2;
2150
+ float a1;
2151
+ float a0;
2152
+
2153
+ if ((i & 1U) != 0U) {
2154
+ // cos
2155
+ a8 = 2.44331571e-5F;
2156
+ a6 = -1.38873163e-3F;
2157
+ a4 = 4.16666457e-2F;
2158
+ a2 = -5.00000000e-1F;
2159
+ a1 = x2;
2160
+ a0 = 1.0F;
2161
+ }
2162
+ else {
2163
+ // sin
2164
+ a8 = -1.95152959e-4F;
2165
+ a6 = 8.33216087e-3F;
2166
+ a4 = -1.66666546e-1F;
2167
+ a2 = 0.0F;
2168
+ a1 = x;
2169
+ a0 = x;
2170
+ }
2171
+
2172
+ z = __fmaf_rn(a8, x2, a6);
2173
+ z = __fmaf_rn(z, x2, a4);
2174
+ z = __fmaf_rn(z, x2, a2);
2175
+ z = __fmaf_rn(z, a1, a0);
2176
+
2177
+ if ((i & 2U) != 0U) {
2178
+ z = -z;
2179
+ }
2180
+ return z;
2181
+ }
2182
+ static __device__ __forceinline__ float __float_simpl_sinf(float a)
2183
+ {
2184
+ float z;
2185
+ unsigned i;
2186
+ a = __internal_trig_reduction_kernel(a, &i);
2187
+ z = __internal_sin_cos_kernel(a, i);
2188
+ return z;
2189
+ }
2190
+ static __device__ __forceinline__ float __float_simpl_cosf(float a)
2191
+ {
2192
+ float z;
2193
+ unsigned i;
2194
+ a = __internal_trig_reduction_kernel(a, &i);
2195
+ z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U);
2196
+ return z;
2197
+ }
2198
+
2199
+ __CUDA_FP16_DECL__ __half hexp(const __half a) {
2200
+ __half val;
2201
+ asm("{.reg.b32 f, C, nZ; \n"
2202
+ " .reg.b16 h,r; \n"
2203
+ " mov.b16 h,%1; \n"
2204
+ " cvt.f32.f16 f,h; \n"
2205
+ " mov.b32 C, 0x3fb8aa3bU; \n"
2206
+ " mov.b32 nZ, 0x80000000U;\n"
2207
+ " fma.rn.f32 f,f,C,nZ; \n"
2208
+ " ex2.approx.ftz.f32 f,f; \n"
2209
+ " cvt.rn.f16.f32 r,f; \n"
2210
+ __SPEC_CASE(h, r, 0X1F79U, 0x9400U)
2211
+ __SPEC_CASE(h, r, 0X25CFU, 0x9400U)
2212
+ __SPEC_CASE(h, r, 0XC13BU, 0x0400U)
2213
+ __SPEC_CASE(h, r, 0XC1EFU, 0x0200U)
2214
+ " mov.b16 %0,r; \n"
2215
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2216
+ return val;
2217
+ }
2218
+ __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) {
2219
+ __half2 val;
2220
+ asm("{.reg.b16 hl, hu; \n"
2221
+ " .reg.b32 h,r,fl,fu,C,nZ; \n"
2222
+ " mov.b32 {hl, hu}, %1; \n"
2223
+ " mov.b32 h, %1; \n"
2224
+ " cvt.f32.f16 fl, hl; \n"
2225
+ " cvt.f32.f16 fu, hu; \n"
2226
+ " mov.b32 C, 0x3fb8aa3bU; \n"
2227
+ " mov.b32 nZ, 0x80000000U;\n"
2228
+ " fma.rn.f32 fl,fl,C,nZ; \n"
2229
+ " fma.rn.f32 fu,fu,C,nZ; \n"
2230
+ " ex2.approx.ftz.f32 fl, fl; \n"
2231
+ " ex2.approx.ftz.f32 fu, fu; \n"
2232
+ " cvt.rn.f16.f32 hl, fl; \n"
2233
+ " cvt.rn.f16.f32 hu, fu; \n"
2234
+ " mov.b32 r, {hl, hu}; \n"
2235
+ __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U)
2236
+ __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U)
2237
+ __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U)
2238
+ __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U)
2239
+ " mov.b32 %0, r; \n"
2240
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2241
+ return val;
2242
+ }
2243
+ __CUDA_FP16_DECL__ __half hexp2(const __half a) {
2244
+ __half val;
2245
+ asm("{.reg.b32 f, ULP; \n"
2246
+ " .reg.b16 r; \n"
2247
+ " mov.b16 r,%1; \n"
2248
+ " cvt.f32.f16 f,r; \n"
2249
+ " ex2.approx.ftz.f32 f,f; \n"
2250
+ " mov.b32 ULP, 0x33800000U;\n"
2251
+ " fma.rn.f32 f,f,ULP,f; \n"
2252
+ " cvt.rn.f16.f32 r,f; \n"
2253
+ " mov.b16 %0,r; \n"
2254
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2255
+ return val;
2256
+ }
2257
+ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
2258
+ __half2 val;
2259
+ asm("{.reg.b16 hl, hu; \n"
2260
+ " .reg.b32 fl, fu, ULP; \n"
2261
+ " mov.b32 {hl, hu}, %1; \n"
2262
+ " cvt.f32.f16 fl, hl; \n"
2263
+ " cvt.f32.f16 fu, hu; \n"
2264
+ " ex2.approx.ftz.f32 fl, fl; \n"
2265
+ " ex2.approx.ftz.f32 fu, fu; \n"
2266
+ " mov.b32 ULP, 0x33800000U;\n"
2267
+ " fma.rn.f32 fl,fl,ULP,fl; \n"
2268
+ " fma.rn.f32 fu,fu,ULP,fu; \n"
2269
+ " cvt.rn.f16.f32 hl, fl; \n"
2270
+ " cvt.rn.f16.f32 hu, fu; \n"
2271
+ " mov.b32 %0, {hl, hu}; \n"
2272
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2273
+ return val;
2274
+ }
2275
+ __CUDA_FP16_DECL__ __half hexp10(const __half a) {
2276
+ __half val;
2277
+ asm("{.reg.b16 h,r; \n"
2278
+ " .reg.b32 f, C, nZ; \n"
2279
+ " mov.b16 h, %1; \n"
2280
+ " cvt.f32.f16 f, h; \n"
2281
+ " mov.b32 C, 0x40549A78U; \n"
2282
+ " mov.b32 nZ, 0x80000000U;\n"
2283
+ " fma.rn.f32 f,f,C,nZ; \n"
2284
+ " ex2.approx.ftz.f32 f, f; \n"
2285
+ " cvt.rn.f16.f32 r, f; \n"
2286
+ __SPEC_CASE(h, r, 0x34DEU, 0x9800U)
2287
+ __SPEC_CASE(h, r, 0x9766U, 0x9000U)
2288
+ __SPEC_CASE(h, r, 0x9972U, 0x1000U)
2289
+ __SPEC_CASE(h, r, 0xA5C4U, 0x1000U)
2290
+ __SPEC_CASE(h, r, 0xBF0AU, 0x8100U)
2291
+ " mov.b16 %0, r; \n"
2292
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2293
+ return val;
2294
+ }
2295
+ __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) {
2296
+ __half2 val;
2297
+ asm("{.reg.b16 hl, hu; \n"
2298
+ " .reg.b32 h,r,fl,fu,C,nZ; \n"
2299
+ " mov.b32 {hl, hu}, %1; \n"
2300
+ " mov.b32 h, %1; \n"
2301
+ " cvt.f32.f16 fl, hl; \n"
2302
+ " cvt.f32.f16 fu, hu; \n"
2303
+ " mov.b32 C, 0x40549A78U; \n"
2304
+ " mov.b32 nZ, 0x80000000U;\n"
2305
+ " fma.rn.f32 fl,fl,C,nZ; \n"
2306
+ " fma.rn.f32 fu,fu,C,nZ; \n"
2307
+ " ex2.approx.ftz.f32 fl, fl; \n"
2308
+ " ex2.approx.ftz.f32 fu, fu; \n"
2309
+ " cvt.rn.f16.f32 hl, fl; \n"
2310
+ " cvt.rn.f16.f32 hu, fu; \n"
2311
+ " mov.b32 r, {hl, hu}; \n"
2312
+ __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U)
2313
+ __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U)
2314
+ __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U)
2315
+ __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U)
2316
+ __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U)
2317
+ " mov.b32 %0, r; \n"
2318
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2319
+ return val;
2320
+ }
2321
+ __CUDA_FP16_DECL__ __half hlog2(const __half a) {
2322
+ __half val;
2323
+ asm("{.reg.b16 h, r; \n"
2324
+ " .reg.b32 f; \n"
2325
+ " mov.b16 h, %1; \n"
2326
+ " cvt.f32.f16 f, h; \n"
2327
+ " lg2.approx.ftz.f32 f, f; \n"
2328
+ " cvt.rn.f16.f32 r, f; \n"
2329
+ __SPEC_CASE(r, r, 0xA2E2U, 0x8080U)
2330
+ __SPEC_CASE(r, r, 0xBF46U, 0x9400U)
2331
+ " mov.b16 %0, r; \n"
2332
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2333
+ return val;
2334
+ }
2335
+ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) {
2336
+ __half2 val;
2337
+ asm("{.reg.b16 hl, hu; \n"
2338
+ " .reg.b32 fl, fu, r, p; \n"
2339
+ " mov.b32 {hl, hu}, %1; \n"
2340
+ " cvt.f32.f16 fl, hl; \n"
2341
+ " cvt.f32.f16 fu, hu; \n"
2342
+ " lg2.approx.ftz.f32 fl, fl; \n"
2343
+ " lg2.approx.ftz.f32 fu, fu; \n"
2344
+ " cvt.rn.f16.f32 hl, fl; \n"
2345
+ " cvt.rn.f16.f32 hu, fu; \n"
2346
+ " mov.b32 r, {hl, hu}; \n"
2347
+ __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U)
2348
+ __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U)
2349
+ " mov.b32 %0, r; \n"
2350
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2351
+ return val;
2352
+ }
2353
+ __CUDA_FP16_DECL__ __half hlog(const __half a) {
2354
+ __half val;
2355
+ asm("{.reg.b32 f, C; \n"
2356
+ " .reg.b16 r,h; \n"
2357
+ " mov.b16 h,%1; \n"
2358
+ " cvt.f32.f16 f,h; \n"
2359
+ " lg2.approx.ftz.f32 f,f; \n"
2360
+ " mov.b32 C, 0x3f317218U; \n"
2361
+ " mul.f32 f,f,C; \n"
2362
+ " cvt.rn.f16.f32 r,f; \n"
2363
+ __SPEC_CASE(h, r, 0X160DU, 0x9C00U)
2364
+ __SPEC_CASE(h, r, 0X3BFEU, 0x8010U)
2365
+ __SPEC_CASE(h, r, 0X3C0BU, 0x8080U)
2366
+ __SPEC_CASE(h, r, 0X6051U, 0x1C00U)
2367
+ " mov.b16 %0,r; \n"
2368
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2369
+ return val;
2370
+ }
2371
+ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a) {
2372
+ __half2 val;
2373
+ asm("{.reg.b16 hl, hu; \n"
2374
+ " .reg.b32 r, fl, fu, C, h; \n"
2375
+ " mov.b32 {hl, hu}, %1; \n"
2376
+ " mov.b32 h, %1; \n"
2377
+ " cvt.f32.f16 fl, hl; \n"
2378
+ " cvt.f32.f16 fu, hu; \n"
2379
+ " lg2.approx.ftz.f32 fl, fl; \n"
2380
+ " lg2.approx.ftz.f32 fu, fu; \n"
2381
+ " mov.b32 C, 0x3f317218U; \n"
2382
+ " mul.f32 fl,fl,C; \n"
2383
+ " mul.f32 fu,fu,C; \n"
2384
+ " cvt.rn.f16.f32 hl, fl; \n"
2385
+ " cvt.rn.f16.f32 hu, fu; \n"
2386
+ " mov.b32 r, {hl, hu}; \n"
2387
+ __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U)
2388
+ __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U)
2389
+ __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U)
2390
+ __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U)
2391
+ " mov.b32 %0, r; \n"
2392
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2393
+ return val;
2394
+ }
2395
+ __CUDA_FP16_DECL__ __half hlog10(const __half a) {
2396
+ __half val;
2397
+ asm("{.reg.b16 h, r; \n"
2398
+ " .reg.b32 f, C; \n"
2399
+ " mov.b16 h, %1; \n"
2400
+ " cvt.f32.f16 f, h; \n"
2401
+ " lg2.approx.ftz.f32 f, f; \n"
2402
+ " mov.b32 C, 0x3E9A209BU; \n"
2403
+ " mul.f32 f,f,C; \n"
2404
+ " cvt.rn.f16.f32 r, f; \n"
2405
+ __SPEC_CASE(h, r, 0x338FU, 0x1000U)
2406
+ __SPEC_CASE(h, r, 0x33F8U, 0x9000U)
2407
+ __SPEC_CASE(h, r, 0x57E1U, 0x9800U)
2408
+ __SPEC_CASE(h, r, 0x719DU, 0x9C00U)
2409
+ " mov.b16 %0, r; \n"
2410
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2411
+ return val;
2412
+ }
2413
+ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) {
2414
+ __half2 val;
2415
+ asm("{.reg.b16 hl, hu; \n"
2416
+ " .reg.b32 r, fl, fu, C, h; \n"
2417
+ " mov.b32 {hl, hu}, %1; \n"
2418
+ " mov.b32 h, %1; \n"
2419
+ " cvt.f32.f16 fl, hl; \n"
2420
+ " cvt.f32.f16 fu, hu; \n"
2421
+ " lg2.approx.ftz.f32 fl, fl; \n"
2422
+ " lg2.approx.ftz.f32 fu, fu; \n"
2423
+ " mov.b32 C, 0x3E9A209BU; \n"
2424
+ " mul.f32 fl,fl,C; \n"
2425
+ " mul.f32 fu,fu,C; \n"
2426
+ " cvt.rn.f16.f32 hl, fl; \n"
2427
+ " cvt.rn.f16.f32 hu, fu; \n"
2428
+ " mov.b32 r, {hl, hu}; \n"
2429
+ __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U)
2430
+ __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U)
2431
+ __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U)
2432
+ __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U)
2433
+ " mov.b32 %0, r; \n"
2434
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2435
+ return val;
2436
+ }
2437
+ #undef __SPEC_CASE2
2438
+ #undef __SPEC_CASE
2439
+ __CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) {
2440
+ __APPROX_FCAST2(rcp)
2441
+ }
2442
+ __CUDA_FP16_DECL__ __half hrcp(const __half a) {
2443
+ __APPROX_FCAST(rcp)
2444
+ }
2445
+ __CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) {
2446
+ __APPROX_FCAST2(rsqrt)
2447
+ }
2448
+ __CUDA_FP16_DECL__ __half hrsqrt(const __half a) {
2449
+ __APPROX_FCAST(rsqrt)
2450
+ }
2451
+ __CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) {
2452
+ __APPROX_FCAST2(sqrt)
2453
+ }
2454
+ __CUDA_FP16_DECL__ __half hsqrt(const __half a) {
2455
+ __APPROX_FCAST(sqrt)
2456
+ }
2457
+ #undef __APPROX_FCAST
2458
+ #undef __APPROX_FCAST2
2459
+ __CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a)
2460
+ {
2461
+ __half2 r;
2462
+ asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}"
2463
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a)));
2464
+ return r;
2465
+ }
2466
+ __CUDA_FP16_DECL__ bool __hisnan(const __half a)
2467
+ {
2468
+ __half r;
2469
+ asm("{set.nan.f16.f16 %0,%1,%2;\n}"
2470
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a)));
2471
+ return __HALF_TO_CUS(r) != 0U;
2472
+ }
2473
+ __CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a)
2474
+ {
2475
+ __half2 r;
2476
+ asm("{neg.f16x2 %0,%1;\n}"
2477
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2478
+ return r;
2479
+ }
2480
+ __CUDA_FP16_DECL__ __half __hneg(const __half a)
2481
+ {
2482
+ __half r;
2483
+ asm("{neg.f16 %0,%1;\n}"
2484
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2485
+ return r;
2486
+ }
2487
+ __CUDA_FP16_DECL__ __half2 __habs2(const __half2 a)
2488
+ {
2489
+ __half2 r;
2490
+ asm("{abs.f16x2 %0,%1;\n}"
2491
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2492
+ return r;
2493
+ }
2494
+ __CUDA_FP16_DECL__ __half __habs(const __half a)
2495
+ {
2496
+ __half r;
2497
+ asm("{abs.f16 %0,%1;\n}"
2498
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2499
+ return r;
2500
+ }
2501
+
2502
+ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c)
2503
+ {
2504
+ // fast version of complex multiply-accumulate
2505
+ // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
2506
+ // acc.re = (c.re + a.re*b.re) - a.im*b.im
2507
+ // acc.im = (c.im + a.re*b.im) + a.im*b.re
2508
+ __half real_tmp = __hfma(a.x, b.x, c.x);
2509
+ __half img_tmp = __hfma(a.x, b.y, c.y);
2510
+ real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
2511
+ img_tmp = __hfma(a.y, b.x, img_tmp);
2512
+ return make_half2(real_tmp, img_tmp);
2513
+ }
2514
+
2515
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
2516
+
2517
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
2518
+ __CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
2519
+ {
2520
+ __BINARY_OP_HALF_MACRO(max.NaN)
2521
+ }
2522
+ __CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b)
2523
+ {
2524
+ __BINARY_OP_HALF_MACRO(min.NaN)
2525
+ }
2526
+ __CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
2527
+ {
2528
+ __TERNARY_OP_HALF_MACRO(fma.rn.relu)
2529
+ }
2530
+
2531
+ __CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
2532
+ {
2533
+ __BINARY_OP_HALF2_MACRO(max.NaN)
2534
+ }
2535
+ __CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b)
2536
+ {
2537
+ __BINARY_OP_HALF2_MACRO(min.NaN)
2538
+ }
2539
+ __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c)
2540
+ {
2541
+ __TERNARY_OP_HALF2_MACRO(fma.rn.relu)
2542
+ }
2543
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/
2544
+
2545
+ /* Define __PTR for atomicAdd prototypes below, undef after done */
2546
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
2547
+ #define __PTR "l"
2548
+ #else
2549
+ #define __PTR "r"
2550
+ #endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
2551
+
2552
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
2553
+
2554
+ __CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) {
2555
+ __half2 r;
2556
+ asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n"
2557
+ : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val))
2558
+ : "memory");
2559
+ return r;
2560
+ }
2561
+
2562
+ #endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/
2563
+
2564
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
2565
+
2566
+ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) {
2567
+ __half r;
2568
+ asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
2569
+ : "=h"(__HALF_TO_US(r))
2570
+ : __PTR(address), "h"(__HALF_TO_CUS(val))
2571
+ : "memory");
2572
+ return r;
2573
+ }
2574
+
2575
+ #endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/
2576
+
2577
+ #undef __PTR
2578
+
2579
+ #undef __CUDA_FP16_DECL__
2580
+ #endif /* defined(__CUDACC__) */
2581
+ #endif /* defined(__cplusplus) */
2582
+
2583
+ #undef __TERNARY_OP_HALF2_MACRO
2584
+ #undef __TERNARY_OP_HALF_MACRO
2585
+ #undef __BINARY_OP_HALF2_MACRO
2586
+ #undef __BINARY_OP_HALF_MACRO
2587
+
2588
+ #undef __CUDA_HOSTDEVICE_FP16_DECL__
2589
+ #undef __CUDA_FP16_DECL__
2590
+
2591
+ #undef __HALF_TO_US
2592
+ #undef __HALF_TO_CUS
2593
+ #undef __HALF2_TO_UI
2594
+ #undef __HALF2_TO_CUI
2595
+
2596
+ /* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
2597
+ /* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
2598
+ #if defined(__cplusplus) && !defined(CUDA_NO_HALF)
2599
+ typedef __half half;
2600
+ typedef __half2 half2;
2601
+ // for consistency with __nv_bfloat16
2602
+ typedef __half __nv_half;
2603
+ typedef __half2 __nv_half2;
2604
+ typedef __half_raw __nv_half_raw;
2605
+ typedef __half2_raw __nv_half2_raw;
2606
+ typedef __half nv_half;
2607
+ typedef __half2 nv_half2;
2608
+ #endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */
2609
+
2610
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
2611
+ #undef __CPP_VERSION_AT_LEAST_11_FP16
2612
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
2613
+
2614
+ #endif /* end of include guard: __CUDA_FP16_HPP__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef __CUDA_FP8_H__
51
+ #define __CUDA_FP8_H__
52
+
53
+ /* Set up function decorations */
54
+ #if defined(__CUDACC__)
55
+ #define __CUDA_FP8_DECL__ static __device__ __inline__
56
+ #define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
57
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
58
+ #else /* !defined(__CUDACC__) */
59
+ #if defined(__GNUC__)
60
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
61
+ #else
62
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static
63
+ #endif /* defined(__GNUC__) */
64
+ #define __CUDA_HOSTDEVICE_FP8__
65
+ #endif /* defined(__CUDACC_) */
66
+
67
+ #if !defined(_MSC_VER) && __cplusplus >= 201103L
68
+ #define __CPP_VERSION_AT_LEAST_11_FP8
69
+ #elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
70
+ #define __CPP_VERSION_AT_LEAST_11_FP8
71
+ #endif
72
+
73
+ /* bring in __half_raw data type */
74
+ #include "cuda_fp16.h"
75
+ /* bring in __nv_bfloat16_raw data type */
76
+ #include "cuda_bf16.h"
77
+ /* bring in float2, double4, etc vector types */
78
+ #include "vector_types.h"
79
+
80
+ /**
81
+ * \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
82
+ * This section describes fp8 intrinsic functions.
83
+ * To use these functions, include the header file \p cuda_fp8.h in your
84
+ * program.
85
+ */
86
+
87
+ /**
88
+ * \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
89
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
90
+ * To use these functions, include the header file \p cuda_fp8.h in your
91
+ * program.
92
+ */
93
+
94
+ /**
95
+ * \ingroup CUDA_MATH_FP8_MISC
96
+ * \brief 8-bit \p unsigned \p integer
97
+ * type abstraction used to for \p fp8 floating-point
98
+ * numbers storage.
99
+ */
100
+ typedef unsigned char __nv_fp8_storage_t;
101
+
102
+ /**
103
+ * \ingroup CUDA_MATH_FP8_MISC
104
+ * \brief 16-bit \p unsigned \p integer
105
+ * type abstraction used to for storage of pairs of
106
+ * \p fp8 floating-point numbers.
107
+ */
108
+ typedef unsigned short int __nv_fp8x2_storage_t;
109
+
110
+ /**
111
+ * \ingroup CUDA_MATH_FP8_MISC
112
+ * \brief 32-bit \p unsigned \p integer
113
+ * type abstraction used to for storage of tetrads of
114
+ * \p fp8 floating-point numbers.
115
+ */
116
+ typedef unsigned int __nv_fp8x4_storage_t;
117
+
118
+ /**
119
+ * \ingroup CUDA_MATH_FP8_MISC
120
+ * \brief Enumerates the modes applicable when
121
+ * performing a narrowing conversion to \p fp8 destination types.
122
+ */
123
+ typedef enum __nv_saturation_t {
124
+ /**
125
+ * Means no saturation to finite is performed when conversion
126
+ * results in rounding values outside the range of destination
127
+ * type.
128
+ * NOTE: for fp8 type of e4m3 kind, the results that are larger
129
+ * than the maximum representable finite number of the target
130
+ * format become NaN.
131
+ */
132
+ __NV_NOSAT,
133
+ /**
134
+ * Means input larger than the maximum representable
135
+ * finite number MAXNORM of the target format round to the
136
+ * MAXNORM of the same sign as input.
137
+ */
138
+ __NV_SATFINITE,
139
+ } __nv_saturation_t;
140
+
141
+ /**
142
+ * \ingroup CUDA_MATH_FP8_MISC
143
+ * \brief Enumerates the possible
144
+ * interpretations of the 8-bit values when referring to them as
145
+ * \p fp8 types.
146
+ */
147
+ typedef enum __nv_fp8_interpretation_t {
148
+ __NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
149
+ __NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
150
+ } __nv_fp8_interpretation_t;
151
+
152
+ /* Forward-declaration of C-style APIs */
153
+
154
+ /**
155
+ * \ingroup CUDA_MATH_FP8_MISC
156
+ * \brief Converts input \p double precision \p x to \p fp8 type of the
157
+ * requested kind using round-to-nearest-even rounding and requested saturation
158
+ * mode.
159
+ *
160
+ * \details Converts input \p x to \p fp8 type of the kind specified by
161
+ * \p fp8_interpretation parameter,
162
+ * using round-to-nearest-even rounding and
163
+ * saturation mode specified by \p saturate parameter.
164
+ *
165
+ * \returns
166
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
167
+ */
168
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
169
+ __nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
170
+ const __nv_fp8_interpretation_t fp8_interpretation);
171
+
172
+ /**
173
+ * \ingroup CUDA_MATH_FP8_MISC
174
+ * \brief Converts input vector of two \p double precision numbers packed
175
+ * in \p double2 \p x into a vector of two values of \p fp8 type of
176
+ * the requested kind using round-to-nearest-even rounding and requested
177
+ * saturation mode.
178
+ *
179
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
180
+ * kind specified by \p fp8_interpretation parameter, using
181
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
182
+ * parameter.
183
+ *
184
+ * \returns
185
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
186
+ */
187
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
188
+ __nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
189
+ const __nv_fp8_interpretation_t fp8_interpretation);
190
+
191
+ /**
192
+ * \ingroup CUDA_MATH_FP8_MISC
193
+ * \brief Converts input \p single precision \p x to \p fp8 type of the
194
+ * requested kind using round-to-nearest-even rounding and requested saturation
195
+ * mode.
196
+ *
197
+ * \details Converts input \p x to \p fp8 type of the kind specified by
198
+ * \p fp8_interpretation parameter,
199
+ * using round-to-nearest-even rounding and
200
+ * saturation mode specified by \p saturate parameter.
201
+ *
202
+ * \returns
203
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
204
+ */
205
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
206
+ __nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
207
+ const __nv_fp8_interpretation_t fp8_interpretation);
208
+
209
+ /**
210
+ * \ingroup CUDA_MATH_FP8_MISC
211
+ * \brief Converts input vector of two \p single precision numbers packed
212
+ * in \p float2 \p x into a vector of two values of \p fp8 type of
213
+ * the requested kind using round-to-nearest-even rounding and requested
214
+ * saturation mode.
215
+ *
216
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
217
+ * kind specified by \p fp8_interpretation parameter, using
218
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
219
+ * parameter.
220
+ *
221
+ * \returns
222
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
223
+ */
224
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
225
+ __nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
226
+ const __nv_fp8_interpretation_t fp8_interpretation);
227
+
228
+ /**
229
+ * \ingroup CUDA_MATH_FP8_MISC
230
+ * \brief Converts input \p half precision \p x to \p fp8 type of the requested
231
+ * kind using round-to-nearest-even rounding and requested saturation mode.
232
+ *
233
+ * \details Converts input \p x to \p fp8 type of the kind specified by
234
+ * \p fp8_interpretation parameter,
235
+ * using round-to-nearest-even rounding and
236
+ * saturation mode specified by \p saturate parameter.
237
+ *
238
+ * \returns
239
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
240
+ */
241
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
242
+ __nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
243
+ const __nv_fp8_interpretation_t fp8_interpretation);
244
+
245
+ /**
246
+ * \ingroup CUDA_MATH_FP8_MISC
247
+ * \brief Converts input vector of two \p half precision numbers packed
248
+ * in \p __half2_raw \p x into a vector of two values of \p fp8 type of
249
+ * the requested kind using round-to-nearest-even rounding and requested
250
+ * saturation mode.
251
+ *
252
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
253
+ * kind specified by \p fp8_interpretation parameter, using
254
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
255
+ * parameter.
256
+ *
257
+ * \returns
258
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
259
+ */
260
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
261
+ const __half2_raw x, const __nv_saturation_t saturate,
262
+ const __nv_fp8_interpretation_t fp8_interpretation);
263
+
264
+ /**
265
+ * \ingroup CUDA_MATH_FP8_MISC
266
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
267
+ * requested kind using round-to-nearest-even rounding and requested saturation
268
+ * mode.
269
+ *
270
+ * \details Converts input \p x to \p fp8 type of the kind specified by
271
+ * \p fp8_interpretation parameter,
272
+ * using round-to-nearest-even rounding and
273
+ * saturation mode specified by \p saturate parameter.
274
+ *
275
+ * \returns
276
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
277
+ */
278
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
279
+ const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
280
+ const __nv_fp8_interpretation_t fp8_interpretation);
281
+
282
+ /**
283
+ * \ingroup CUDA_MATH_FP8_MISC
284
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
285
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
286
+ * the requested kind using round-to-nearest-even rounding and requested
287
+ * saturation mode.
288
+ *
289
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
290
+ * kind specified by \p fp8_interpretation parameter, using
291
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
292
+ * parameter.
293
+ *
294
+ * \returns
295
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
296
+ */
297
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
298
+ __nv_cvt_bfloat16raw2_to_fp8x2(
299
+ const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
300
+ const __nv_fp8_interpretation_t fp8_interpretation);
301
+
302
+ /**
303
+ * \ingroup CUDA_MATH_FP8_MISC
304
+ * \brief Converts input \p fp8 \p x of the specified kind
305
+ * to \p half precision.
306
+ *
307
+ * \details Converts input \p x of \p fp8 type of the kind specified by
308
+ * \p fp8_interpretation parameter
309
+ * to \p half precision.
310
+ *
311
+ * \returns
312
+ * - The \p __half_raw value holds the result of conversion.
313
+ */
314
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
315
+ __nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
316
+ const __nv_fp8_interpretation_t fp8_interpretation);
317
+ /**
318
+ * \ingroup CUDA_MATH_FP8_MISC
319
+ * \brief Converts input vector of two \p fp8 values of the specified kind
320
+ * to a vector of two \p half precision values packed in \p __half2_raw
321
+ * structure.
322
+ *
323
+ * \details Converts input vector \p x of \p fp8 type of the kind specified by
324
+ * \p fp8_interpretation parameter
325
+ * to a vector of two \p half precision values and returns as \p __half2_raw
326
+ * structure.
327
+ *
328
+ * \returns
329
+ * - The \p __half2_raw value holds the result of conversion.
330
+ */
331
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
332
+ __nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
333
+ const __nv_fp8_interpretation_t fp8_interpretation);
334
+
335
+ #if defined(__cplusplus)
336
+
337
+ #define __CUDA_FP8_TYPES_EXIST__
338
+
339
+ /* Forward-declaration of structures defined in "cuda_fp8.hpp" */
340
+ struct __nv_fp8_e5m2;
341
+ struct __nv_fp8x2_e5m2;
342
+ struct __nv_fp8x4_e5m2;
343
+
344
+ struct __nv_fp8_e4m3;
345
+ struct __nv_fp8x2_e4m3;
346
+ struct __nv_fp8x4_e4m3;
347
+
348
+ #endif /* defined(__cplusplus) */
349
+
350
+ #include "cuda_fp8.hpp"
351
+
352
+ #undef __CUDA_FP8_DECL__
353
+ #undef __CUDA_HOSTDEVICE_FP8__
354
+ #undef __CUDA_HOSTDEVICE_FP8_DECL__
355
+
356
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
357
+ #undef __CPP_VERSION_AT_LEAST_11_FP8
358
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
359
+
360
+ #endif /* end of include guard: __CUDA_FP8_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_GL_INTEROP_H__)
51
+ #define __CUDA_GL_INTEROP_H__
52
+
53
+ #include "cuda_runtime_api.h"
54
+
55
+ #if defined(__APPLE__)
56
+
57
+ #include <OpenGL/gl.h>
58
+
59
+ #else /* __APPLE__ */
60
+
61
+ #if defined(__arm__) || defined(__aarch64__)
62
+ #ifndef GL_VERSION
63
+ #error Please include the appropriate gl headers before including cuda_gl_interop.h
64
+ #endif
65
+ #else
66
+ #include <GL/gl.h>
67
+ #endif
68
+
69
+ #endif /* __APPLE__ */
70
+
71
+ /** \cond impl_private */
72
+ #if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
73
+ #define __CUDA_DEPRECATED
74
+ #elif defined(_MSC_VER)
75
+ #define __CUDA_DEPRECATED __declspec(deprecated)
76
+ #elif defined(__GNUC__)
77
+ #define __CUDA_DEPRECATED __attribute__((deprecated))
78
+ #else
79
+ #define __CUDA_DEPRECATED
80
+ #endif
81
+ /** \endcond impl_private */
82
+
83
+ #if defined(__cplusplus)
84
+ extern "C" {
85
+ #endif /* __cplusplus */
86
+
87
+ /**
88
+ * \addtogroup CUDART_OPENGL OpenGL Interoperability
89
+ * This section describes the OpenGL interoperability functions of the CUDA
90
+ * runtime application programming interface. Note that mapping of OpenGL
91
+ * resources is performed with the graphics API agnostic, resource mapping
92
+ * interface described in \ref CUDART_INTEROP "Graphics Interopability".
93
+ *
94
+ * @{
95
+ */
96
+
97
+ /**
98
+ * CUDA devices corresponding to the current OpenGL context
99
+ */
100
+ enum cudaGLDeviceList
101
+ {
102
+ cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
103
+ cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
104
+ cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
105
+ };
106
+
107
+ /**
108
+ * \brief Gets the CUDA devices associated with the current OpenGL context
109
+ *
110
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
111
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
112
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
113
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
114
+ * context are not CUDA capable then the call will return ::cudaErrorNoDevice.
115
+ *
116
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
117
+ * current OpenGL context
118
+ * \param pCudaDevices - Returned CUDA devices corresponding to the current
119
+ * OpenGL context
120
+ * \param cudaDeviceCount - The size of the output device array \p pCudaDevices
121
+ * \param deviceList - The set of devices to return. This set may be
122
+ * ::cudaGLDeviceListAll for all devices,
123
+ * ::cudaGLDeviceListCurrentFrame for the devices used to
124
+ * render the current frame (in SLI), or
125
+ * ::cudaGLDeviceListNextFrame for the devices used to
126
+ * render the next frame (in SLI).
127
+ *
128
+ * \return
129
+ * ::cudaSuccess,
130
+ * ::cudaErrorNoDevice,
131
+ * ::cudaErrorInvalidGraphicsContext,
132
+ * ::cudaErrorUnknown
133
+ *
134
+ * \note This function is not supported on Mac OS X.
135
+ * \notefnerr
136
+ *
137
+ * \sa
138
+ * ::cudaGraphicsUnregisterResource,
139
+ * ::cudaGraphicsMapResources,
140
+ * ::cudaGraphicsSubResourceGetMappedArray,
141
+ * ::cudaGraphicsResourceGetMappedPointer,
142
+ * ::cuGLGetDevices
143
+ */
144
+ extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
145
+
146
+ /**
147
+ * \brief Register an OpenGL texture or renderbuffer object
148
+ *
149
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
150
+ * A handle to the registered object is returned as \p resource.
151
+ *
152
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
153
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
154
+ * or ::GL_RENDERBUFFER.
155
+ *
156
+ * The register flags \p flags specify the intended usage, as follows:
157
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
158
+ * resource will be used. It is therefore assumed that this resource will be
159
+ * read from and written to by CUDA. This is the default value.
160
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
161
+ * will not write to this resource.
162
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
163
+ * CUDA will not read from this resource and will write over the
164
+ * entire contents of the resource, so none of the data previously
165
+ * stored in the resource will be preserved.
166
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
167
+ * bind this resource to a surface reference.
168
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
169
+ * texture gather operations on this resource.
170
+ *
171
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
172
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
173
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
174
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
175
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
176
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
177
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
178
+ *
179
+ * The following image classes are currently disallowed:
180
+ * - Textures with borders
181
+ * - Multisampled renderbuffers
182
+ *
183
+ * \param resource - Pointer to the returned object handle
184
+ * \param image - name of texture or renderbuffer object to be registered
185
+ * \param target - Identifies the type of object specified by \p image
186
+ * \param flags - Register flags
187
+ *
188
+ * \return
189
+ * ::cudaSuccess,
190
+ * ::cudaErrorInvalidDevice,
191
+ * ::cudaErrorInvalidValue,
192
+ * ::cudaErrorInvalidResourceHandle,
193
+ * ::cudaErrorUnknown
194
+ * \notefnerr
195
+ *
196
+ * \sa
197
+ * ::cudaGraphicsUnregisterResource,
198
+ * ::cudaGraphicsMapResources,
199
+ * ::cudaGraphicsSubResourceGetMappedArray,
200
+ * ::cuGraphicsGLRegisterImage
201
+ */
202
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
203
+
204
+ /**
205
+ * \brief Registers an OpenGL buffer object
206
+ *
207
+ * Registers the buffer object specified by \p buffer for access by
208
+ * CUDA. A handle to the registered object is returned as \p
209
+ * resource. The register flags \p flags specify the intended usage,
210
+ * as follows:
211
+ *
212
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
213
+ * resource will be used. It is therefore assumed that this resource will be
214
+ * read from and written to by CUDA. This is the default value.
215
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
216
+ * will not write to this resource.
217
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
218
+ * CUDA will not read from this resource and will write over the
219
+ * entire contents of the resource, so none of the data previously
220
+ * stored in the resource will be preserved.
221
+ *
222
+ * \param resource - Pointer to the returned object handle
223
+ * \param buffer - name of buffer object to be registered
224
+ * \param flags - Register flags
225
+ *
226
+ * \return
227
+ * ::cudaSuccess,
228
+ * ::cudaErrorInvalidDevice,
229
+ * ::cudaErrorInvalidValue,
230
+ * ::cudaErrorInvalidResourceHandle,
231
+ * ::cudaErrorUnknown
232
+ * \notefnerr
233
+ *
234
+ * \sa
235
+ * ::cudaGraphicsUnregisterResource,
236
+ * ::cudaGraphicsMapResources,
237
+ * ::cudaGraphicsResourceGetMappedPointer,
238
+ * ::cuGraphicsGLRegisterBuffer
239
+ */
240
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
241
+
242
+ #ifdef _WIN32
243
+ #ifndef WGL_NV_gpu_affinity
244
+ typedef void* HGPUNV;
245
+ #endif
246
+
247
+ /**
248
+ * \brief Gets the CUDA device associated with hGpu
249
+ *
250
+ * Returns the CUDA device associated with a hGpu, if applicable.
251
+ *
252
+ * \param device - Returns the device associated with hGpu, or -1 if hGpu is
253
+ * not a compute device.
254
+ * \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
255
+ *
256
+ * \return
257
+ * ::cudaSuccess
258
+ * \notefnerr
259
+ *
260
+ * \sa
261
+ * ::WGL_NV_gpu_affinity,
262
+ * ::cuWGLGetDevice
263
+ */
264
+ extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
265
+ #endif
266
+
267
+ /** @} */ /* END CUDART_OPENGL */
268
+
269
+ /**
270
+ * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
271
+ * This section describes deprecated OpenGL interoperability functionality.
272
+ *
273
+ * @{
274
+ */
275
+
276
+ /**
277
+ * CUDA GL Map Flags
278
+ */
279
+ enum cudaGLMapFlags
280
+ {
281
+ cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
282
+ cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
283
+ cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
284
+ };
285
+
286
+ /**
287
+ * \brief Sets a CUDA device to use OpenGL interoperability
288
+ *
289
+ * \deprecated This function is deprecated as of CUDA 5.0.
290
+ *
291
+ * This function is deprecated and should no longer be used. It is
292
+ * no longer necessary to associate a CUDA device with an OpenGL
293
+ * context in order to achieve maximum interoperability performance.
294
+ *
295
+ * \param device - Device to use for OpenGL interoperability
296
+ *
297
+ * \return
298
+ * ::cudaSuccess,
299
+ * ::cudaErrorInvalidDevice,
300
+ * ::cudaErrorSetOnActiveProcess
301
+ * \notefnerr
302
+ *
303
+ * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
304
+ */
305
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
306
+
307
+ /**
308
+ * \brief Registers a buffer object for access by CUDA
309
+ *
310
+ * \deprecated This function is deprecated as of CUDA 3.0.
311
+ *
312
+ * Registers the buffer object of ID \p bufObj for access by
313
+ * CUDA. This function must be called before CUDA can map the buffer
314
+ * object. The OpenGL context used to create the buffer, or another
315
+ * context from the same share group, must be bound to the current
316
+ * thread when this is called.
317
+ *
318
+ * \param bufObj - Buffer object ID to register
319
+ *
320
+ * \return
321
+ * ::cudaSuccess,
322
+ * ::cudaErrorInitializationError
323
+ * \notefnerr
324
+ *
325
+ * \sa ::cudaGraphicsGLRegisterBuffer
326
+ */
327
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
328
+
329
+ /**
330
+ * \brief Maps a buffer object for access by CUDA
331
+ *
332
+ * \deprecated This function is deprecated as of CUDA 3.0.
333
+ *
334
+ * Maps the buffer object of ID \p bufObj into the address space of
335
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
336
+ * mapping. The buffer must have previously been registered by
337
+ * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
338
+ * by CUDA, any OpenGL operation which references the buffer will
339
+ * result in undefined behavior. The OpenGL context used to create
340
+ * the buffer, or another context from the same share group, must be
341
+ * bound to the current thread when this is called.
342
+ *
343
+ * All streams in the current thread are synchronized with the current
344
+ * GL context.
345
+ *
346
+ * \param devPtr - Returned device pointer to CUDA object
347
+ * \param bufObj - Buffer object ID to map
348
+ *
349
+ * \return
350
+ * ::cudaSuccess,
351
+ * ::cudaErrorMapBufferObjectFailed
352
+ * \notefnerr
353
+ *
354
+ * \sa ::cudaGraphicsMapResources
355
+ */
356
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
357
+
358
+ /**
359
+ * \brief Unmaps a buffer object for access by CUDA
360
+ *
361
+ * \deprecated This function is deprecated as of CUDA 3.0.
362
+ *
363
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA. When
364
+ * a buffer is unmapped, the base address returned by
365
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
366
+ * the address result in undefined behavior. The OpenGL context used
367
+ * to create the buffer, or another context from the same share group,
368
+ * must be bound to the current thread when this is called.
369
+ *
370
+ * All streams in the current thread are synchronized with the current
371
+ * GL context.
372
+ *
373
+ * \param bufObj - Buffer object to unmap
374
+ *
375
+ * \return
376
+ * ::cudaSuccess,
377
+ * ::cudaErrorUnmapBufferObjectFailed
378
+ * \notefnerr
379
+ *
380
+ * \sa ::cudaGraphicsUnmapResources
381
+ */
382
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
383
+
384
+ /**
385
+ * \brief Unregisters a buffer object for access by CUDA
386
+ *
387
+ * \deprecated This function is deprecated as of CUDA 3.0.
388
+ *
389
+ * Unregisters the buffer object of ID \p bufObj for access by CUDA
390
+ * and releases any CUDA resources associated with the buffer. Once a
391
+ * buffer is unregistered, it may no longer be mapped by CUDA. The GL
392
+ * context used to create the buffer, or another context from the
393
+ * same share group, must be bound to the current thread when this is
394
+ * called.
395
+ *
396
+ * \param bufObj - Buffer object to unregister
397
+ *
398
+ * \return
399
+ * ::cudaSuccess
400
+ * \notefnerr
401
+ *
402
+ * \sa ::cudaGraphicsUnregisterResource
403
+ */
404
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
405
+
406
+ /**
407
+ * \brief Set usage flags for mapping an OpenGL buffer
408
+ *
409
+ * \deprecated This function is deprecated as of CUDA 3.0.
410
+ *
411
+ * Set flags for mapping the OpenGL buffer \p bufObj
412
+ *
413
+ * Changes to flags will take effect the next time \p bufObj is mapped.
414
+ * The \p flags argument may be any of the following:
415
+ *
416
+ * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
417
+ * be used. It is therefore assumed that this buffer will be read from and
418
+ * written to by CUDA kernels. This is the default value.
419
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
420
+ * buffer will not write to the buffer.
421
+ * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
422
+ * this buffer will not read from the buffer and will write over the
423
+ * entire contents of the buffer, so none of the data previously stored in
424
+ * the buffer will be preserved.
425
+ *
426
+ * If \p bufObj has not been registered for use with CUDA, then
427
+ * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
428
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
429
+ *
430
+ * \param bufObj - Registered buffer object to set flags for
431
+ * \param flags - Parameters for buffer mapping
432
+ *
433
+ * \return
434
+ * ::cudaSuccess,
435
+ * ::cudaErrorInvalidValue,
436
+ * ::cudaErrorInvalidResourceHandle,
437
+ * ::cudaErrorUnknown
438
+ * \notefnerr
439
+ *
440
+ * \sa ::cudaGraphicsResourceSetMapFlags
441
+ */
442
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
443
+
444
+ /**
445
+ * \brief Maps a buffer object for access by CUDA
446
+ *
447
+ * \deprecated This function is deprecated as of CUDA 3.0.
448
+ *
449
+ * Maps the buffer object of ID \p bufObj into the address space of
450
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
451
+ * mapping. The buffer must have previously been registered by
452
+ * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
453
+ * by CUDA, any OpenGL operation which references the buffer will
454
+ * result in undefined behavior. The OpenGL context used to create
455
+ * the buffer, or another context from the same share group, must be
456
+ * bound to the current thread when this is called.
457
+ *
458
+ * Stream /p stream is synchronized with the current GL context.
459
+ *
460
+ * \param devPtr - Returned device pointer to CUDA object
461
+ * \param bufObj - Buffer object ID to map
462
+ * \param stream - Stream to synchronize
463
+ *
464
+ * \return
465
+ * ::cudaSuccess,
466
+ * ::cudaErrorMapBufferObjectFailed
467
+ * \notefnerr
468
+ *
469
+ * \sa ::cudaGraphicsMapResources
470
+ */
471
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
472
+
473
+ /**
474
+ * \brief Unmaps a buffer object for access by CUDA
475
+ *
476
+ * \deprecated This function is deprecated as of CUDA 3.0.
477
+ *
478
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA. When
479
+ * a buffer is unmapped, the base address returned by
480
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
481
+ * the address result in undefined behavior. The OpenGL context used
482
+ * to create the buffer, or another context from the same share group,
483
+ * must be bound to the current thread when this is called.
484
+ *
485
+ * Stream /p stream is synchronized with the current GL context.
486
+ *
487
+ * \param bufObj - Buffer object to unmap
488
+ * \param stream - Stream to synchronize
489
+ *
490
+ * \return
491
+ * ::cudaSuccess,
492
+ * ::cudaErrorUnmapBufferObjectFailed
493
+ * \notefnerr
494
+ *
495
+ * \sa ::cudaGraphicsUnmapResources
496
+ */
497
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
498
+
499
+ /** @} */ /* END CUDART_OPENGL_DEPRECATED */
500
+
501
+ #if defined(__cplusplus)
502
+ }
503
+ #endif /* __cplusplus */
504
+
505
+ #undef __CUDA_DEPRECATED
506
+
507
+ #endif /* __CUDA_GL_INTEROP_H__ */
508
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_VDPAU_INTEROP_H__)
51
+ #define __CUDA_VDPAU_INTEROP_H__
52
+
53
+ #include "cuda_runtime_api.h"
54
+
55
+ #include <vdpau/vdpau.h>
56
+
57
+ #if defined(__cplusplus)
58
+ extern "C" {
59
+ #endif /* __cplusplus */
60
+
61
+ /**
62
+ * \addtogroup CUDART_VDPAU VDPAU Interoperability
63
+ * This section describes the VDPAU interoperability functions of the CUDA
64
+ * runtime application programming interface.
65
+ *
66
+ * @{
67
+ */
68
+
69
+ /**
70
+ * \brief Gets the CUDA device associated with a VdpDevice.
71
+ *
72
+ * Returns the CUDA device associated with a VdpDevice, if applicable.
73
+ *
74
+ * \param device - Returns the device associated with vdpDevice, or -1 if
75
+ * the device associated with vdpDevice is not a compute device.
76
+ * \param vdpDevice - A VdpDevice handle
77
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
78
+ *
79
+ * \return
80
+ * ::cudaSuccess
81
+ * \notefnerr
82
+ *
83
+ * \sa
84
+ * ::cudaVDPAUSetVDPAUDevice,
85
+ * ::cuVDPAUGetDevice
86
+ */
87
+ extern __host__ cudaError_t CUDARTAPI cudaVDPAUGetDevice(int *device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
88
+
89
+ /**
90
+ * \brief Sets a CUDA device to use VDPAU interoperability
91
+ *
92
+ * Records \p vdpDevice as the VdpDevice for VDPAU interoperability
93
+ * with the CUDA device \p device and sets \p device as the current
94
+ * device for the calling host thread.
95
+ *
96
+ * If \p device has already been initialized then this call will fail
97
+ * with the error ::cudaErrorSetOnActiveProcess. In this case it is
98
+ * necessary to reset \p device using ::cudaDeviceReset() before
99
+ * VDPAU interoperability on \p device may be enabled.
100
+ *
101
+ * \param device - Device to use for VDPAU interoperability
102
+ * \param vdpDevice - The VdpDevice to interoperate with
103
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
104
+ *
105
+ * \return
106
+ * ::cudaSuccess,
107
+ * ::cudaErrorInvalidDevice,
108
+ * ::cudaErrorSetOnActiveProcess
109
+ * \notefnerr
110
+ *
111
+ * \sa ::cudaGraphicsVDPAURegisterVideoSurface,
112
+ * ::cudaGraphicsVDPAURegisterOutputSurface,
113
+ * ::cudaDeviceReset
114
+ */
115
+ extern __host__ cudaError_t CUDARTAPI cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
116
+
117
+ /**
118
+ * \brief Register a VdpVideoSurface object
119
+ *
120
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by CUDA.
121
+ * A handle to the registered object is returned as \p resource.
122
+ * The surface's intended usage is specified using \p flags, as follows:
123
+ *
124
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
125
+ * resource will be used. It is therefore assumed that this resource will be
126
+ * read from and written to by CUDA. This is the default value.
127
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
128
+ * will not write to this resource.
129
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
130
+ * CUDA will not read from this resource and will write over the
131
+ * entire contents of the resource, so none of the data previously
132
+ * stored in the resource will be preserved.
133
+ *
134
+ * \param resource - Pointer to the returned object handle
135
+ * \param vdpSurface - VDPAU object to be registered
136
+ * \param flags - Map flags
137
+ *
138
+ * \return
139
+ * ::cudaSuccess,
140
+ * ::cudaErrorInvalidDevice,
141
+ * ::cudaErrorInvalidValue,
142
+ * ::cudaErrorInvalidResourceHandle,
143
+ * ::cudaErrorUnknown
144
+ * \notefnerr
145
+ *
146
+ * \sa
147
+ * ::cudaVDPAUSetVDPAUDevice,
148
+ * ::cudaGraphicsUnregisterResource,
149
+ * ::cudaGraphicsSubResourceGetMappedArray,
150
+ * ::cuGraphicsVDPAURegisterVideoSurface
151
+ */
152
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterVideoSurface(struct cudaGraphicsResource **resource, VdpVideoSurface vdpSurface, unsigned int flags);
153
+
154
+ /**
155
+ * \brief Register a VdpOutputSurface object
156
+ *
157
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by CUDA.
158
+ * A handle to the registered object is returned as \p resource.
159
+ * The surface's intended usage is specified using \p flags, as follows:
160
+ *
161
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
162
+ * resource will be used. It is therefore assumed that this resource will be
163
+ * read from and written to by CUDA. This is the default value.
164
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
165
+ * will not write to this resource.
166
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
167
+ * CUDA will not read from this resource and will write over the
168
+ * entire contents of the resource, so none of the data previously
169
+ * stored in the resource will be preserved.
170
+ *
171
+ * \param resource - Pointer to the returned object handle
172
+ * \param vdpSurface - VDPAU object to be registered
173
+ * \param flags - Map flags
174
+ *
175
+ * \return
176
+ * ::cudaSuccess,
177
+ * ::cudaErrorInvalidDevice,
178
+ * ::cudaErrorInvalidValue,
179
+ * ::cudaErrorInvalidResourceHandle,
180
+ * ::cudaErrorUnknown
181
+ * \notefnerr
182
+ *
183
+ * \sa
184
+ * ::cudaVDPAUSetVDPAUDevice,
185
+ * ::cudaGraphicsUnregisterResource,
186
+ * ::cudaGraphicsSubResourceGetMappedArray,
187
+ * ::cuGraphicsVDPAURegisterOutputSurface
188
+ */
189
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterOutputSurface(struct cudaGraphicsResource **resource, VdpOutputSurface vdpSurface, unsigned int flags);
190
+
191
+ /** @} */ /* END CUDART_VDPAU */
192
+
193
+ #if defined(__cplusplus)
194
+ }
195
+ #endif /* __cplusplus */
196
+
197
+ #endif /* __CUDA_VDPAU_INTEROP_H__ */
198
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/device_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_constants.h ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__MATH_CONSTANTS_H__)
51
+ #define __MATH_CONSTANTS_H__
52
+
53
+ /* single precision constants */
54
+ #define CUDART_INF_F __int_as_float(0x7f800000U)
55
+ #define CUDART_NAN_F __int_as_float(0x7fffffffU)
56
+ #define CUDART_MIN_DENORM_F __int_as_float(0x00000001U)
57
+ #define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
58
+ #define CUDART_NEG_ZERO_F __int_as_float(0x80000000U)
59
+ #define CUDART_ZERO_F 0.0F
60
+ #define CUDART_ONE_F 1.0F
61
+ #define CUDART_SQRT_HALF_F 0.707106781F
62
+ #define CUDART_SQRT_HALF_HI_F 0.707106781F
63
+ #define CUDART_SQRT_HALF_LO_F 1.210161749e-08F
64
+ #define CUDART_SQRT_TWO_F 1.414213562F
65
+ #define CUDART_THIRD_F 0.333333333F
66
+ #define CUDART_PIO4_F 0.785398163F
67
+ #define CUDART_PIO2_F 1.570796327F
68
+ #define CUDART_3PIO4_F 2.356194490F
69
+ #define CUDART_2_OVER_PI_F 0.636619772F
70
+ #define CUDART_SQRT_2_OVER_PI_F 0.797884561F
71
+ #define CUDART_PI_F 3.141592654F
72
+ #define CUDART_L2E_F 1.442695041F
73
+ #define CUDART_L2T_F 3.321928094F
74
+ #define CUDART_LG2_F 0.301029996F
75
+ #define CUDART_LGE_F 0.434294482F
76
+ #define CUDART_LN2_F 0.693147181F
77
+ #define CUDART_LNT_F 2.302585093F
78
+ #define CUDART_LNPI_F 1.144729886F
79
+ #define CUDART_TWO_TO_M126_F 1.175494351e-38F
80
+ #define CUDART_TWO_TO_126_F 8.507059173e37F
81
+ #define CUDART_NORM_HUGE_F 3.402823466e38F
82
+ #define CUDART_TWO_TO_23_F 8388608.0F
83
+ #define CUDART_TWO_TO_24_F 16777216.0F
84
+ #define CUDART_TWO_TO_31_F 2147483648.0F
85
+ #define CUDART_TWO_TO_32_F 4294967296.0F
86
+ #define CUDART_REMQUO_BITS_F 3U
87
+ #define CUDART_REMQUO_MASK_F (~((~0U)<<CUDART_REMQUO_BITS_F))
88
+ #define CUDART_TRIG_PLOSS_F 105615.0F
89
+
90
+ /* double precision constants */
91
+ #define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL)
92
+ #define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL)
93
+ #define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
94
+ #define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
95
+ #define CUDART_ZERO 0.0
96
+ #define CUDART_ONE 1.0
97
+ #define CUDART_SQRT_TWO 1.4142135623730951e+0
98
+ #define CUDART_SQRT_HALF 7.0710678118654757e-1
99
+ #define CUDART_SQRT_HALF_HI 7.0710678118654757e-1
100
+ #define CUDART_SQRT_HALF_LO (-4.8336466567264567e-17)
101
+ #define CUDART_THIRD 3.3333333333333333e-1
102
+ #define CUDART_TWOTHIRD 6.6666666666666667e-1
103
+ #define CUDART_PIO4 7.8539816339744828e-1
104
+ #define CUDART_PIO4_HI 7.8539816339744828e-1
105
+ #define CUDART_PIO4_LO 3.0616169978683830e-17
106
+ #define CUDART_PIO2 1.5707963267948966e+0
107
+ #define CUDART_PIO2_HI 1.5707963267948966e+0
108
+ #define CUDART_PIO2_LO 6.1232339957367660e-17
109
+ #define CUDART_3PIO4 2.3561944901923448e+0
110
+ #define CUDART_2_OVER_PI 6.3661977236758138e-1
111
+ #define CUDART_PI 3.1415926535897931e+0
112
+ #define CUDART_PI_HI 3.1415926535897931e+0
113
+ #define CUDART_PI_LO 1.2246467991473532e-16
114
+ #define CUDART_SQRT_2PI 2.5066282746310007e+0
115
+ #define CUDART_SQRT_2PI_HI 2.5066282746310007e+0
116
+ #define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)
117
+ #define CUDART_SQRT_PIO2 1.2533141373155003e+0
118
+ #define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0
119
+ #define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17)
120
+ #define CUDART_SQRT_2OPI 7.9788456080286536e-1
121
+ #define CUDART_L2E 1.4426950408889634e+0
122
+ #define CUDART_L2E_HI 1.4426950408889634e+0
123
+ #define CUDART_L2E_LO 2.0355273740931033e-17
124
+ #define CUDART_L2T 3.3219280948873622e+0
125
+ #define CUDART_LG2 3.0102999566398120e-1
126
+ #define CUDART_LG2_HI 3.0102999566398120e-1
127
+ #define CUDART_LG2_LO (-2.8037281277851704e-18)
128
+ #define CUDART_LGE 4.3429448190325182e-1
129
+ #define CUDART_LGE_HI 4.3429448190325182e-1
130
+ #define CUDART_LGE_LO 1.09831965021676510e-17
131
+ #define CUDART_LN2 6.9314718055994529e-1
132
+ #define CUDART_LN2_HI 6.9314718055994529e-1
133
+ #define CUDART_LN2_LO 2.3190468138462996e-17
134
+ #define CUDART_LNT 2.3025850929940459e+0
135
+ #define CUDART_LNT_HI 2.3025850929940459e+0
136
+ #define CUDART_LNT_LO (-2.1707562233822494e-16)
137
+ #define CUDART_LNPI 1.1447298858494002e+0
138
+ #define CUDART_LN2_X_1024 7.0978271289338397e+2
139
+ #define CUDART_LN2_X_1025 7.1047586007394398e+2
140
+ #define CUDART_LN2_X_1075 7.4513321910194122e+2
141
+ #define CUDART_LG2_X_1024 3.0825471555991675e+2
142
+ #define CUDART_LG2_X_1075 3.2360724533877976e+2
143
+ #define CUDART_TWO_TO_23 8388608.0
144
+ #define CUDART_TWO_TO_52 4503599627370496.0
145
+ #define CUDART_TWO_TO_53 9007199254740992.0
146
+ #define CUDART_TWO_TO_54 18014398509481984.0
147
+ #define CUDART_TWO_TO_M54 5.5511151231257827e-17
148
+ #define CUDART_TWO_TO_M1022 2.22507385850720140e-308
149
+ #define CUDART_TRIG_PLOSS 2147483648.0
150
+ #define CUDART_DBL2INT_CVT 6755399441055744.0
151
+
152
+ #endif /* !__MATH_CONSTANTS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.h ADDED
@@ -0,0 +1,1551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_INTRINSICS_H__)
51
+ #define __SM_20_INTRINSICS_H__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_20_INTRINSICS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ #ifndef __CUDA_ARCH__
70
+ #define __DEF_IF_HOST { }
71
+ #else /* !__CUDA_ARCH__ */
72
+ #define __DEF_IF_HOST ;
73
+ #endif /* __CUDA_ARCH__ */
74
+
75
+ #if defined(_WIN32)
76
+ # define __DEPRECATED__(msg) __declspec(deprecated(msg))
77
+ #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
78
+ # define __DEPRECATED__(msg) __attribute__((deprecated))
79
+ #else
80
+ # define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
81
+ #endif
82
+
83
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
84
+ #define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
85
+ "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
86
+ #else
87
+ #define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
88
+ #endif
89
+
90
+ extern "C"
91
+ {
92
+ extern __device__ __device_builtin__ void __threadfence_system(void);
93
+ /**
94
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
95
+ * \brief Divide two floating-point values in round-to-nearest-even mode.
96
+ *
97
+ * Divides two floating-point values \p x by \p y in round-to-nearest-even mode.
98
+ *
99
+ * \return Returns \p x / \p y.
100
+ *
101
+ * \note_accuracy_double
102
+ * \note_requires_fermi
103
+ */
104
+ extern __device__ __device_builtin__ double __ddiv_rn(double x, double y);
105
+ /**
106
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
107
+ * \brief Divide two floating-point values in round-towards-zero mode.
108
+ *
109
+ * Divides two floating-point values \p x by \p y in round-towards-zero mode.
110
+ *
111
+ * \return Returns \p x / \p y.
112
+ *
113
+ * \note_accuracy_double
114
+ * \note_requires_fermi
115
+ */
116
+ extern __device__ __device_builtin__ double __ddiv_rz(double x, double y);
117
+ /**
118
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
119
+ * \brief Divide two floating-point values in round-up mode.
120
+ *
121
+ * Divides two floating-point values \p x by \p y in round-up (to positive infinity) mode.
122
+ *
123
+ * \return Returns \p x / \p y.
124
+ *
125
+ * \note_accuracy_double
126
+ * \note_requires_fermi
127
+ */
128
+ extern __device__ __device_builtin__ double __ddiv_ru(double x, double y);
129
+ /**
130
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
131
+ * \brief Divide two floating-point values in round-down mode.
132
+ *
133
+ * Divides two floating-point values \p x by \p y in round-down (to negative infinity) mode.
134
+ *
135
+ * \return Returns \p x / \p y.
136
+ *
137
+ * \note_accuracy_double
138
+ * \note_requires_fermi
139
+ */
140
+ extern __device__ __device_builtin__ double __ddiv_rd(double x, double y);
141
+ /**
142
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
143
+ * \brief Compute
144
+ * \latexonly $\frac{1}{x}$ \endlatexonly
145
+ * \xmlonly
146
+ * <d4p_MathML outputclass="xmlonly">
147
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
148
+ * <m:mfrac>
149
+ * <m:mn>1</m:mn>
150
+ * <m:mi>x</m:mi>
151
+ * </m:mfrac>
152
+ * </m:math>
153
+ * </d4p_MathML>
154
+ * \endxmlonly
155
+ * in round-to-nearest-even mode.
156
+ *
157
+ * Compute the reciprocal of \p x in round-to-nearest-even mode.
158
+ *
159
+ * \return Returns
160
+ * \latexonly $\frac{1}{x}$ \endlatexonly
161
+ * \xmlonly
162
+ * <d4p_MathML outputclass="xmlonly">
163
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
164
+ * <m:mfrac>
165
+ * <m:mn>1</m:mn>
166
+ * <m:mi>x</m:mi>
167
+ * </m:mfrac>
168
+ * </m:math>
169
+ * </d4p_MathML>\endxmlonly.
170
+ *
171
+ * \note_accuracy_double
172
+ * \note_requires_fermi
173
+ */
174
+ extern __device__ __device_builtin__ double __drcp_rn(double x);
175
+ /**
176
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
177
+ * \brief Compute
178
+ * \latexonly $\frac{1}{x}$ \endlatexonly
179
+ * \xmlonly
180
+ * <d4p_MathML outputclass="xmlonly">
181
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
182
+ * <m:mfrac>
183
+ * <m:mn>1</m:mn>
184
+ * <m:mi>x</m:mi>
185
+ * </m:mfrac>
186
+ * </m:math>
187
+ * </d4p_MathML>
188
+ * \endxmlonly
189
+ * in round-towards-zero mode.
190
+ *
191
+ * Compute the reciprocal of \p x in round-towards-zero mode.
192
+ *
193
+ * \return Returns
194
+ * \latexonly $\frac{1}{x}$ \endlatexonly
195
+ * \xmlonly
196
+ * <d4p_MathML outputclass="xmlonly">
197
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
198
+ * <m:mfrac>
199
+ * <m:mn>1</m:mn>
200
+ * <m:mi>x</m:mi>
201
+ * </m:mfrac>
202
+ * </m:math>
203
+ * </d4p_MathML>\endxmlonly.
204
+ *
205
+ * \note_accuracy_double
206
+ * \note_requires_fermi
207
+ */
208
+ extern __device__ __device_builtin__ double __drcp_rz(double x);
209
+ /**
210
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
211
+ * \brief Compute
212
+ * \latexonly $\frac{1}{x}$ \endlatexonly
213
+ * \xmlonly
214
+ * <d4p_MathML outputclass="xmlonly">
215
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
216
+ * <m:mfrac>
217
+ * <m:mn>1</m:mn>
218
+ * <m:mi>x</m:mi>
219
+ * </m:mfrac>
220
+ * </m:math>
221
+ * </d4p_MathML>
222
+ * \endxmlonly
223
+ * in round-up mode.
224
+ *
225
+ * Compute the reciprocal of \p x in round-up (to positive infinity) mode.
226
+ *
227
+ * \return Returns
228
+ * \latexonly $\frac{1}{x}$ \endlatexonly
229
+ * \xmlonly
230
+ * <d4p_MathML outputclass="xmlonly">
231
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
232
+ * <m:mfrac>
233
+ * <m:mn>1</m:mn>
234
+ * <m:mi>x</m:mi>
235
+ * </m:mfrac>
236
+ * </m:math>
237
+ * </d4p_MathML>\endxmlonly.
238
+ *
239
+ * \note_accuracy_double
240
+ * \note_requires_fermi
241
+ */
242
+ extern __device__ __device_builtin__ double __drcp_ru(double x);
243
+ /**
244
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
245
+ * \brief Compute
246
+ * \latexonly $\frac{1}{x}$ \endlatexonly
247
+ * \xmlonly
248
+ * <d4p_MathML outputclass="xmlonly">
249
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
250
+ * <m:mfrac>
251
+ * <m:mn>1</m:mn>
252
+ * <m:mi>x</m:mi>
253
+ * </m:mfrac>
254
+ * </m:math>
255
+ * </d4p_MathML>
256
+ * \endxmlonly
257
+ * in round-down mode.
258
+ *
259
+ * Compute the reciprocal of \p x in round-down (to negative infinity) mode.
260
+ *
261
+ * \return Returns
262
+ * \latexonly $\frac{1}{x}$ \endlatexonly
263
+ * \xmlonly
264
+ * <d4p_MathML outputclass="xmlonly">
265
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
266
+ * <m:mfrac>
267
+ * <m:mn>1</m:mn>
268
+ * <m:mi>x</m:mi>
269
+ * </m:mfrac>
270
+ * </m:math>
271
+ * </d4p_MathML>\endxmlonly.
272
+ *
273
+ * \note_accuracy_double
274
+ * \note_requires_fermi
275
+ */
276
+ extern __device__ __device_builtin__ double __drcp_rd(double x);
277
+ /**
278
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
279
+ * \brief Compute
280
+ * \latexonly $\sqrt{x}$ \endlatexonly
281
+ * \xmlonly
282
+ * <d4p_MathML outputclass="xmlonly">
283
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
284
+ * <m:msqrt>
285
+ * <m:mi>x</m:mi>
286
+ * </m:msqrt>
287
+ * </m:math>
288
+ * </d4p_MathML>
289
+ * \endxmlonly
290
+ * in round-to-nearest-even mode.
291
+ *
292
+ * Compute the square root of \p x in round-to-nearest-even mode.
293
+ *
294
+ * \return Returns
295
+ * \latexonly $\sqrt{x}$ \endlatexonly
296
+ * \xmlonly
297
+ * <d4p_MathML outputclass="xmlonly">
298
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
299
+ * <m:msqrt>
300
+ * <m:mi>x</m:mi>
301
+ * </m:msqrt>
302
+ * </m:math>
303
+ * </d4p_MathML>\endxmlonly.
304
+ *
305
+ * \note_accuracy_double
306
+ * \note_requires_fermi
307
+ */
308
+ extern __device__ __device_builtin__ double __dsqrt_rn(double x);
309
+ /**
310
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
311
+ * \brief Compute
312
+ * \latexonly $\sqrt{x}$ \endlatexonly
313
+ * \xmlonly
314
+ * <d4p_MathML outputclass="xmlonly">
315
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
316
+ * <m:msqrt>
317
+ * <m:mi>x</m:mi>
318
+ * </m:msqrt>
319
+ * </m:math>
320
+ * </d4p_MathML>
321
+ * \endxmlonly
322
+ * in round-towards-zero mode.
323
+ *
324
+ * Compute the square root of \p x in round-towards-zero mode.
325
+ *
326
+ * \return Returns
327
+ * \latexonly $\sqrt{x}$ \endlatexonly
328
+ * \xmlonly
329
+ * <d4p_MathML outputclass="xmlonly">
330
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
331
+ * <m:msqrt>
332
+ * <m:mi>x</m:mi>
333
+ * </m:msqrt>
334
+ * </m:math>
335
+ * </d4p_MathML>\endxmlonly.
336
+ *
337
+ * \note_accuracy_double
338
+ * \note_requires_fermi
339
+ */
340
+ extern __device__ __device_builtin__ double __dsqrt_rz(double x);
341
+ /**
342
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
343
+ * \brief Compute
344
+ * \latexonly $\sqrt{x}$ \endlatexonly
345
+ * \xmlonly
346
+ * <d4p_MathML outputclass="xmlonly">
347
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
348
+ * <m:msqrt>
349
+ * <m:mi>x</m:mi>
350
+ * </m:msqrt>
351
+ * </m:math>
352
+ * </d4p_MathML>
353
+ * \endxmlonly
354
+ * in round-up mode.
355
+ *
356
+ * Compute the square root of \p x in round-up (to positive infinity) mode.
357
+ *
358
+ * \return Returns
359
+ * \latexonly $\sqrt{x}$ \endlatexonly
360
+ * \xmlonly
361
+ * <d4p_MathML outputclass="xmlonly">
362
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
363
+ * <m:msqrt>
364
+ * <m:mi>x</m:mi>
365
+ * </m:msqrt>
366
+ * </m:math>
367
+ * </d4p_MathML>\endxmlonly.
368
+ *
369
+ * \note_accuracy_double
370
+ * \note_requires_fermi
371
+ */
372
+ extern __device__ __device_builtin__ double __dsqrt_ru(double x);
373
+ /**
374
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
375
+ * \brief Compute
376
+ * \latexonly $\sqrt{x}$ \endlatexonly
377
+ * \xmlonly
378
+ * <d4p_MathML outputclass="xmlonly">
379
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
380
+ * <m:msqrt>
381
+ * <m:mi>x</m:mi>
382
+ * </m:msqrt>
383
+ * </m:math>
384
+ * </d4p_MathML>
385
+ * \endxmlonly
386
+ * in round-down mode.
387
+ *
388
+ * Compute the square root of \p x in round-down (to negative infinity) mode.
389
+ *
390
+ * \return Returns
391
+ * \latexonly $\sqrt{x}$ \endlatexonly
392
+ * \xmlonly
393
+ * <d4p_MathML outputclass="xmlonly">
394
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
395
+ * <m:msqrt>
396
+ * <m:mi>x</m:mi>
397
+ * </m:msqrt>
398
+ * </m:math>
399
+ * </d4p_MathML>\endxmlonly.
400
+ *
401
+ * \note_accuracy_double
402
+ * \note_requires_fermi
403
+ */
404
+ extern __device__ __device_builtin__ double __dsqrt_rd(double x);
405
+ extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int __ballot(int);
406
+ extern __device__ __device_builtin__ int __syncthreads_count(int);
407
+ extern __device__ __device_builtin__ int __syncthreads_and(int);
408
+ extern __device__ __device_builtin__ int __syncthreads_or(int);
409
+ extern __device__ __device_builtin__ long long int clock64(void);
410
+
411
+
412
+ /**
413
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
414
+ * \brief Compute fused multiply-add operation in round-to-nearest-even mode, ignore \p -ftz=true compiler flag
415
+ *
416
+ * Behavior is the same as ::__fmaf_rn(\p x, \p y, \p z), the difference is in
417
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
418
+ */
419
+ extern __device__ __device_builtin__ float __fmaf_ieee_rn(float x, float y, float z);
420
+
421
+ /**
422
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
423
+ * \brief Compute fused multiply-add operation in round-down mode, ignore \p -ftz=true compiler flag
424
+ *
425
+ * Behavior is the same as ::__fmaf_rd(\p x, \p y, \p z), the difference is in
426
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
427
+ */
428
+ extern __device__ __device_builtin__ float __fmaf_ieee_rd(float x, float y, float z);
429
+
430
+ /**
431
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
432
+ * \brief Compute fused multiply-add operation in round-up mode, ignore \p -ftz=true compiler flag
433
+ *
434
+ * Behavior is the same as ::__fmaf_ru(\p x, \p y, \p z), the difference is in
435
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
436
+ */
437
+ extern __device__ __device_builtin__ float __fmaf_ieee_ru(float x, float y, float z);
438
+
439
+ /**
440
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
441
+ * \brief Compute fused multiply-add operation in round-towards-zero mode, ignore \p -ftz=true compiler flag
442
+ *
443
+ * Behavior is the same as ::__fmaf_rz(\p x, \p y, \p z), the difference is in
444
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
445
+ */
446
+ extern __device__ __device_builtin__ float __fmaf_ieee_rz(float x, float y, float z);
447
+
448
+
449
+ // SM_13 intrinsics
450
+
451
+ /**
452
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
453
+ * \brief Reinterpret bits in a double as a 64-bit signed integer.
454
+ *
455
+ * Reinterpret the bits in the double-precision floating-point value \p x
456
+ * as a signed 64-bit integer.
457
+ * \return Returns reinterpreted value.
458
+ */
459
+ extern __device__ __device_builtin__ long long int __double_as_longlong(double x);
460
+ /**
461
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
462
+ * \brief Reinterpret bits in a 64-bit signed integer as a double.
463
+ *
464
+ * Reinterpret the bits in the 64-bit signed integer value \p x as
465
+ * a double-precision floating-point value.
466
+ * \return Returns reinterpreted value.
467
+ */
468
+ extern __device__ __device_builtin__ double __longlong_as_double(long long int x);
469
+ /**
470
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
471
+ * \brief Compute
472
+ * \latexonly $x \times y + z$ \endlatexonly
473
+ * \xmlonly
474
+ * <d4p_MathML outputclass="xmlonly">
475
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
476
+ * <m:mi>x</m:mi>
477
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
478
+ * <m:mi>y</m:mi>
479
+ * <m:mo>+</m:mo>
480
+ * <m:mi>z</m:mi>
481
+ * </m:math>
482
+ * </d4p_MathML>
483
+ * \endxmlonly
484
+ * as a single operation in round-to-nearest-even mode.
485
+ *
486
+ * Computes the value of
487
+ * \latexonly $x \times y + z$ \endlatexonly
488
+ * \xmlonly
489
+ * <d4p_MathML outputclass="xmlonly">
490
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
491
+ * <m:mi>x</m:mi>
492
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
493
+ * <m:mi>y</m:mi>
494
+ * <m:mo>+</m:mo>
495
+ * <m:mi>z</m:mi>
496
+ * </m:math>
497
+ * </d4p_MathML>
498
+ * \endxmlonly
499
+ * as a single ternary operation, rounding the
500
+ * result once in round-to-nearest-even mode.
501
+ *
502
+ * \return Returns the rounded value of
503
+ * \latexonly $x \times y + z$ \endlatexonly
504
+ * \xmlonly
505
+ * <d4p_MathML outputclass="xmlonly">
506
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
507
+ * <m:mi>x</m:mi>
508
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
509
+ * <m:mi>y</m:mi>
510
+ * <m:mo>+</m:mo>
511
+ * <m:mi>z</m:mi>
512
+ * </m:math>
513
+ * </d4p_MathML>
514
+ * \endxmlonly
515
+ * as a single operation.
516
+ * - fmaf(
517
+ * \latexonly $\pm \infty$ \endlatexonly
518
+ * \xmlonly
519
+ * <d4p_MathML outputclass="xmlonly">
520
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
521
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
522
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
523
+ * </m:math>
524
+ * </d4p_MathML>
525
+ * \endxmlonly
526
+ * ,
527
+ * \latexonly $\pm 0$ \endlatexonly
528
+ * \xmlonly
529
+ * <d4p_MathML outputclass="xmlonly">
530
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
531
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
532
+ * <m:mn>0</m:mn>
533
+ * </m:math>
534
+ * </d4p_MathML>
535
+ * \endxmlonly
536
+ * , \p z) returns NaN.
537
+ * - fmaf(
538
+ * \latexonly $\pm 0$ \endlatexonly
539
+ * \xmlonly
540
+ * <d4p_MathML outputclass="xmlonly">
541
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
542
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
543
+ * <m:mn>0</m:mn>
544
+ * </m:math>
545
+ * </d4p_MathML>
546
+ * \endxmlonly
547
+ * ,
548
+ * \latexonly $\pm \infty$ \endlatexonly
549
+ * \xmlonly
550
+ * <d4p_MathML outputclass="xmlonly">
551
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
552
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
553
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
554
+ * </m:math>
555
+ * </d4p_MathML>
556
+ * \endxmlonly
557
+ * , \p z) returns NaN.
558
+ * - fmaf(\p x, \p y,
559
+ * \latexonly $-\infty$ \endlatexonly
560
+ * \xmlonly
561
+ * <d4p_MathML outputclass="xmlonly">
562
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
563
+ * <m:mo>-</m:mo>
564
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
565
+ * </m:math>
566
+ * </d4p_MathML>
567
+ * \endxmlonly
568
+ * ) returns NaN if
569
+ * \latexonly $x \times y$ \endlatexonly
570
+ * \xmlonly
571
+ * <d4p_MathML outputclass="xmlonly">
572
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
573
+ * <m:mi>x</m:mi>
574
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
575
+ * <m:mi>y</m:mi>
576
+ * </m:math>
577
+ * </d4p_MathML>
578
+ * \endxmlonly
579
+ * is an exact
580
+ * \latexonly $+\infty$ \endlatexonly
581
+ * \xmlonly
582
+ * <d4p_MathML outputclass="xmlonly">
583
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
584
+ * <m:mo>+</m:mo>
585
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
586
+ * </m:math>
587
+ * </d4p_MathML>
588
+ * \endxmlonly
589
+ * .
590
+ * - fmaf(\p x, \p y,
591
+ * \latexonly $+\infty$ \endlatexonly
592
+ * \xmlonly
593
+ * <d4p_MathML outputclass="xmlonly">
594
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
595
+ * <m:mo>+</m:mo>
596
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
597
+ * </m:math>
598
+ * </d4p_MathML>
599
+ * \endxmlonly
600
+ * ) returns NaN if
601
+ * \latexonly $x \times y$ \endlatexonly
602
+ * \xmlonly
603
+ * <d4p_MathML outputclass="xmlonly">
604
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
605
+ * <m:mi>x</m:mi>
606
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
607
+ * <m:mi>y</m:mi>
608
+ * </m:math>
609
+ * </d4p_MathML>
610
+ * \endxmlonly
611
+ * is an exact
612
+ * \latexonly $-\infty$ \endlatexonly
613
+ * \xmlonly
614
+ * <d4p_MathML outputclass="xmlonly">
615
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
616
+ * <m:mo>-</m:mo>
617
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
618
+ * </m:math>
619
+ * </d4p_MathML>
620
+ * \endxmlonly
621
+ * .
622
+ *
623
+ * \note_accuracy_double
624
+ */
625
+ extern __device__ __device_builtin__ double __fma_rn(double x, double y, double z);
626
+ /**
627
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
628
+ * \brief Compute
629
+ * \latexonly $x \times y + z$ \endlatexonly
630
+ * \xmlonly
631
+ * <d4p_MathML outputclass="xmlonly">
632
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
633
+ * <m:mi>x</m:mi>
634
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
635
+ * <m:mi>y</m:mi>
636
+ * <m:mo>+</m:mo>
637
+ * <m:mi>z</m:mi>
638
+ * </m:math>
639
+ * </d4p_MathML>
640
+ * \endxmlonly
641
+ * as a single operation in round-towards-zero mode.
642
+ *
643
+ * Computes the value of
644
+ * \latexonly $x \times y + z$ \endlatexonly
645
+ * \xmlonly
646
+ * <d4p_MathML outputclass="xmlonly">
647
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
648
+ * <m:mi>x</m:mi>
649
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
650
+ * <m:mi>y</m:mi>
651
+ * <m:mo>+</m:mo>
652
+ * <m:mi>z</m:mi>
653
+ * </m:math>
654
+ * </d4p_MathML>
655
+ * \endxmlonly
656
+ * as a single ternary operation, rounding the
657
+ * result once in round-towards-zero mode.
658
+ *
659
+ * \return Returns the rounded value of
660
+ * \latexonly $x \times y + z$ \endlatexonly
661
+ * \xmlonly
662
+ * <d4p_MathML outputclass="xmlonly">
663
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
664
+ * <m:mi>x</m:mi>
665
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
666
+ * <m:mi>y</m:mi>
667
+ * <m:mo>+</m:mo>
668
+ * <m:mi>z</m:mi>
669
+ * </m:math>
670
+ * </d4p_MathML>
671
+ * \endxmlonly
672
+ * as a single operation.
673
+ * - fmaf(
674
+ * \latexonly $\pm \infty$ \endlatexonly
675
+ * \xmlonly
676
+ * <d4p_MathML outputclass="xmlonly">
677
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
678
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
679
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
680
+ * </m:math>
681
+ * </d4p_MathML>
682
+ * \endxmlonly
683
+ * ,
684
+ * \latexonly $\pm 0$ \endlatexonly
685
+ * \xmlonly
686
+ * <d4p_MathML outputclass="xmlonly">
687
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
688
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
689
+ * <m:mn>0</m:mn>
690
+ * </m:math>
691
+ * </d4p_MathML>
692
+ * \endxmlonly
693
+ * , \p z) returns NaN.
694
+ * - fmaf(
695
+ * \latexonly $\pm 0$ \endlatexonly
696
+ * \xmlonly
697
+ * <d4p_MathML outputclass="xmlonly">
698
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
699
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
700
+ * <m:mn>0</m:mn>
701
+ * </m:math>
702
+ * </d4p_MathML>
703
+ * \endxmlonly
704
+ * ,
705
+ * \latexonly $\pm \infty$ \endlatexonly
706
+ * \xmlonly
707
+ * <d4p_MathML outputclass="xmlonly">
708
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
709
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
710
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
711
+ * </m:math>
712
+ * </d4p_MathML>
713
+ * \endxmlonly
714
+ * , \p z) returns NaN.
715
+ * - fmaf(\p x, \p y,
716
+ * \latexonly $-\infty$ \endlatexonly
717
+ * \xmlonly
718
+ * <d4p_MathML outputclass="xmlonly">
719
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
720
+ * <m:mo>-</m:mo>
721
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
722
+ * </m:math>
723
+ * </d4p_MathML>
724
+ * \endxmlonly
725
+ * ) returns NaN if
726
+ * \latexonly $x \times y$ \endlatexonly
727
+ * \xmlonly
728
+ * <d4p_MathML outputclass="xmlonly">
729
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
730
+ * <m:mi>x</m:mi>
731
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
732
+ * <m:mi>y</m:mi>
733
+ * </m:math>
734
+ * </d4p_MathML>
735
+ * \endxmlonly
736
+ * is an exact
737
+ * \latexonly $+\infty$ \endlatexonly
738
+ * \xmlonly
739
+ * <d4p_MathML outputclass="xmlonly">
740
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
741
+ * <m:mo>+</m:mo>
742
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
743
+ * </m:math>
744
+ * </d4p_MathML>
745
+ * \endxmlonly
746
+ * .
747
+ * - fmaf(\p x, \p y,
748
+ * \latexonly $+\infty$ \endlatexonly
749
+ * \xmlonly
750
+ * <d4p_MathML outputclass="xmlonly">
751
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
752
+ * <m:mo>+</m:mo>
753
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
754
+ * </m:math>
755
+ * </d4p_MathML>
756
+ * \endxmlonly
757
+ * ) returns NaN if
758
+ * \latexonly $x \times y$ \endlatexonly
759
+ * \xmlonly
760
+ * <d4p_MathML outputclass="xmlonly">
761
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
762
+ * <m:mi>x</m:mi>
763
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
764
+ * <m:mi>y</m:mi>
765
+ * </m:math>
766
+ * </d4p_MathML>
767
+ * \endxmlonly
768
+ * is an exact
769
+ * \latexonly $-\infty$ \endlatexonly
770
+ * \xmlonly
771
+ * <d4p_MathML outputclass="xmlonly">
772
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
773
+ * <m:mo>-</m:mo>
774
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
775
+ * </m:math>
776
+ * </d4p_MathML>
777
+ * \endxmlonly
778
+ * .
779
+ *
780
+ * \note_accuracy_double
781
+ */
782
+ extern __device__ __device_builtin__ double __fma_rz(double x, double y, double z);
783
+ /**
784
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
785
+ * \brief Compute
786
+ * \latexonly $x \times y + z$ \endlatexonly
787
+ * \xmlonly
788
+ * <d4p_MathML outputclass="xmlonly">
789
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
790
+ * <m:mi>x</m:mi>
791
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
792
+ * <m:mi>y</m:mi>
793
+ * <m:mo>+</m:mo>
794
+ * <m:mi>z</m:mi>
795
+ * </m:math>
796
+ * </d4p_MathML>
797
+ * \endxmlonly
798
+ * as a single operation in round-up mode.
799
+ *
800
+ * Computes the value of
801
+ * \latexonly $x \times y + z$ \endlatexonly
802
+ * \xmlonly
803
+ * <d4p_MathML outputclass="xmlonly">
804
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
805
+ * <m:mi>x</m:mi>
806
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
807
+ * <m:mi>y</m:mi>
808
+ * <m:mo>+</m:mo>
809
+ * <m:mi>z</m:mi>
810
+ * </m:math>
811
+ * </d4p_MathML>
812
+ * \endxmlonly
813
+ * as a single ternary operation, rounding the
814
+ * result once in round-up (to positive infinity) mode.
815
+ *
816
+ * \return Returns the rounded value of
817
+ * \latexonly $x \times y + z$ \endlatexonly
818
+ * \xmlonly
819
+ * <d4p_MathML outputclass="xmlonly">
820
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
821
+ * <m:mi>x</m:mi>
822
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
823
+ * <m:mi>y</m:mi>
824
+ * <m:mo>+</m:mo>
825
+ * <m:mi>z</m:mi>
826
+ * </m:math>
827
+ * </d4p_MathML>
828
+ * \endxmlonly
829
+ * as a single operation.
830
+ * - fmaf(
831
+ * \latexonly $\pm \infty$ \endlatexonly
832
+ * \xmlonly
833
+ * <d4p_MathML outputclass="xmlonly">
834
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
835
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
836
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
837
+ * </m:math>
838
+ * </d4p_MathML>
839
+ * \endxmlonly
840
+ * ,
841
+ * \latexonly $\pm 0$ \endlatexonly
842
+ * \xmlonly
843
+ * <d4p_MathML outputclass="xmlonly">
844
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
845
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
846
+ * <m:mn>0</m:mn>
847
+ * </m:math>
848
+ * </d4p_MathML>
849
+ * \endxmlonly
850
+ * , \p z) returns NaN.
851
+ * - fmaf(
852
+ * \latexonly $\pm 0$ \endlatexonly
853
+ * \xmlonly
854
+ * <d4p_MathML outputclass="xmlonly">
855
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
856
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
857
+ * <m:mn>0</m:mn>
858
+ * </m:math>
859
+ * </d4p_MathML>
860
+ * \endxmlonly
861
+ * ,
862
+ * \latexonly $\pm \infty$ \endlatexonly
863
+ * \xmlonly
864
+ * <d4p_MathML outputclass="xmlonly">
865
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
866
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
867
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
868
+ * </m:math>
869
+ * </d4p_MathML>
870
+ * \endxmlonly
871
+ * , \p z) returns NaN.
872
+ * - fmaf(\p x, \p y,
873
+ * \latexonly $-\infty$ \endlatexonly
874
+ * \xmlonly
875
+ * <d4p_MathML outputclass="xmlonly">
876
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
877
+ * <m:mo>-</m:mo>
878
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
879
+ * </m:math>
880
+ * </d4p_MathML>
881
+ * \endxmlonly
882
+ * ) returns NaN if
883
+ * \latexonly $x \times y$ \endlatexonly
884
+ * \xmlonly
885
+ * <d4p_MathML outputclass="xmlonly">
886
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
887
+ * <m:mi>x</m:mi>
888
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
889
+ * <m:mi>y</m:mi>
890
+ * </m:math>
891
+ * </d4p_MathML>
892
+ * \endxmlonly
893
+ * is an exact
894
+ * \latexonly $+\infty$ \endlatexonly
895
+ * \xmlonly
896
+ * <d4p_MathML outputclass="xmlonly">
897
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
898
+ * <m:mo>+</m:mo>
899
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
900
+ * </m:math>
901
+ * </d4p_MathML>
902
+ * \endxmlonly
903
+ * .
904
+ * - fmaf(\p x, \p y,
905
+ * \latexonly $+\infty$ \endlatexonly
906
+ * \xmlonly
907
+ * <d4p_MathML outputclass="xmlonly">
908
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
909
+ * <m:mo>+</m:mo>
910
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
911
+ * </m:math>
912
+ * </d4p_MathML>
913
+ * \endxmlonly
914
+ * ) returns NaN if
915
+ * \latexonly $x \times y$ \endlatexonly
916
+ * \xmlonly
917
+ * <d4p_MathML outputclass="xmlonly">
918
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
919
+ * <m:mi>x</m:mi>
920
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
921
+ * <m:mi>y</m:mi>
922
+ * </m:math>
923
+ * </d4p_MathML>
924
+ * \endxmlonly
925
+ * is an exact
926
+ * \latexonly $-\infty$ \endlatexonly
927
+ * \xmlonly
928
+ * <d4p_MathML outputclass="xmlonly">
929
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
930
+ * <m:mo>-</m:mo>
931
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
932
+ * </m:math>
933
+ * </d4p_MathML>
934
+ * \endxmlonly
935
+ * .
936
+ *
937
+ * \note_accuracy_double
938
+ */
939
+ extern __device__ __device_builtin__ double __fma_ru(double x, double y, double z);
940
+ /**
941
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
942
+ * \brief Compute
943
+ * \latexonly $x \times y + z$ \endlatexonly
944
+ * \xmlonly
945
+ * <d4p_MathML outputclass="xmlonly">
946
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
947
+ * <m:mi>x</m:mi>
948
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
949
+ * <m:mi>y</m:mi>
950
+ * <m:mo>+</m:mo>
951
+ * <m:mi>z</m:mi>
952
+ * </m:math>
953
+ * </d4p_MathML>
954
+ * \endxmlonly
955
+ * as a single operation in round-down mode.
956
+ *
957
+ * Computes the value of
958
+ * \latexonly $x \times y + z$ \endlatexonly
959
+ * \xmlonly
960
+ * <d4p_MathML outputclass="xmlonly">
961
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
962
+ * <m:mi>x</m:mi>
963
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
964
+ * <m:mi>y</m:mi>
965
+ * <m:mo>+</m:mo>
966
+ * <m:mi>z</m:mi>
967
+ * </m:math>
968
+ * </d4p_MathML>
969
+ * \endxmlonly
970
+ * as a single ternary operation, rounding the
971
+ * result once in round-down (to negative infinity) mode.
972
+ *
973
+ * \return Returns the rounded value of
974
+ * \latexonly $x \times y + z$ \endlatexonly
975
+ * \xmlonly
976
+ * <d4p_MathML outputclass="xmlonly">
977
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
978
+ * <m:mi>x</m:mi>
979
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
980
+ * <m:mi>y</m:mi>
981
+ * <m:mo>+</m:mo>
982
+ * <m:mi>z</m:mi>
983
+ * </m:math>
984
+ * </d4p_MathML>
985
+ * \endxmlonly
986
+ * as a single operation.
987
+ * - fmaf(
988
+ * \latexonly $\pm \infty$ \endlatexonly
989
+ * \xmlonly
990
+ * <d4p_MathML outputclass="xmlonly">
991
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
992
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
993
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
994
+ * </m:math>
995
+ * </d4p_MathML>
996
+ * \endxmlonly
997
+ * ,
998
+ * \latexonly $\pm 0$ \endlatexonly
999
+ * \xmlonly
1000
+ * <d4p_MathML outputclass="xmlonly">
1001
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1002
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
1003
+ * <m:mn>0</m:mn>
1004
+ * </m:math>
1005
+ * </d4p_MathML>
1006
+ * \endxmlonly
1007
+ * , \p z) returns NaN.
1008
+ * - fmaf(
1009
+ * \latexonly $\pm 0$ \endlatexonly
1010
+ * \xmlonly
1011
+ * <d4p_MathML outputclass="xmlonly">
1012
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1013
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
1014
+ * <m:mn>0</m:mn>
1015
+ * </m:math>
1016
+ * </d4p_MathML>
1017
+ * \endxmlonly
1018
+ * ,
1019
+ * \latexonly $\pm \infty$ \endlatexonly
1020
+ * \xmlonly
1021
+ * <d4p_MathML outputclass="xmlonly">
1022
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1023
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
1024
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1025
+ * </m:math>
1026
+ * </d4p_MathML>
1027
+ * \endxmlonly
1028
+ * , \p z) returns NaN.
1029
+ * - fmaf(\p x, \p y,
1030
+ * \latexonly $-\infty$ \endlatexonly
1031
+ * \xmlonly
1032
+ * <d4p_MathML outputclass="xmlonly">
1033
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1034
+ * <m:mo>-</m:mo>
1035
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1036
+ * </m:math>
1037
+ * </d4p_MathML>
1038
+ * \endxmlonly
1039
+ * ) returns NaN if
1040
+ * \latexonly $x \times y$ \endlatexonly
1041
+ * \xmlonly
1042
+ * <d4p_MathML outputclass="xmlonly">
1043
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1044
+ * <m:mi>x</m:mi>
1045
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
1046
+ * <m:mi>y</m:mi>
1047
+ * </m:math>
1048
+ * </d4p_MathML>
1049
+ * \endxmlonly
1050
+ * is an exact
1051
+ * \latexonly $+\infty$ \endlatexonly
1052
+ * \xmlonly
1053
+ * <d4p_MathML outputclass="xmlonly">
1054
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1055
+ * <m:mo>+</m:mo>
1056
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1057
+ * </m:math>
1058
+ * </d4p_MathML>
1059
+ * \endxmlonly
1060
+ * .
1061
+ * - fmaf(\p x, \p y,
1062
+ * \latexonly $+\infty$ \endlatexonly
1063
+ * \xmlonly
1064
+ * <d4p_MathML outputclass="xmlonly">
1065
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1066
+ * <m:mo>+</m:mo>
1067
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1068
+ * </m:math>
1069
+ * </d4p_MathML>
1070
+ * \endxmlonly
1071
+ * ) returns NaN if
1072
+ * \latexonly $x \times y$ \endlatexonly
1073
+ * \xmlonly
1074
+ * <d4p_MathML outputclass="xmlonly">
1075
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1076
+ * <m:mi>x</m:mi>
1077
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
1078
+ * <m:mi>y</m:mi>
1079
+ * </m:math>
1080
+ * </d4p_MathML>
1081
+ * \endxmlonly
1082
+ * is an exact
1083
+ * \latexonly $-\infty$ \endlatexonly
1084
+ * \xmlonly
1085
+ * <d4p_MathML outputclass="xmlonly">
1086
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1087
+ * <m:mo>-</m:mo>
1088
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1089
+ * </m:math>
1090
+ * </d4p_MathML>
1091
+ * \endxmlonly
1092
+ * .
1093
+ *
1094
+ * \note_accuracy_double
1095
+ */
1096
+ extern __device__ __device_builtin__ double __fma_rd(double x, double y, double z);
1097
+ /**
1098
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1099
+ * \brief Add two floating-point values in round-to-nearest-even mode.
1100
+ *
1101
+ * Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
1102
+ *
1103
+ * \return Returns \p x + \p y.
1104
+ *
1105
+ * \note_accuracy_double
1106
+ * \note_nofma
1107
+ */
1108
+ extern __device__ __device_builtin__ double __dadd_rn(double x, double y);
1109
+ /**
1110
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1111
+ * \brief Add two floating-point values in round-towards-zero mode.
1112
+ *
1113
+ * Adds two floating-point values \p x and \p y in round-towards-zero mode.
1114
+ *
1115
+ * \return Returns \p x + \p y.
1116
+ *
1117
+ * \note_accuracy_double
1118
+ * \note_nofma
1119
+ */
1120
+ extern __device__ __device_builtin__ double __dadd_rz(double x, double y);
1121
+ /**
1122
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1123
+ * \brief Add two floating-point values in round-up mode.
1124
+ *
1125
+ * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
1126
+ *
1127
+ * \return Returns \p x + \p y.
1128
+ *
1129
+ * \note_accuracy_double
1130
+ * \note_nofma
1131
+ */
1132
+ extern __device__ __device_builtin__ double __dadd_ru(double x, double y);
1133
+ /**
1134
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1135
+ * \brief Add two floating-point values in round-down mode.
1136
+ *
1137
+ * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
1138
+ *
1139
+ * \return Returns \p x + \p y.
1140
+ *
1141
+ * \note_accuracy_double
1142
+ * \note_nofma
1143
+ */
1144
+ extern __device__ __device_builtin__ double __dadd_rd(double x, double y);
1145
+ /**
1146
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1147
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
1148
+ *
1149
+ * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
1150
+ *
1151
+ * \return Returns \p x - \p y.
1152
+ *
1153
+ * \note_accuracy_double
1154
+ * \note_nofma
1155
+ */
1156
+ extern __device__ __device_builtin__ double __dsub_rn(double x, double y);
1157
+ /**
1158
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1159
+ * \brief Subtract two floating-point values in round-towards-zero mode.
1160
+ *
1161
+ * Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
1162
+ *
1163
+ * \return Returns \p x - \p y.
1164
+ *
1165
+ * \note_accuracy_double
1166
+ * \note_nofma
1167
+ */
1168
+ extern __device__ __device_builtin__ double __dsub_rz(double x, double y);
1169
+ /**
1170
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1171
+ * \brief Subtract two floating-point values in round-up mode.
1172
+ *
1173
+ * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
1174
+ *
1175
+ * \return Returns \p x - \p y.
1176
+ *
1177
+ * \note_accuracy_double
1178
+ * \note_nofma
1179
+ */
1180
+ extern __device__ __device_builtin__ double __dsub_ru(double x, double y);
1181
+ /**
1182
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1183
+ * \brief Subtract two floating-point values in round-down mode.
1184
+ *
1185
+ * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
1186
+ *
1187
+ * \return Returns \p x - \p y.
1188
+ *
1189
+ * \note_accuracy_double
1190
+ * \note_nofma
1191
+ */
1192
+ extern __device__ __device_builtin__ double __dsub_rd(double x, double y);
1193
+ /**
1194
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1195
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
1196
+ *
1197
+ * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
1198
+ *
1199
+ * \return Returns \p x * \p y.
1200
+ *
1201
+ * \note_accuracy_double
1202
+ * \note_nofma
1203
+ */
1204
+ extern __device__ __device_builtin__ double __dmul_rn(double x, double y);
1205
+ /**
1206
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1207
+ * \brief Multiply two floating-point values in round-towards-zero mode.
1208
+ *
1209
+ * Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
1210
+ *
1211
+ * \return Returns \p x * \p y.
1212
+ *
1213
+ * \note_accuracy_double
1214
+ * \note_nofma
1215
+ */
1216
+ extern __device__ __device_builtin__ double __dmul_rz(double x, double y);
1217
+ /**
1218
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1219
+ * \brief Multiply two floating-point values in round-up mode.
1220
+ *
1221
+ * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
1222
+ *
1223
+ * \return Returns \p x * \p y.
1224
+ *
1225
+ * \note_accuracy_double
1226
+ * \note_nofma
1227
+ */
1228
+ extern __device__ __device_builtin__ double __dmul_ru(double x, double y);
1229
+ /**
1230
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1231
+ * \brief Multiply two floating-point values in round-down mode.
1232
+ *
1233
+ * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
1234
+ *
1235
+ * \return Returns \p x * \p y.
1236
+ *
1237
+ * \note_accuracy_double
1238
+ * \note_nofma
1239
+ */
1240
+ extern __device__ __device_builtin__ double __dmul_rd(double x, double y);
1241
+ /**
1242
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1243
+ * \brief Convert a double to a float in round-to-nearest-even mode.
1244
+ *
1245
+ * Convert the double-precision floating-point value \p x to a single-precision
1246
+ * floating-point value in round-to-nearest-even mode.
1247
+ * \return Returns converted value.
1248
+ */
1249
+ extern __device__ __device_builtin__ float __double2float_rn(double x);
1250
+ /**
1251
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1252
+ * \brief Convert a double to a float in round-towards-zero mode.
1253
+ *
1254
+ * Convert the double-precision floating-point value \p x to a single-precision
1255
+ * floating-point value in round-towards-zero mode.
1256
+ * \return Returns converted value.
1257
+ */
1258
+ extern __device__ __device_builtin__ float __double2float_rz(double x);
1259
+ /**
1260
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1261
+ * \brief Convert a double to a float in round-up mode.
1262
+ *
1263
+ * Convert the double-precision floating-point value \p x to a single-precision
1264
+ * floating-point value in round-up (to positive infinity) mode.
1265
+ * \return Returns converted value.
1266
+ */
1267
+ extern __device__ __device_builtin__ float __double2float_ru(double x);
1268
+ /**
1269
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1270
+ * \brief Convert a double to a float in round-down mode.
1271
+ *
1272
+ * Convert the double-precision floating-point value \p x to a single-precision
1273
+ * floating-point value in round-down (to negative infinity) mode.
1274
+ * \return Returns converted value.
1275
+ */
1276
+ extern __device__ __device_builtin__ float __double2float_rd(double x);
1277
+ /**
1278
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1279
+ * \brief Convert a double to a signed int in round-to-nearest-even mode.
1280
+ *
1281
+ * Convert the double-precision floating-point value \p x to a
1282
+ * signed integer value in round-to-nearest-even mode.
1283
+ * \return Returns converted value.
1284
+ */
1285
+ extern __device__ __device_builtin__ int __double2int_rn(double x);
1286
+ /**
1287
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1288
+ * \brief Convert a double to a signed int in round-up mode.
1289
+ *
1290
+ * Convert the double-precision floating-point value \p x to a
1291
+ * signed integer value in round-up (to positive infinity) mode.
1292
+ * \return Returns converted value.
1293
+ */
1294
+ extern __device__ __device_builtin__ int __double2int_ru(double x);
1295
+ /**
1296
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1297
+ * \brief Convert a double to a signed int in round-down mode.
1298
+ *
1299
+ * Convert the double-precision floating-point value \p x to a
1300
+ * signed integer value in round-down (to negative infinity) mode.
1301
+ * \return Returns converted value.
1302
+ */
1303
+ extern __device__ __device_builtin__ int __double2int_rd(double x);
1304
+ /**
1305
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1306
+ * \brief Convert a double to an unsigned int in round-to-nearest-even mode.
1307
+ *
1308
+ * Convert the double-precision floating-point value \p x to an
1309
+ * unsigned integer value in round-to-nearest-even mode.
1310
+ * \return Returns converted value.
1311
+ */
1312
+ extern __device__ __device_builtin__ unsigned int __double2uint_rn(double x);
1313
+ /**
1314
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1315
+ * \brief Convert a double to an unsigned int in round-up mode.
1316
+ *
1317
+ * Convert the double-precision floating-point value \p x to an
1318
+ * unsigned integer value in round-up (to positive infinity) mode.
1319
+ * \return Returns converted value.
1320
+ */
1321
+ extern __device__ __device_builtin__ unsigned int __double2uint_ru(double x);
1322
+ /**
1323
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1324
+ * \brief Convert a double to an unsigned int in round-down mode.
1325
+ *
1326
+ * Convert the double-precision floating-point value \p x to an
1327
+ * unsigned integer value in round-down (to negative infinity) mode.
1328
+ * \return Returns converted value.
1329
+ */
1330
+ extern __device__ __device_builtin__ unsigned int __double2uint_rd(double x);
1331
+ /**
1332
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1333
+ * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
1334
+ *
1335
+ * Convert the double-precision floating-point value \p x to a
1336
+ * signed 64-bit integer value in round-to-nearest-even mode.
1337
+ * \return Returns converted value.
1338
+ */
1339
+ extern __device__ __device_builtin__ long long int __double2ll_rn(double x);
1340
+ /**
1341
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1342
+ * \brief Convert a double to a signed 64-bit int in round-up mode.
1343
+ *
1344
+ * Convert the double-precision floating-point value \p x to a
1345
+ * signed 64-bit integer value in round-up (to positive infinity) mode.
1346
+ * \return Returns converted value.
1347
+ */
1348
+ extern __device__ __device_builtin__ long long int __double2ll_ru(double x);
1349
+ /**
1350
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1351
+ * \brief Convert a double to a signed 64-bit int in round-down mode.
1352
+ *
1353
+ * Convert the double-precision floating-point value \p x to a
1354
+ * signed 64-bit integer value in round-down (to negative infinity) mode.
1355
+ * \return Returns converted value.
1356
+ */
1357
+ extern __device__ __device_builtin__ long long int __double2ll_rd(double x);
1358
+ /**
1359
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1360
+ * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
1361
+ *
1362
+ * Convert the double-precision floating-point value \p x to an
1363
+ * unsigned 64-bit integer value in round-to-nearest-even mode.
1364
+ * \return Returns converted value.
1365
+ */
1366
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
1367
+ /**
1368
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1369
+ * \brief Convert a double to an unsigned 64-bit int in round-up mode.
1370
+ *
1371
+ * Convert the double-precision floating-point value \p x to an
1372
+ * unsigned 64-bit integer value in round-up (to positive infinity) mode.
1373
+ * \return Returns converted value.
1374
+ */
1375
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
1376
+ /**
1377
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1378
+ * \brief Convert a double to an unsigned 64-bit int in round-down mode.
1379
+ *
1380
+ * Convert the double-precision floating-point value \p x to an
1381
+ * unsigned 64-bit integer value in round-down (to negative infinity) mode.
1382
+ * \return Returns converted value.
1383
+ */
1384
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
1385
+ /**
1386
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1387
+ * \brief Convert a signed int to a double.
1388
+ *
1389
+ * Convert the signed integer value \p x to a double-precision floating-point value.
1390
+ * \return Returns converted value.
1391
+ */
1392
+ extern __device__ __device_builtin__ double __int2double_rn(int x);
1393
+ /**
1394
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1395
+ * \brief Convert an unsigned int to a double.
1396
+ *
1397
+ * Convert the unsigned integer value \p x to a double-precision floating-point value.
1398
+ * \return Returns converted value.
1399
+ */
1400
+ extern __device__ __device_builtin__ double __uint2double_rn(unsigned int x);
1401
+ /**
1402
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1403
+ * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
1404
+ *
1405
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1406
+ * value in round-to-nearest-even mode.
1407
+ * \return Returns converted value.
1408
+ */
1409
+ extern __device__ __device_builtin__ double __ll2double_rn(long long int x);
1410
+ /**
1411
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1412
+ * \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
1413
+ *
1414
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1415
+ * value in round-towards-zero mode.
1416
+ * \return Returns converted value.
1417
+ */
1418
+ extern __device__ __device_builtin__ double __ll2double_rz(long long int x);
1419
+ /**
1420
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1421
+ * \brief Convert a signed 64-bit int to a double in round-up mode.
1422
+ *
1423
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1424
+ * value in round-up (to positive infinity) mode.
1425
+ * \return Returns converted value.
1426
+ */
1427
+ extern __device__ __device_builtin__ double __ll2double_ru(long long int x);
1428
+ /**
1429
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1430
+ * \brief Convert a signed 64-bit int to a double in round-down mode.
1431
+ *
1432
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1433
+ * value in round-down (to negative infinity) mode.
1434
+ * \return Returns converted value.
1435
+ */
1436
+ extern __device__ __device_builtin__ double __ll2double_rd(long long int x);
1437
+ /**
1438
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1439
+ * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
1440
+ *
1441
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1442
+ * value in round-to-nearest-even mode.
1443
+ * \return Returns converted value.
1444
+ */
1445
+ extern __device__ __device_builtin__ double __ull2double_rn(unsigned long long int x);
1446
+ /**
1447
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1448
+ * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
1449
+ *
1450
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1451
+ * value in round-towards-zero mode.
1452
+ * \return Returns converted value.
1453
+ */
1454
+ extern __device__ __device_builtin__ double __ull2double_rz(unsigned long long int x);
1455
+ /**
1456
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1457
+ * \brief Convert an unsigned 64-bit int to a double in round-up mode.
1458
+ *
1459
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1460
+ * value in round-up (to positive infinity) mode.
1461
+ * \return Returns converted value.
1462
+ */
1463
+ extern __device__ __device_builtin__ double __ull2double_ru(unsigned long long int x);
1464
+ /**
1465
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1466
+ * \brief Convert an unsigned 64-bit int to a double in round-down mode.
1467
+ *
1468
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1469
+ * value in round-down (to negative infinity) mode.
1470
+ * \return Returns converted value.
1471
+ */
1472
+ extern __device__ __device_builtin__ double __ull2double_rd(unsigned long long int x);
1473
+ /**
1474
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1475
+ * \brief Reinterpret high 32 bits in a double as a signed integer.
1476
+ *
1477
+ * Reinterpret the high 32 bits in the double-precision floating-point value \p x
1478
+ * as a signed integer.
1479
+ * \return Returns reinterpreted value.
1480
+ */
1481
+ extern __device__ __device_builtin__ int __double2hiint(double x);
1482
+ /**
1483
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1484
+ * \brief Reinterpret low 32 bits in a double as a signed integer.
1485
+ *
1486
+ * Reinterpret the low 32 bits in the double-precision floating-point value \p x
1487
+ * as a signed integer.
1488
+ * \return Returns reinterpreted value.
1489
+ */
1490
+ extern __device__ __device_builtin__ int __double2loint(double x);
1491
+ /**
1492
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1493
+ * \brief Reinterpret high and low 32-bit integer values as a double.
1494
+ *
1495
+ * Reinterpret the integer value of \p hi as the high 32 bits of a
1496
+ * double-precision floating-point value and the integer value of \p lo
1497
+ * as the low 32 bits of the same double-precision floating-point value.
1498
+ * \return Returns reinterpreted value.
1499
+ */
1500
+ extern __device__ __device_builtin__ double __hiloint2double(int hi, int lo);
1501
+
1502
+
1503
+ }
1504
+
1505
+ /*******************************************************************************
1506
+ * *
1507
+ * *
1508
+ * *
1509
+ *******************************************************************************/
1510
+ __SM_20_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int ballot(bool pred) __DEF_IF_HOST
1511
+
1512
+ __SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) __DEF_IF_HOST
1513
+
1514
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) __DEF_IF_HOST
1515
+
1516
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) __DEF_IF_HOST
1517
+
1518
+ #undef __DEPRECATED__
1519
+ #undef __WSB_DEPRECATION_MESSAGE
1520
+
1521
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) __DEF_IF_HOST
1522
+ __SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) __DEF_IF_HOST
1523
+ __SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) __DEF_IF_HOST
1524
+ __SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) __DEF_IF_HOST
1525
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
1526
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) __DEF_IF_HOST
1527
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
1528
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *ptr) __DEF_IF_HOST
1529
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *ptr) __DEF_IF_HOST
1530
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *ptr) __DEF_IF_HOST
1531
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *ptr) __DEF_IF_HOST
1532
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
1533
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) __DEF_IF_HOST
1534
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
1535
+
1536
+ __SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) __DEF_IF_HOST
1537
+ __SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) __DEF_IF_HOST
1538
+ __SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) __DEF_IF_HOST
1539
+ __SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) __DEF_IF_HOST
1540
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
1541
+ __SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) __DEF_IF_HOST
1542
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
1543
+ #endif /* __cplusplus && __CUDACC__ */
1544
+
1545
+ #undef __DEF_IF_HOST
1546
+ #undef __SM_20_INTRINSICS_DECL__
1547
+
1548
+ #if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
1549
+ #include "sm_20_intrinsics.hpp"
1550
+ #endif /* !__CUDACC_RTC__ */
1551
+ #endif /* !__SM_20_INTRINSICS_H__ && defined(__CUDA_ARCH__) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.hpp ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_INTRINSICS_HPP__)
51
+ #define __SM_20_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_20_INTRINSICS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ __SM_20_INTRINSICS_DECL__ unsigned int ballot(bool pred)
76
+ {
77
+ return __ballot((int)pred);
78
+ }
79
+
80
+ __SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred)
81
+ {
82
+ return __syncthreads_count((int)pred);
83
+ }
84
+
85
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred)
86
+ {
87
+ return (bool)__syncthreads_and((int)pred);
88
+ }
89
+
90
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred)
91
+ {
92
+ return (bool)__syncthreads_or((int)pred);
93
+ }
94
+
95
+
96
+ extern "C" {
97
+ __device__ unsigned __nv_isGlobal_impl(const void *);
98
+ __device__ unsigned __nv_isShared_impl(const void *);
99
+ __device__ unsigned __nv_isConstant_impl(const void *);
100
+ __device__ unsigned __nv_isLocal_impl(const void *);
101
+ __device__ unsigned __nv_isGridConstant_impl(const void *);
102
+ }
103
+
104
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr)
105
+ {
106
+ return __nv_isGlobal_impl(ptr);
107
+ }
108
+
109
+ __SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr)
110
+ {
111
+ return __nv_isShared_impl(ptr);
112
+ }
113
+
114
+ __SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr)
115
+ {
116
+ return __nv_isConstant_impl(ptr);
117
+ }
118
+
119
+ __SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr)
120
+ {
121
+ return __nv_isLocal_impl(ptr);
122
+ }
123
+
124
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
125
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr)
126
+ {
127
+ return __nv_isGridConstant_impl(ptr);
128
+ }
129
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
130
+
131
+ extern "C" {
132
+ __device__ size_t __nv_cvta_generic_to_global_impl(const void *);
133
+ __device__ size_t __nv_cvta_generic_to_shared_impl(const void *);
134
+ __device__ size_t __nv_cvta_generic_to_constant_impl(const void *);
135
+ __device__ size_t __nv_cvta_generic_to_local_impl(const void *);
136
+ __device__ void * __nv_cvta_global_to_generic_impl(size_t);
137
+ __device__ void * __nv_cvta_shared_to_generic_impl(size_t);
138
+ __device__ void * __nv_cvta_constant_to_generic_impl(size_t);
139
+ __device__ void * __nv_cvta_local_to_generic_impl(size_t);
140
+ }
141
+
142
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *p)
143
+ {
144
+ return __nv_cvta_generic_to_global_impl(p);
145
+ }
146
+
147
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *p)
148
+ {
149
+ return __nv_cvta_generic_to_shared_impl(p);
150
+ }
151
+
152
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *p)
153
+ {
154
+ return __nv_cvta_generic_to_constant_impl(p);
155
+ }
156
+
157
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *p)
158
+ {
159
+ return __nv_cvta_generic_to_local_impl(p);
160
+ }
161
+
162
+ __SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits)
163
+ {
164
+ return __nv_cvta_global_to_generic_impl(rawbits);
165
+ }
166
+
167
+ __SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits)
168
+ {
169
+ return __nv_cvta_shared_to_generic_impl(rawbits);
170
+ }
171
+
172
+ __SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits)
173
+ {
174
+ return __nv_cvta_constant_to_generic_impl(rawbits);
175
+ }
176
+
177
+ __SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits)
178
+ {
179
+ return __nv_cvta_local_to_generic_impl(rawbits);
180
+ }
181
+
182
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
183
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
184
+ #define __CVTA_PTR_64 1
185
+ #endif
186
+
187
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr)
188
+ {
189
+ #if __CVTA_PTR_64
190
+ unsigned long long ret;
191
+ asm("cvta.to.param.u64 %0, %1;" : "=l"(ret) : "l"(ptr));
192
+ #else /* !__CVTA_PTR_64 */
193
+ unsigned ret;
194
+ asm("cvta.to.param.u32 %0, %1;" : "=r"(ret) : "r"(ptr));
195
+ #endif /* __CVTA_PTR_64 */
196
+ return (size_t)ret;
197
+
198
+ }
199
+
200
+ __SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits)
201
+ {
202
+ void *ret;
203
+ #if __CVTA_PTR_64
204
+ unsigned long long in = rawbits;
205
+ asm("cvta.param.u64 %0, %1;" : "=l"(ret) : "l"(in));
206
+ #else /* !__CVTA_PTR_64 */
207
+ unsigned in = rawbits;
208
+ asm("cvta.param.u32 %0, %1;" : "=r"(ret) : "r"(in));
209
+ #endif /* __CVTA_PTR_64 */
210
+ return ret;
211
+ }
212
+ #undef __CVTA_PTR_64
213
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
214
+
215
+
216
+ #endif /* __cplusplus && __CUDACC__ */
217
+
218
+ #undef __SM_20_INTRINSICS_DECL__
219
+
220
+ #endif /* !__SM_20_INTRINSICS_HPP__ */
221
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.h ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_30_INTRINSICS_H__)
51
+ #define __SM_30_INTRINSICS_H__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_30_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ #ifndef __CUDA_ARCH__
72
+ #define __DEF_IF_HOST { }
73
+ #else /* !__CUDA_ARCH__ */
74
+ #define __DEF_IF_HOST ;
75
+ #endif /* __CUDA_ARCH__ */
76
+
77
+
78
+ /*******************************************************************************
79
+ * *
80
+ * Below are declarations of SM-3.0 intrinsics which are included as *
81
+ * source (instead of being built in to the compiler) *
82
+ * *
83
+ *******************************************************************************/
84
+
85
+ #if !defined warpSize && !defined __local_warpSize
86
+ #define warpSize 32
87
+ #define __local_warpSize
88
+ #endif
89
+
90
+ #if defined(_WIN32)
91
+ # define __DEPRECATED__(msg) __declspec(deprecated(msg))
92
+ #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
93
+ # define __DEPRECATED__(msg) __attribute__((deprecated))
94
+ #else
95
+ # define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
96
+ #endif
97
+
98
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
99
+ #define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
100
+ #endif
101
+
102
+ __SM_30_INTRINSICS_DECL__ unsigned __fns(unsigned mask, unsigned base, int offset) __DEF_IF_HOST
103
+ __SM_30_INTRINSICS_DECL__ void __barrier_sync(unsigned id) __DEF_IF_HOST
104
+ __SM_30_INTRINSICS_DECL__ void __barrier_sync_count(unsigned id, unsigned cnt) __DEF_IF_HOST
105
+ __SM_30_INTRINSICS_DECL__ void __syncwarp(unsigned mask=0xFFFFFFFF) __DEF_IF_HOST
106
+ __SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) __DEF_IF_HOST
107
+ __SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) __DEF_IF_HOST
108
+ __SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) __DEF_IF_HOST
109
+ __SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) __DEF_IF_HOST
110
+ __SM_30_INTRINSICS_DECL__ unsigned __activemask() __DEF_IF_HOST
111
+
112
+ // Warp register exchange (shuffle) intrinsics.
113
+ // Notes:
114
+ // a) Warp size is hardcoded to 32 here, because the compiler does not know
115
+ // the "warpSize" constant at this time
116
+ // b) we cannot map the float __shfl to the int __shfl because it'll mess with
117
+ // the register number (especially if you're doing two shfls to move a double).
118
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
119
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) int __shfl(int var, int srcLane, int width=warpSize) __DEF_IF_HOST
120
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned int __shfl(unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
121
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) int __shfl_up(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
122
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned int __shfl_up(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
123
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) int __shfl_down(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
124
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned int __shfl_down(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
125
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) int __shfl_xor(int var, int laneMask, int width=warpSize) __DEF_IF_HOST
126
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned int __shfl_xor(unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
127
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) float __shfl(float var, int srcLane, int width=warpSize) __DEF_IF_HOST
128
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) float __shfl_up(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
129
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) float __shfl_down(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
130
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) float __shfl_xor(float var, int laneMask, int width=warpSize) __DEF_IF_HOST
131
+ #endif
132
+
133
+ __SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width=warpSize) __DEF_IF_HOST
134
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
135
+ __SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
136
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
137
+ __SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
138
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
139
+ __SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width=warpSize) __DEF_IF_HOST
140
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
141
+ __SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width=warpSize) __DEF_IF_HOST
142
+ __SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
143
+ __SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
144
+ __SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width=warpSize) __DEF_IF_HOST
145
+
146
+ // 64-bits SHFL
147
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
148
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long long __shfl(unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
149
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long long __shfl(long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
150
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long long __shfl_up(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
151
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
152
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long long __shfl_down(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
153
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
154
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long long __shfl_xor(long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
155
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
156
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) double __shfl(double var, int srcLane, int width=warpSize) __DEF_IF_HOST
157
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) double __shfl_up(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
158
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) double __shfl_down(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
159
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) double __shfl_xor(double var, int laneMask, int width=warpSize) __DEF_IF_HOST
160
+ #endif
161
+
162
+ __SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
163
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
164
+ __SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
165
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
166
+ __SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
167
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
168
+ __SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
169
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
170
+ __SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width=warpSize) __DEF_IF_HOST
171
+ __SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
172
+ __SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
173
+ __SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width=warpSize) __DEF_IF_HOST
174
+
175
+ // long needs some help to choose between 32-bits and 64-bits
176
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
177
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long __shfl(long var, int srcLane, int width=warpSize) __DEF_IF_HOST
178
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long __shfl(unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
179
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long __shfl_up(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
180
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long __shfl_up(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
181
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long __shfl_down(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
182
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long __shfl_down(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
183
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long __shfl_xor(long var, int laneMask, int width=warpSize) __DEF_IF_HOST
184
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long __shfl_xor(unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
185
+ #endif
186
+
187
+ __SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width=warpSize) __DEF_IF_HOST
188
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
189
+ __SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
190
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
191
+ __SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
192
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
193
+ __SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width=warpSize) __DEF_IF_HOST
194
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
195
+
196
+ #undef __DEPRECATED__
197
+ #undef __WSB_DEPRECATION_MESSAGE
198
+
199
+ #if defined(__local_warpSize)
200
+ #undef warpSize
201
+ #undef __local_warpSize
202
+ #endif
203
+
204
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
205
+
206
+ #endif /* __cplusplus && __CUDACC__ */
207
+
208
+ #undef __DEF_IF_HOST
209
+ #undef __SM_30_INTRINSICS_DECL__
210
+
211
+ #if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
212
+ #include "sm_30_intrinsics.hpp"
213
+ #endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
214
+
215
+ #endif /* !__SM_30_INTRINSICS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.hpp ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_30_INTRINSICS_HPP__)
51
+ #define __SM_30_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_30_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ // In here are intrinsics which are built in to the compiler. These may be
72
+ // referenced by intrinsic implementations from this file.
73
+ extern "C"
74
+ {
75
+ }
76
+
77
+ /*******************************************************************************
78
+ * *
79
+ * Below are implementations of SM-3.0 intrinsics which are included as *
80
+ * source (instead of being built in to the compiler) *
81
+ * *
82
+ *******************************************************************************/
83
+
84
+ #if !defined warpSize && !defined __local_warpSize
85
+ #define warpSize 32
86
+ #define __local_warpSize
87
+ #endif
88
+
89
+ __SM_30_INTRINSICS_DECL__
90
+ unsigned __fns(unsigned mask, unsigned base, int offset) {
91
+ extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset);
92
+ return __nvvm_fns(mask, base, offset);
93
+ }
94
+
95
+ __SM_30_INTRINSICS_DECL__
96
+ void __barrier_sync(unsigned id) {
97
+ extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id);
98
+ return __nvvm_barrier_sync(id);
99
+ }
100
+
101
+ __SM_30_INTRINSICS_DECL__
102
+ void __barrier_sync_count(unsigned id, unsigned cnt) {
103
+ extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt);
104
+ return __nvvm_barrier_sync_cnt(id, cnt);
105
+ }
106
+
107
+ __SM_30_INTRINSICS_DECL__
108
+ void __syncwarp(unsigned mask) {
109
+ extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask);
110
+ return __nvvm_bar_warp_sync(mask);
111
+ }
112
+
113
+ __SM_30_INTRINSICS_DECL__
114
+ int __all_sync(unsigned mask, int pred) {
115
+ extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred);
116
+ return __nvvm_vote_all_sync(mask, pred);
117
+ }
118
+
119
+ __SM_30_INTRINSICS_DECL__
120
+ int __any_sync(unsigned mask, int pred) {
121
+ extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred);
122
+ return __nvvm_vote_any_sync(mask, pred);
123
+ }
124
+
125
+ __SM_30_INTRINSICS_DECL__
126
+ int __uni_sync(unsigned mask, int pred) {
127
+ extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred);
128
+ return __nvvm_vote_uni_sync(mask, pred);
129
+ }
130
+
131
+ __SM_30_INTRINSICS_DECL__
132
+ unsigned __ballot_sync(unsigned mask, int pred) {
133
+ extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred);
134
+ return __nvvm_vote_ballot_sync(mask, pred);
135
+ }
136
+
137
+ __SM_30_INTRINSICS_DECL__
138
+ unsigned __activemask() {
139
+ unsigned ret;
140
+ asm volatile ("activemask.b32 %0;" : "=r"(ret));
141
+ return ret;
142
+ }
143
+
144
+ // These are removed starting with compute_70 and onwards
145
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
146
+
147
+ __SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) {
148
+ int ret;
149
+ int c = ((warpSize-width) << 8) | 0x1f;
150
+ asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(srcLane), "r"(c));
151
+ return ret;
152
+ }
153
+
154
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) {
155
+ return (unsigned int) __shfl((int)var, srcLane, width);
156
+ }
157
+
158
+ __SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) {
159
+ int ret;
160
+ int c = (warpSize-width) << 8;
161
+ asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
162
+ return ret;
163
+ }
164
+
165
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) {
166
+ return (unsigned int) __shfl_up((int)var, delta, width);
167
+ }
168
+
169
+ __SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) {
170
+ int ret;
171
+ int c = ((warpSize-width) << 8) | 0x1f;
172
+ asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
173
+ return ret;
174
+ }
175
+
176
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) {
177
+ return (unsigned int) __shfl_down((int)var, delta, width);
178
+ }
179
+
180
+ __SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) {
181
+ int ret;
182
+ int c = ((warpSize-width) << 8) | 0x1f;
183
+ asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(laneMask), "r"(c));
184
+ return ret;
185
+ }
186
+
187
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) {
188
+ return (unsigned int) __shfl_xor((int)var, laneMask, width);
189
+ }
190
+
191
+ __SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) {
192
+ float ret;
193
+ int c;
194
+ c = ((warpSize-width) << 8) | 0x1f;
195
+ asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
196
+ return ret;
197
+ }
198
+
199
+ __SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) {
200
+ float ret;
201
+ int c;
202
+ c = (warpSize-width) << 8;
203
+ asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
204
+ return ret;
205
+ }
206
+
207
+ __SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) {
208
+ float ret;
209
+ int c;
210
+ c = ((warpSize-width) << 8) | 0x1f;
211
+ asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
212
+ return ret;
213
+ }
214
+
215
+ __SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) {
216
+ float ret;
217
+ int c;
218
+ c = ((warpSize-width) << 8) | 0x1f;
219
+ asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
220
+ return ret;
221
+ }
222
+
223
+ // 64-bits SHFL
224
+
225
+ __SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) {
226
+ int lo, hi;
227
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
228
+ hi = __shfl(hi, srcLane, width);
229
+ lo = __shfl(lo, srcLane, width);
230
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
231
+ return var;
232
+ }
233
+
234
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) {
235
+ return (unsigned long long) __shfl((long long) var, srcLane, width);
236
+ }
237
+
238
+ __SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) {
239
+ int lo, hi;
240
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
241
+ hi = __shfl_up(hi, delta, width);
242
+ lo = __shfl_up(lo, delta, width);
243
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
244
+ return var;
245
+ }
246
+
247
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) {
248
+ return (unsigned long long) __shfl_up((long long) var, delta, width);
249
+ }
250
+
251
+ __SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) {
252
+ int lo, hi;
253
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
254
+ hi = __shfl_down(hi, delta, width);
255
+ lo = __shfl_down(lo, delta, width);
256
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
257
+ return var;
258
+ }
259
+
260
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) {
261
+ return (unsigned long long) __shfl_down((long long) var, delta, width);
262
+ }
263
+
264
+ __SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) {
265
+ int lo, hi;
266
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
267
+ hi = __shfl_xor(hi, laneMask, width);
268
+ lo = __shfl_xor(lo, laneMask, width);
269
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
270
+ return var;
271
+ }
272
+
273
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) {
274
+ return (unsigned long long) __shfl_xor((long long) var, laneMask, width);
275
+ }
276
+
277
+ __SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) {
278
+ unsigned lo, hi;
279
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
280
+ hi = __shfl(hi, srcLane, width);
281
+ lo = __shfl(lo, srcLane, width);
282
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
283
+ return var;
284
+ }
285
+
286
+ __SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) {
287
+ unsigned lo, hi;
288
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
289
+ hi = __shfl_up(hi, delta, width);
290
+ lo = __shfl_up(lo, delta, width);
291
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
292
+ return var;
293
+ }
294
+
295
+ __SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) {
296
+ unsigned lo, hi;
297
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
298
+ hi = __shfl_down(hi, delta, width);
299
+ lo = __shfl_down(lo, delta, width);
300
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
301
+ return var;
302
+ }
303
+
304
+ __SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) {
305
+ unsigned lo, hi;
306
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
307
+ hi = __shfl_xor(hi, laneMask, width);
308
+ lo = __shfl_xor(lo, laneMask, width);
309
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
310
+ return var;
311
+ }
312
+
313
+ __SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) {
314
+ return (sizeof(long) == sizeof(long long)) ?
315
+ __shfl((long long) var, srcLane, width) :
316
+ __shfl((int) var, srcLane, width);
317
+ }
318
+
319
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) {
320
+ return (sizeof(long) == sizeof(long long)) ?
321
+ __shfl((unsigned long long) var, srcLane, width) :
322
+ __shfl((unsigned int) var, srcLane, width);
323
+ }
324
+
325
+ __SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) {
326
+ return (sizeof(long) == sizeof(long long)) ?
327
+ __shfl_up((long long) var, delta, width) :
328
+ __shfl_up((int) var, delta, width);
329
+ }
330
+
331
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) {
332
+ return (sizeof(long) == sizeof(long long)) ?
333
+ __shfl_up((unsigned long long) var, delta, width) :
334
+ __shfl_up((unsigned int) var, delta, width);
335
+ }
336
+
337
+ __SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) {
338
+ return (sizeof(long) == sizeof(long long)) ?
339
+ __shfl_down((long long) var, delta, width) :
340
+ __shfl_down((int) var, delta, width);
341
+ }
342
+
343
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) {
344
+ return (sizeof(long) == sizeof(long long)) ?
345
+ __shfl_down((unsigned long long) var, delta, width) :
346
+ __shfl_down((unsigned int) var, delta, width);
347
+ }
348
+
349
+ __SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) {
350
+ return (sizeof(long) == sizeof(long long)) ?
351
+ __shfl_xor((long long) var, laneMask, width) :
352
+ __shfl_xor((int) var, laneMask, width);
353
+ }
354
+
355
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) {
356
+ return (sizeof(long) == sizeof(long long)) ?
357
+ __shfl_xor((unsigned long long) var, laneMask, width) :
358
+ __shfl_xor((unsigned int) var, laneMask, width);
359
+ }
360
+
361
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
362
+
363
+ // Warp register exchange (shuffle) intrinsics.
364
+ // Notes:
365
+ // a) Warp size is hardcoded to 32 here, because the compiler does not know
366
+ // the "warpSize" constant at this time
367
+ // b) we cannot map the float __shfl to the int __shfl because it'll mess with
368
+ // the register number (especially if you're doing two shfls to move a double).
369
+ __SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) {
370
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
371
+ int ret;
372
+ int c = ((warpSize-width) << 8) | 0x1f;
373
+ ret = __nvvm_shfl_idx_sync(mask, var, srcLane, c);
374
+ return ret;
375
+ }
376
+
377
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) {
378
+ return (unsigned int) __shfl_sync(mask, (int)var, srcLane, width);
379
+ }
380
+
381
+ __SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) {
382
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
383
+ int ret;
384
+ int c = (warpSize-width) << 8;
385
+ ret = __nvvm_shfl_up_sync(mask, var, delta, c);
386
+ return ret;
387
+ }
388
+
389
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
390
+ return (unsigned int) __shfl_up_sync(mask, (int)var, delta, width);
391
+ }
392
+
393
+ __SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) {
394
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
395
+ int ret;
396
+ int c = ((warpSize-width) << 8) | 0x1f;
397
+ ret = __nvvm_shfl_down_sync(mask, var, delta, c);
398
+ return ret;
399
+ }
400
+
401
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
402
+ return (unsigned int) __shfl_down_sync(mask, (int)var, delta, width);
403
+ }
404
+
405
+ __SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
406
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
407
+ int ret;
408
+ int c = ((warpSize-width) << 8) | 0x1f;
409
+ ret = __nvvm_shfl_bfly_sync(mask, var, laneMask, c);
410
+ return ret;
411
+ }
412
+
413
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) {
414
+ return (unsigned int) __shfl_xor_sync(mask, (int)var, laneMask, width);
415
+ }
416
+
417
+ __SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) {
418
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
419
+ int ret;
420
+ int c;
421
+ c = ((warpSize-width) << 8) | 0x1f;
422
+ ret = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, c);
423
+ return __int_as_float(ret);
424
+ }
425
+
426
+ __SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) {
427
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
428
+ int ret;
429
+ int c;
430
+ c = (warpSize-width) << 8;
431
+ ret = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, c);
432
+ return __int_as_float(ret);
433
+ }
434
+
435
+ __SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) {
436
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
437
+ int ret;
438
+ int c;
439
+ c = ((warpSize-width) << 8) | 0x1f;
440
+ ret = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, c);
441
+ return __int_as_float(ret);
442
+ }
443
+
444
+ __SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
445
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
446
+ int ret;
447
+ int c;
448
+ c = ((warpSize-width) << 8) | 0x1f;
449
+ ret = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, c);
450
+ return __int_as_float(ret);
451
+ }
452
+
453
+ // 64-bits SHFL
454
+ __SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) {
455
+ int lo, hi;
456
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
457
+ hi = __shfl_sync(mask, hi, srcLane, width);
458
+ lo = __shfl_sync(mask, lo, srcLane, width);
459
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
460
+ return var;
461
+ }
462
+
463
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) {
464
+ return (unsigned long long) __shfl_sync(mask, (long long) var, srcLane, width);
465
+ }
466
+
467
+ __SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) {
468
+ int lo, hi;
469
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
470
+ hi = __shfl_up_sync(mask, hi, delta, width);
471
+ lo = __shfl_up_sync(mask, lo, delta, width);
472
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
473
+ return var;
474
+ }
475
+
476
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
477
+ return (unsigned long long) __shfl_up_sync(mask, (long long) var, delta, width);
478
+ }
479
+
480
+ __SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) {
481
+ int lo, hi;
482
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
483
+ hi = __shfl_down_sync(mask, hi, delta, width);
484
+ lo = __shfl_down_sync(mask, lo, delta, width);
485
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
486
+ return var;
487
+ }
488
+
489
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
490
+ return (unsigned long long) __shfl_down_sync(mask, (long long) var, delta, width);
491
+ }
492
+
493
+ __SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) {
494
+ int lo, hi;
495
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
496
+ hi = __shfl_xor_sync(mask, hi, laneMask, width);
497
+ lo = __shfl_xor_sync(mask, lo, laneMask, width);
498
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
499
+ return var;
500
+ }
501
+
502
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) {
503
+ return (unsigned long long) __shfl_xor_sync(mask, (long long) var, laneMask, width);
504
+ }
505
+
506
+ __SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) {
507
+ unsigned lo, hi;
508
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
509
+ hi = __shfl_sync(mask, hi, srcLane, width);
510
+ lo = __shfl_sync(mask, lo, srcLane, width);
511
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
512
+ return var;
513
+ }
514
+
515
+ __SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) {
516
+ unsigned lo, hi;
517
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
518
+ hi = __shfl_up_sync(mask, hi, delta, width);
519
+ lo = __shfl_up_sync(mask, lo, delta, width);
520
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
521
+ return var;
522
+ }
523
+
524
+ __SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) {
525
+ unsigned lo, hi;
526
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
527
+ hi = __shfl_down_sync(mask, hi, delta, width);
528
+ lo = __shfl_down_sync(mask, lo, delta, width);
529
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
530
+ return var;
531
+ }
532
+
533
+ __SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) {
534
+ unsigned lo, hi;
535
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
536
+ hi = __shfl_xor_sync(mask, hi, laneMask, width);
537
+ lo = __shfl_xor_sync(mask, lo, laneMask, width);
538
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
539
+ return var;
540
+ }
541
+
542
+ // long needs some help to choose between 32-bits and 64-bits
543
+
544
+ __SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) {
545
+ return (sizeof(long) == sizeof(long long)) ?
546
+ __shfl_sync(mask, (long long) var, srcLane, width) :
547
+ __shfl_sync(mask, (int) var, srcLane, width);
548
+ }
549
+
550
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) {
551
+ return (sizeof(long) == sizeof(long long)) ?
552
+ __shfl_sync(mask, (unsigned long long) var, srcLane, width) :
553
+ __shfl_sync(mask, (unsigned int) var, srcLane, width);
554
+ }
555
+
556
+ __SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) {
557
+ return (sizeof(long) == sizeof(long long)) ?
558
+ __shfl_up_sync(mask, (long long) var, delta, width) :
559
+ __shfl_up_sync(mask, (int) var, delta, width);
560
+ }
561
+
562
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
563
+ return (sizeof(long) == sizeof(long long)) ?
564
+ __shfl_up_sync(mask, (unsigned long long) var, delta, width) :
565
+ __shfl_up_sync(mask, (unsigned int) var, delta, width);
566
+ }
567
+
568
+ __SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) {
569
+ return (sizeof(long) == sizeof(long long)) ?
570
+ __shfl_down_sync(mask, (long long) var, delta, width) :
571
+ __shfl_down_sync(mask, (int) var, delta, width);
572
+ }
573
+
574
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
575
+ return (sizeof(long) == sizeof(long long)) ?
576
+ __shfl_down_sync(mask, (unsigned long long) var, delta, width) :
577
+ __shfl_down_sync(mask, (unsigned int) var, delta, width);
578
+ }
579
+
580
+ __SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) {
581
+ return (sizeof(long) == sizeof(long long)) ?
582
+ __shfl_xor_sync(mask, (long long) var, laneMask, width) :
583
+ __shfl_xor_sync(mask, (int) var, laneMask, width);
584
+ }
585
+
586
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) {
587
+ return (sizeof(long) == sizeof(long long)) ?
588
+ __shfl_xor_sync(mask, (unsigned long long) var, laneMask, width) :
589
+ __shfl_xor_sync(mask, (unsigned int) var, laneMask, width);
590
+ }
591
+
592
+ #if defined(__local_warpSize)
593
+ #undef warpSize
594
+ #undef __local_warpSize
595
+ #endif
596
+
597
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
598
+
599
+ #endif /* __cplusplus && __CUDACC__ */
600
+
601
+ #undef __SM_30_INTRINSICS_DECL__
602
+
603
+ #endif /* !__SM_30_INTRINSICS_HPP__ */
604
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_atomic_functions.h ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.35.235 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_35_ATOMIC_FUNCTIONS_H__)
51
+ #define __SM_35_ATOMIC_FUNCTIONS_H__
52
+
53
+ /*******************************************************************************
54
+ * All sm_35 atomics are supported by sm_32 so simply include its header file *
55
+ *******************************************************************************/
56
+ #include "sm_32_atomic_functions.h"
57
+
58
+ #endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.hpp ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_61_INTRINSICS_HPP__)
51
+ #define __SM_61_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_61_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * Below are implementations of SM-6.1 intrinsics which are included as *
74
+ * source (instead of being built in to the compiler) *
75
+ * *
76
+ *******************************************************************************/
77
+
78
+ // 4a
79
+ __SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) {
80
+ int ret;
81
+ asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
82
+ return ret;
83
+ }
84
+
85
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) {
86
+ unsigned int ret;
87
+ asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
88
+ return ret;
89
+ }
90
+
91
+ __SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) {
92
+ int ret;
93
+ asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
94
+ return ret;
95
+ }
96
+
97
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) {
98
+ unsigned int ret;
99
+ asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
100
+ return ret;
101
+ }
102
+
103
+ // 2a.lo
104
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) {
105
+ int ret;
106
+ asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
107
+ return ret;
108
+ }
109
+
110
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) {
111
+ unsigned int ret;
112
+ asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
113
+ return ret;
114
+ }
115
+
116
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) {
117
+ int ret;
118
+ asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
119
+ return ret;
120
+ }
121
+
122
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) {
123
+ unsigned int ret;
124
+ asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
125
+ return ret;
126
+ }
127
+
128
+ // 2a.hi
129
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) {
130
+ int ret;
131
+ asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
132
+ return ret;
133
+ }
134
+
135
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) {
136
+ unsigned int ret;
137
+ asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
138
+ return ret;
139
+ }
140
+
141
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) {
142
+ int ret;
143
+ asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
144
+ return ret;
145
+ }
146
+
147
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) {
148
+ unsigned int ret;
149
+ asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
150
+ return ret;
151
+ }
152
+
153
+
154
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
155
+
156
+ #endif /* __cplusplus && __CUDACC__ */
157
+
158
+ #undef __SM_61_INTRINSICS_DECL__
159
+
160
+ #endif /* !__SM_61_INTRINSICS_HPP__ */
161
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_indirect_functions.h ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #ifndef __SURFACE_INDIRECT_FUNCTIONS_H__
52
+ #define __SURFACE_INDIRECT_FUNCTIONS_H__
53
+
54
+
55
+ #if defined(__cplusplus) && defined(__CUDACC__)
56
+
57
+ #include "cuda_runtime_api.h"
58
+
59
+ template<typename T> struct __nv_isurf_trait { };
60
+ template<> struct __nv_isurf_trait<char> { typedef void type; };
61
+ template<> struct __nv_isurf_trait<signed char> { typedef void type; };
62
+ template<> struct __nv_isurf_trait<char1> { typedef void type; };
63
+ template<> struct __nv_isurf_trait<unsigned char> { typedef void type; };
64
+ template<> struct __nv_isurf_trait<uchar1> { typedef void type; };
65
+ template<> struct __nv_isurf_trait<short> { typedef void type; };
66
+ template<> struct __nv_isurf_trait<short1> { typedef void type; };
67
+ template<> struct __nv_isurf_trait<unsigned short> { typedef void type; };
68
+ template<> struct __nv_isurf_trait<ushort1> { typedef void type; };
69
+ template<> struct __nv_isurf_trait<int> { typedef void type; };
70
+ template<> struct __nv_isurf_trait<int1> { typedef void type; };
71
+ template<> struct __nv_isurf_trait<unsigned int> { typedef void type; };
72
+ template<> struct __nv_isurf_trait<uint1> { typedef void type; };
73
+ template<> struct __nv_isurf_trait<long long> { typedef void type; };
74
+ template<> struct __nv_isurf_trait<longlong1> { typedef void type; };
75
+ template<> struct __nv_isurf_trait<unsigned long long> { typedef void type; };
76
+ template<> struct __nv_isurf_trait<ulonglong1> { typedef void type; };
77
+ template<> struct __nv_isurf_trait<float> { typedef void type; };
78
+ template<> struct __nv_isurf_trait<float1> { typedef void type; };
79
+
80
+ template<> struct __nv_isurf_trait<char2> { typedef void type; };
81
+ template<> struct __nv_isurf_trait<uchar2> { typedef void type; };
82
+ template<> struct __nv_isurf_trait<short2> { typedef void type; };
83
+ template<> struct __nv_isurf_trait<ushort2> { typedef void type; };
84
+ template<> struct __nv_isurf_trait<int2> { typedef void type; };
85
+ template<> struct __nv_isurf_trait<uint2> { typedef void type; };
86
+ template<> struct __nv_isurf_trait<longlong2> { typedef void type; };
87
+ template<> struct __nv_isurf_trait<ulonglong2> { typedef void type; };
88
+ template<> struct __nv_isurf_trait<float2> { typedef void type; };
89
+
90
+ template<> struct __nv_isurf_trait<char4> { typedef void type; };
91
+ template<> struct __nv_isurf_trait<uchar4> { typedef void type; };
92
+ template<> struct __nv_isurf_trait<short4> { typedef void type; };
93
+ template<> struct __nv_isurf_trait<ushort4> { typedef void type; };
94
+ template<> struct __nv_isurf_trait<int4> { typedef void type; };
95
+ template<> struct __nv_isurf_trait<uint4> { typedef void type; };
96
+ template<> struct __nv_isurf_trait<float4> { typedef void type; };
97
+
98
+
99
+ template <typename T>
100
+ static __device__ typename __nv_isurf_trait<T>::type surf1Dread(T *ptr, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
101
+ {
102
+ #ifdef __CUDA_ARCH__
103
+ __nv_tex_surf_handler("__isurf1Dread", ptr, obj, x, mode);
104
+ #endif /* __CUDA_ARCH__ */
105
+ }
106
+
107
+ template <class T>
108
+ static __device__ T surf1Dread(cudaSurfaceObject_t surfObject, int x, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
109
+ {
110
+ #ifdef __CUDA_ARCH__
111
+ T ret;
112
+ surf1Dread(&ret, surfObject, x, boundaryMode);
113
+ return ret;
114
+ #endif /* __CUDA_ARCH__ */
115
+ }
116
+
117
+ template <typename T>
118
+ static __device__ typename __nv_isurf_trait<T>::type surf2Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
119
+ {
120
+ #ifdef __CUDA_ARCH__
121
+ __nv_tex_surf_handler("__isurf2Dread", ptr, obj, x, y, mode);
122
+ #endif /* __CUDA_ARCH__ */
123
+ }
124
+
125
+ template <class T>
126
+ static __device__ T surf2Dread(cudaSurfaceObject_t surfObject, int x, int y, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
127
+ {
128
+ #ifdef __CUDA_ARCH__
129
+ T ret;
130
+ surf2Dread(&ret, surfObject, x, y, boundaryMode);
131
+ return ret;
132
+ #endif /* __CUDA_ARCH__ */
133
+ }
134
+
135
+
136
+ template <typename T>
137
+ static __device__ typename __nv_isurf_trait<T>::type surf3Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
138
+ {
139
+ #ifdef __CUDA_ARCH__
140
+ __nv_tex_surf_handler("__isurf3Dread", ptr, obj, x, y, z, mode);
141
+ #endif /* __CUDA_ARCH__ */
142
+ }
143
+
144
+ template <class T>
145
+ static __device__ T surf3Dread(cudaSurfaceObject_t surfObject, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
146
+ {
147
+ #ifdef __CUDA_ARCH__
148
+ T ret;
149
+ surf3Dread(&ret, surfObject, x, y, z, boundaryMode);
150
+ return ret;
151
+ #endif /* __CUDA_ARCH__ */
152
+ }
153
+
154
+ template <typename T>
155
+ static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
156
+ {
157
+ #ifdef __CUDA_ARCH__
158
+ __nv_tex_surf_handler("__isurf1DLayeredread", ptr, obj, x, layer, mode);
159
+ #endif /* __CUDA_ARCH__ */
160
+ }
161
+
162
+ template <class T>
163
+ static __device__ T surf1DLayeredread(cudaSurfaceObject_t surfObject, int x, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
164
+ {
165
+ #ifdef __CUDA_ARCH__
166
+ T ret;
167
+ surf1DLayeredread(&ret, surfObject, x, layer, boundaryMode);
168
+ return ret;
169
+ #endif /* __CUDA_ARCH__ */
170
+ }
171
+
172
+ template <typename T>
173
+ static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
174
+ {
175
+ #ifdef __CUDA_ARCH__
176
+ __nv_tex_surf_handler("__isurf2DLayeredread", ptr, obj, x, y, layer, mode);
177
+ #endif /* __CUDA_ARCH__ */
178
+ }
179
+
180
+ template <class T>
181
+ static __device__ T surf2DLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
182
+ {
183
+ #ifdef __CUDA_ARCH__
184
+ T ret;
185
+ surf2DLayeredread(&ret, surfObject, x, y, layer, boundaryMode);
186
+ return ret;
187
+ #endif /* __CUDA_ARCH__ */
188
+ }
189
+
190
+ template <typename T>
191
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
192
+ {
193
+ #ifdef __CUDA_ARCH__
194
+ __nv_tex_surf_handler("__isurfCubemapread", ptr, obj, x, y, face, mode);
195
+ #endif /* __CUDA_ARCH__ */
196
+ }
197
+
198
+ template <class T>
199
+ static __device__ T surfCubemapread(cudaSurfaceObject_t surfObject, int x, int y, int face, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
200
+ {
201
+ #ifdef __CUDA_ARCH__
202
+ T ret;
203
+ surfCubemapread(&ret, surfObject, x, y, face, boundaryMode);
204
+ return ret;
205
+ #endif /* __CUDA_ARCH__ */
206
+ }
207
+
208
+ template <typename T>
209
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
210
+ {
211
+ #ifdef __CUDA_ARCH__
212
+ __nv_tex_surf_handler("__isurfCubemapLayeredread", ptr, obj, x, y, layerface, mode);
213
+ #endif /* __CUDA_ARCH__ */
214
+ }
215
+
216
+ template <class T>
217
+ static __device__ T surfCubemapLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layerface, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
218
+ {
219
+ #ifdef __CUDA_ARCH__
220
+ T ret;
221
+ surfCubemapLayeredread(&ret, surfObject, x, y, layerface, boundaryMode);
222
+ return ret;
223
+ #endif /* __CUDA_ARCH__ */
224
+ }
225
+
226
+ template <typename T>
227
+ static __device__ typename __nv_isurf_trait<T>::type surf1Dwrite(T val, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
228
+ {
229
+ #ifdef __CUDA_ARCH__
230
+ __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, obj, x, mode);
231
+ #endif /* __CUDA_ARCH__ */
232
+ }
233
+
234
+ template <typename T>
235
+ static __device__ typename __nv_isurf_trait<T>::type surf2Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
236
+ {
237
+ #ifdef __CUDA_ARCH__
238
+ __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, obj, x, y, mode);
239
+ #endif /* __CUDA_ARCH__ */
240
+ }
241
+
242
+ template <typename T>
243
+ static __device__ typename __nv_isurf_trait<T>::type surf3Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
244
+ {
245
+ #ifdef __CUDA_ARCH__
246
+ __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, obj, x, y, z, mode);
247
+ #endif /* __CUDA_ARCH__ */
248
+ }
249
+
250
+ template <typename T>
251
+ static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
252
+ {
253
+ #ifdef __CUDA_ARCH__
254
+ __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, obj, x, layer, mode);
255
+ #endif /* __CUDA_ARCH__ */
256
+ }
257
+
258
+ template <typename T>
259
+ static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
260
+ {
261
+ #ifdef __CUDA_ARCH__
262
+ __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, obj, x, y, layer, mode);
263
+ #endif /* __CUDA_ARCH__ */
264
+ }
265
+
266
+ template <typename T>
267
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapwrite(T val, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
268
+ {
269
+ #ifdef __CUDA_ARCH__
270
+ __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, obj, x, y, face, mode);
271
+ #endif /* __CUDA_ARCH__ */
272
+ }
273
+
274
+ template <typename T>
275
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
276
+ {
277
+ #ifdef __CUDA_ARCH__
278
+ __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, obj, x, y, layerface, mode);
279
+ #endif /* __CUDA_ARCH__ */
280
+ }
281
+
282
+ #endif // __cplusplus && __CUDACC__
283
+
284
+ #endif // __SURFACE_INDIRECT_FUNCTIONS_H__
285
+
286
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_types.h ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SURFACE_TYPES_H__)
51
+ #define __SURFACE_TYPES_H__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "driver_types.h"
60
+
61
+ /**
62
+ * \addtogroup CUDART_TYPES
63
+ *
64
+ * @{
65
+ */
66
+
67
+ /*******************************************************************************
68
+ * *
69
+ * *
70
+ * *
71
+ *******************************************************************************/
72
+
73
+ #define cudaSurfaceType1D 0x01
74
+ #define cudaSurfaceType2D 0x02
75
+ #define cudaSurfaceType3D 0x03
76
+ #define cudaSurfaceTypeCubemap 0x0C
77
+ #define cudaSurfaceType1DLayered 0xF1
78
+ #define cudaSurfaceType2DLayered 0xF2
79
+ #define cudaSurfaceTypeCubemapLayered 0xFC
80
+
81
+ /**
82
+ * CUDA Surface boundary modes
83
+ */
84
+ enum __device_builtin__ cudaSurfaceBoundaryMode
85
+ {
86
+ cudaBoundaryModeZero = 0, /**< Zero boundary mode */
87
+ cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
88
+ cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
89
+ };
90
+
91
+ /**
92
+ * CUDA Surface format modes
93
+ */
94
+ enum __device_builtin__ cudaSurfaceFormatMode
95
+ {
96
+ cudaFormatModeForced = 0, /**< Forced format mode */
97
+ cudaFormatModeAuto = 1 /**< Auto format mode */
98
+ };
99
+
100
+ /**
101
+ * CUDA Surface reference
102
+ */
103
+ struct __device_builtin__ surfaceReference
104
+ {
105
+ /**
106
+ * Channel descriptor for surface reference
107
+ */
108
+ struct cudaChannelFormatDesc channelDesc;
109
+ };
110
+
111
+ /**
112
+ * An opaque value that represents a CUDA Surface object
113
+ */
114
+ typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
115
+
116
+ /** @} */
117
+ /** @} */ /* END CUDART_TYPES */
118
+
119
+ #endif /* !__SURFACE_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_indirect_functions.h ADDED
@@ -0,0 +1,771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__
52
+ #define __TEXTURE_INDIRECT_FUNCTIONS_H__
53
+
54
+
55
+ #if defined(__cplusplus) && defined(__CUDACC__)
56
+
57
+ #include "cuda_runtime_api.h"
58
+
59
+
60
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
61
+ #define __NV_TEX_SPARSE 1
62
+ #endif /* endif */
63
+
64
+ template <typename T> struct __nv_itex_trait { };
65
+ template<> struct __nv_itex_trait<char> { typedef void type; };
66
+ template<> struct __nv_itex_trait<signed char> { typedef void type; };
67
+ template<> struct __nv_itex_trait<char1> { typedef void type; };
68
+ template<> struct __nv_itex_trait<char2> { typedef void type; };
69
+ template<> struct __nv_itex_trait<char4> { typedef void type; };
70
+ template<> struct __nv_itex_trait<unsigned char> { typedef void type; };
71
+ template<> struct __nv_itex_trait<uchar1> { typedef void type; };
72
+ template<> struct __nv_itex_trait<uchar2> { typedef void type; };
73
+ template<> struct __nv_itex_trait<uchar4> { typedef void type; };
74
+ template<> struct __nv_itex_trait<short> { typedef void type; };
75
+ template<> struct __nv_itex_trait<short1> { typedef void type; };
76
+ template<> struct __nv_itex_trait<short2> { typedef void type; };
77
+ template<> struct __nv_itex_trait<short4> { typedef void type; };
78
+ template<> struct __nv_itex_trait<unsigned short> { typedef void type; };
79
+ template<> struct __nv_itex_trait<ushort1> { typedef void type; };
80
+ template<> struct __nv_itex_trait<ushort2> { typedef void type; };
81
+ template<> struct __nv_itex_trait<ushort4> { typedef void type; };
82
+ template<> struct __nv_itex_trait<int> { typedef void type; };
83
+ template<> struct __nv_itex_trait<int1> { typedef void type; };
84
+ template<> struct __nv_itex_trait<int2> { typedef void type; };
85
+ template<> struct __nv_itex_trait<int4> { typedef void type; };
86
+ template<> struct __nv_itex_trait<unsigned int> { typedef void type; };
87
+ template<> struct __nv_itex_trait<uint1> { typedef void type; };
88
+ template<> struct __nv_itex_trait<uint2> { typedef void type; };
89
+ template<> struct __nv_itex_trait<uint4> { typedef void type; };
90
+ #if !defined(__LP64__)
91
+ template<> struct __nv_itex_trait<long> { typedef void type; };
92
+ template<> struct __nv_itex_trait<long1> { typedef void type; };
93
+ template<> struct __nv_itex_trait<long2> { typedef void type; };
94
+ template<> struct __nv_itex_trait<long4> { typedef void type; };
95
+ template<> struct __nv_itex_trait<unsigned long> { typedef void type; };
96
+ template<> struct __nv_itex_trait<ulong1> { typedef void type; };
97
+ template<> struct __nv_itex_trait<ulong2> { typedef void type; };
98
+ template<> struct __nv_itex_trait<ulong4> { typedef void type; };
99
+ #endif /* !__LP64__ */
100
+ template<> struct __nv_itex_trait<float> { typedef void type; };
101
+ template<> struct __nv_itex_trait<float1> { typedef void type; };
102
+ template<> struct __nv_itex_trait<float2> { typedef void type; };
103
+ template<> struct __nv_itex_trait<float4> { typedef void type; };
104
+
105
+
106
+
107
+ template <typename T>
108
+ static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x)
109
+ {
110
+ #ifdef __CUDA_ARCH__
111
+ __nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x);
112
+ #endif
113
+ }
114
+
115
+ template <class T>
116
+ static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x)
117
+ {
118
+ #ifdef __CUDA_ARCH__
119
+ T ret;
120
+ tex1Dfetch(&ret, texObject, x);
121
+ return ret;
122
+ #endif
123
+ }
124
+
125
+ template <typename T>
126
+ static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x)
127
+ {
128
+ #ifdef __CUDA_ARCH__
129
+ __nv_tex_surf_handler("__itex1D", ptr, obj, x);
130
+ #endif
131
+ }
132
+
133
+
134
+ template <class T>
135
+ static __device__ T tex1D(cudaTextureObject_t texObject, float x)
136
+ {
137
+ #ifdef __CUDA_ARCH__
138
+ T ret;
139
+ tex1D(&ret, texObject, x);
140
+ return ret;
141
+ #endif
142
+ }
143
+
144
+
145
+ template <typename T>
146
+ static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y)
147
+ {
148
+ #ifdef __CUDA_ARCH__
149
+ __nv_tex_surf_handler("__itex2D", ptr, obj, x, y);
150
+ #endif
151
+ }
152
+
153
+ template <class T>
154
+ static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
155
+ {
156
+ #ifdef __CUDA_ARCH__
157
+ T ret;
158
+ tex2D(&ret, texObject, x, y);
159
+ return ret;
160
+ #endif
161
+ }
162
+
163
+ #if __NV_TEX_SPARSE
164
+ template <typename T>
165
+ static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y,
166
+ bool* isResident)
167
+ {
168
+ #ifdef __CUDA_ARCH__
169
+ unsigned char res;
170
+ __nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &res);
171
+ *isResident = (res != 0);
172
+ #endif
173
+ }
174
+
175
+ template <class T>
176
+ static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y, bool* isResident)
177
+ {
178
+ #ifdef __CUDA_ARCH__
179
+ T ret;
180
+ tex2D(&ret, texObject, x, y, isResident);
181
+ return ret;
182
+ #endif
183
+ }
184
+
185
+ #endif /* __NV_TEX_SPARSE */
186
+
187
+
188
+ template <typename T>
189
+ static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
190
+ {
191
+ #ifdef __CUDA_ARCH__
192
+ __nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z);
193
+ #endif
194
+ }
195
+
196
+ template <class T>
197
+ static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z)
198
+ {
199
+ #ifdef __CUDA_ARCH__
200
+ T ret;
201
+ tex3D(&ret, texObject, x, y, z);
202
+ return ret;
203
+ #endif
204
+ }
205
+
206
+ #if __NV_TEX_SPARSE
207
+ template <typename T>
208
+ static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z,
209
+ bool* isResident)
210
+ {
211
+ #ifdef __CUDA_ARCH__
212
+ unsigned char res;
213
+ __nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &res);
214
+ *isResident = (res != 0);
215
+ #endif
216
+ }
217
+
218
+ template <class T>
219
+ static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool* isResident)
220
+ {
221
+ #ifdef __CUDA_ARCH__
222
+ T ret;
223
+ tex3D(&ret, texObject, x, y, z, isResident);
224
+ return ret;
225
+ #endif
226
+ }
227
+ #endif /* __NV_TEX_SPARSE */
228
+
229
+
230
+ template <typename T>
231
+ static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer)
232
+ {
233
+ #ifdef __CUDA_ARCH__
234
+ __nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer);
235
+ #endif
236
+ }
237
+
238
+ template <class T>
239
+ static __device__ T tex1DLayered(cudaTextureObject_t texObject, float x, int layer)
240
+ {
241
+ #ifdef __CUDA_ARCH__
242
+ T ret;
243
+ tex1DLayered(&ret, texObject, x, layer);
244
+ return ret;
245
+ #endif
246
+ }
247
+
248
+ template <typename T>
249
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer)
250
+ {
251
+ #ifdef __CUDA_ARCH__
252
+ __nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer);
253
+ #endif
254
+ }
255
+
256
+ template <class T>
257
+ static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer)
258
+ {
259
+ #ifdef __CUDA_ARCH__
260
+ T ret;
261
+ tex2DLayered(&ret, texObject, x, y, layer);
262
+ return ret;
263
+ #endif
264
+ }
265
+
266
+ #if __NV_TEX_SPARSE
267
+ template <typename T>
268
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool* isResident)
269
+ {
270
+ #ifdef __CUDA_ARCH__
271
+ unsigned char res;
272
+ __nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &res);
273
+ *isResident = (res != 0);
274
+ #endif
275
+ }
276
+
277
+ template <class T>
278
+ static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool* isResident)
279
+ {
280
+ #ifdef __CUDA_ARCH__
281
+ T ret;
282
+ tex2DLayered(&ret, texObject, x, y, layer, isResident);
283
+ return ret;
284
+ #endif
285
+ }
286
+ #endif /* __NV_TEX_SPARSE */
287
+
288
+
289
+ template <typename T>
290
+ static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
291
+ {
292
+ #ifdef __CUDA_ARCH__
293
+ __nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z);
294
+ #endif
295
+ }
296
+
297
+
298
+ template <class T>
299
+ static __device__ T texCubemap(cudaTextureObject_t texObject, float x, float y, float z)
300
+ {
301
+ #ifdef __CUDA_ARCH__
302
+ T ret;
303
+ texCubemap(&ret, texObject, x, y, z);
304
+ return ret;
305
+ #endif
306
+ }
307
+
308
+
309
+ template <typename T>
310
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer)
311
+ {
312
+ #ifdef __CUDA_ARCH__
313
+ __nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer);
314
+ #endif
315
+ }
316
+
317
+ template <class T>
318
+ static __device__ T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer)
319
+ {
320
+ #ifdef __CUDA_ARCH__
321
+ T ret;
322
+ texCubemapLayered(&ret, texObject, x, y, z, layer);
323
+ return ret;
324
+ #endif
325
+ }
326
+
327
+ template <typename T>
328
+ static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0)
329
+ {
330
+ #ifdef __CUDA_ARCH__
331
+ __nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp);
332
+ #endif
333
+ }
334
+
335
+ template <class T>
336
+ static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0)
337
+ {
338
+ #ifdef __CUDA_ARCH__
339
+ T ret;
340
+ tex2Dgather(&ret, to, x, y, comp);
341
+ return ret;
342
+ #endif
343
+ }
344
+
345
+ #if __NV_TEX_SPARSE
346
+ template <typename T>
347
+ static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool* isResident, int comp = 0)
348
+ {
349
+ #ifdef __CUDA_ARCH__
350
+ unsigned char res;
351
+ __nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp, &res);
352
+ *isResident = (res != 0);
353
+ #endif
354
+ }
355
+
356
+ template <class T>
357
+ static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, bool* isResident, int comp = 0)
358
+ {
359
+ #ifdef __CUDA_ARCH__
360
+ T ret;
361
+ tex2Dgather(&ret, to, x, y, isResident, comp);
362
+ return ret;
363
+ #endif
364
+ }
365
+
366
+ #endif /* __NV_TEX_SPARSE */
367
+
368
+ template <typename T>
369
+ static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level)
370
+ {
371
+ #ifdef __CUDA_ARCH__
372
+ __nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level);
373
+ #endif
374
+ }
375
+
376
+ template <class T>
377
+ static __device__ T tex1DLod(cudaTextureObject_t texObject, float x, float level)
378
+ {
379
+ #ifdef __CUDA_ARCH__
380
+ T ret;
381
+ tex1DLod(&ret, texObject, x, level);
382
+ return ret;
383
+ #endif
384
+ }
385
+
386
+
387
+ template <typename T>
388
+ static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level)
389
+ {
390
+ #ifdef __CUDA_ARCH__
391
+ __nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level);
392
+ #endif
393
+ }
394
+
395
+ template <class T>
396
+ static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level)
397
+ {
398
+ #ifdef __CUDA_ARCH__
399
+ T ret;
400
+ tex2DLod(&ret, texObject, x, y, level);
401
+ return ret;
402
+ #endif
403
+ }
404
+
405
+ #if __NV_TEX_SPARSE
406
+
407
+ template <typename T>
408
+ static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool* isResident)
409
+ {
410
+ #ifdef __CUDA_ARCH__
411
+ unsigned char res;
412
+ __nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &res);
413
+ *isResident = (res != 0);
414
+ #endif
415
+ }
416
+
417
+ template <class T>
418
+ static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool* isResident)
419
+ {
420
+ #ifdef __CUDA_ARCH__
421
+ T ret;
422
+ tex2DLod(&ret, texObject, x, y, level, isResident);
423
+ return ret;
424
+ #endif
425
+ }
426
+
427
+ #endif /* __NV_TEX_SPARSE */
428
+
429
+
430
+ template <typename T>
431
+ static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
432
+ {
433
+ #ifdef __CUDA_ARCH__
434
+ __nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level);
435
+ #endif
436
+ }
437
+
438
+ template <class T>
439
+ static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
440
+ {
441
+ #ifdef __CUDA_ARCH__
442
+ T ret;
443
+ tex3DLod(&ret, texObject, x, y, z, level);
444
+ return ret;
445
+ #endif
446
+ }
447
+
448
+ #if __NV_TEX_SPARSE
449
+ template <typename T>
450
+ static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool* isResident)
451
+ {
452
+ #ifdef __CUDA_ARCH__
453
+ unsigned char res;
454
+ __nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &res);
455
+ *isResident = (res != 0);
456
+ #endif
457
+ }
458
+
459
+ template <class T>
460
+ static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool* isResident)
461
+ {
462
+ #ifdef __CUDA_ARCH__
463
+ T ret;
464
+ tex3DLod(&ret, texObject, x, y, z, level, isResident);
465
+ return ret;
466
+ #endif
467
+ }
468
+
469
+ #endif /* __NV_TEX_SPARSE */
470
+
471
+
472
+ template <typename T>
473
+ static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level)
474
+ {
475
+ #ifdef __CUDA_ARCH__
476
+ __nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level);
477
+ #endif
478
+ }
479
+
480
+ template <class T>
481
+ static __device__ T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level)
482
+ {
483
+ #ifdef __CUDA_ARCH__
484
+ T ret;
485
+ tex1DLayeredLod(&ret, texObject, x, layer, level);
486
+ return ret;
487
+ #endif
488
+ }
489
+
490
+
491
+ template <typename T>
492
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level)
493
+ {
494
+ #ifdef __CUDA_ARCH__
495
+ __nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level);
496
+ #endif
497
+ }
498
+
499
+ template <class T>
500
+ static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level)
501
+ {
502
+ #ifdef __CUDA_ARCH__
503
+ T ret;
504
+ tex2DLayeredLod(&ret, texObject, x, y, layer, level);
505
+ return ret;
506
+ #endif
507
+ }
508
+
509
+ #if __NV_TEX_SPARSE
510
+ template <typename T>
511
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool* isResident)
512
+ {
513
+ #ifdef __CUDA_ARCH__
514
+ unsigned char res;
515
+ __nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &res);
516
+ *isResident = (res != 0);
517
+ #endif
518
+ }
519
+
520
+ template <class T>
521
+ static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool* isResident)
522
+ {
523
+ #ifdef __CUDA_ARCH__
524
+ T ret;
525
+ tex2DLayeredLod(&ret, texObject, x, y, layer, level, isResident);
526
+ return ret;
527
+ #endif
528
+ }
529
+ #endif /* __NV_TEX_SPARSE */
530
+
531
+ template <typename T>
532
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
533
+ {
534
+ #ifdef __CUDA_ARCH__
535
+ __nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level);
536
+ #endif
537
+ }
538
+
539
+ template <class T>
540
+ static __device__ T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
541
+ {
542
+ #ifdef __CUDA_ARCH__
543
+ T ret;
544
+ texCubemapLod(&ret, texObject, x, y, z, level);
545
+ return ret;
546
+ #endif
547
+ }
548
+
549
+
550
+ template <typename T>
551
+ static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
552
+ {
553
+ #ifdef __CUDA_ARCH__
554
+ __nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
555
+ #endif
556
+ }
557
+
558
+ template <class T>
559
+ static __device__ T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
560
+ {
561
+ #ifdef __CUDA_ARCH__
562
+ T ret;
563
+ texCubemapGrad(&ret, texObject, x, y, z, dPdx, dPdy);
564
+ return ret;
565
+ #endif
566
+ }
567
+
568
+ template <typename T>
569
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level)
570
+ {
571
+ #ifdef __CUDA_ARCH__
572
+ __nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level);
573
+ #endif
574
+ }
575
+
576
+ template <class T>
577
+ static __device__ T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level)
578
+ {
579
+ #ifdef __CUDA_ARCH__
580
+ T ret;
581
+ texCubemapLayeredLod(&ret, texObject, x, y, z, layer, level);
582
+ return ret;
583
+ #endif
584
+ }
585
+
586
+ template <typename T>
587
+ static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy)
588
+ {
589
+ #ifdef __CUDA_ARCH__
590
+ __nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy);
591
+ #endif
592
+ }
593
+
594
+ template <class T>
595
+ static __device__ T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy)
596
+ {
597
+ #ifdef __CUDA_ARCH__
598
+ T ret;
599
+ tex1DGrad(&ret, texObject, x, dPdx, dPdy);
600
+ return ret;
601
+ #endif
602
+ }
603
+
604
+
605
+ template <typename T>
606
+ static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy)
607
+ {
608
+ #ifdef __CUDA_ARCH__
609
+ __nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy);
610
+ #endif
611
+
612
+ }
613
+
614
+ template <class T>
615
+ static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy)
616
+ {
617
+ #ifdef __CUDA_ARCH__
618
+ T ret;
619
+ tex2DGrad(&ret, texObject, x, y, dPdx, dPdy);
620
+ return ret;
621
+ #endif
622
+ }
623
+
624
+ #if __NV_TEX_SPARSE
625
+ template <typename T>
626
+ static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
627
+ {
628
+ #ifdef __CUDA_ARCH__
629
+ unsigned char res;
630
+ __nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &res);
631
+ *isResident = (res != 0);
632
+ #endif
633
+
634
+ }
635
+
636
+ template <class T>
637
+ static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
638
+ {
639
+ #ifdef __CUDA_ARCH__
640
+ T ret;
641
+ tex2DGrad(&ret, texObject, x, y, dPdx, dPdy, isResident);
642
+ return ret;
643
+ #endif
644
+ }
645
+ #endif /* __NV_TEX_SPARSE */
646
+
647
+
648
+ template <typename T>
649
+ static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
650
+ {
651
+ #ifdef __CUDA_ARCH__
652
+ __nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
653
+ #endif
654
+ }
655
+
656
+ template <class T>
657
+ static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
658
+ {
659
+ #ifdef __CUDA_ARCH__
660
+ T ret;
661
+ tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy);
662
+ return ret;
663
+ #endif
664
+ }
665
+
666
+ #if __NV_TEX_SPARSE
667
+ template <typename T>
668
+ static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
669
+ {
670
+ #ifdef __CUDA_ARCH__
671
+ unsigned char res;
672
+ __nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &res);
673
+ *isResident = (res != 0);
674
+ #endif
675
+ }
676
+
677
+ template <class T>
678
+ static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
679
+ {
680
+ #ifdef __CUDA_ARCH__
681
+ T ret;
682
+ tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy, isResident);
683
+ return ret;
684
+ #endif
685
+ }
686
+
687
+ #endif /* __NV_TEX_SPARSE */
688
+
689
+
690
+ template <typename T>
691
+ static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy)
692
+ {
693
+ #ifdef __CUDA_ARCH__
694
+ __nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy);
695
+ #endif
696
+ }
697
+
698
+ template <class T>
699
+ static __device__ T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy)
700
+ {
701
+ #ifdef __CUDA_ARCH__
702
+ T ret;
703
+ tex1DLayeredGrad(&ret, texObject, x, layer, dPdx, dPdy);
704
+ return ret;
705
+ #endif
706
+ }
707
+
708
+
709
+ template <typename T>
710
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy)
711
+ {
712
+ #ifdef __CUDA_ARCH__
713
+ __nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy);
714
+ #endif
715
+ }
716
+
717
+ template <class T>
718
+ static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
719
+ {
720
+ #ifdef __CUDA_ARCH__
721
+ T ret;
722
+ tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy);
723
+ return ret;
724
+ #endif
725
+ }
726
+
727
+ #if __NV_TEX_SPARSE
728
+ template <typename T>
729
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
730
+ {
731
+ #ifdef __CUDA_ARCH__
732
+ unsigned char res;
733
+ __nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &res);
734
+ *isResident = (res != 0);
735
+ #endif
736
+ }
737
+
738
+ template <class T>
739
+ static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
740
+ {
741
+ #ifdef __CUDA_ARCH__
742
+ T ret;
743
+ tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy, isResident);
744
+ return ret;
745
+ #endif
746
+ }
747
+ #endif /* __NV_TEX_SPARSE */
748
+
749
+
750
+ template <typename T>
751
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
752
+ {
753
+ #ifdef __CUDA_ARCH__
754
+ __nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy);
755
+ #endif
756
+ }
757
+
758
+ template <class T>
759
+ static __device__ T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
760
+ {
761
+ #ifdef __CUDA_ARCH__
762
+ T ret;
763
+ texCubemapLayeredGrad(&ret, texObject, x, y, z, layer, dPdx, dPdy);
764
+ return ret;
765
+ #endif
766
+ }
767
+
768
+ #undef __NV_TEX_SPARSE
769
+
770
+ #endif // __cplusplus && __CUDACC__
771
+ #endif // __TEXTURE_INDIRECT_FUNCTIONS_H__