Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h +116 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h +46 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h +55 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h +570 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h +123 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h +154 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h +44 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h +1414 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h +40 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h +64 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h +595 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h +1828 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h +108 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h +207 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h +707 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h +323 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h +133 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h +430 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h +324 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h +62 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h +63 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h +96 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h +123 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h +959 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h +90 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp +2614 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h +360 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h +508 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h +0 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h +198 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h +65 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_constants.h +152 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.h +1551 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.hpp +221 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.h +215 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.hpp +604 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_atomic_functions.h +58 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.hpp +161 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_indirect_functions.h +286 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_types.h +119 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_indirect_functions.h +771 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc
ADDED
|
Binary file (37.6 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc
ADDED
|
Binary file (17.1 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc
ADDED
|
Binary file (7.78 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc
ADDED
|
Binary file (24.8 kB). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// This file is generated. Any changes you make will be lost during the next clean build.
|
| 2 |
+
|
| 3 |
+
// Dependent includes
|
| 4 |
+
#ifdef __APPLE__
|
| 5 |
+
#include <OpenGL/gl.h>
|
| 6 |
+
#else
|
| 7 |
+
#include <GL/gl.h>
|
| 8 |
+
#endif
|
| 9 |
+
|
| 10 |
+
// CUDA public interface, for type definitions and cu* function prototypes
|
| 11 |
+
#include "cudaGL.h"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
// *************************************************************************
|
| 15 |
+
// Definitions of structs to hold parameters for each function
|
| 16 |
+
// *************************************************************************
|
| 17 |
+
|
| 18 |
+
typedef struct cuGraphicsGLRegisterBuffer_params_st {
|
| 19 |
+
CUgraphicsResource *pCudaResource;
|
| 20 |
+
GLuint buffer;
|
| 21 |
+
unsigned int Flags;
|
| 22 |
+
} cuGraphicsGLRegisterBuffer_params;
|
| 23 |
+
|
| 24 |
+
typedef struct cuGraphicsGLRegisterImage_params_st {
|
| 25 |
+
CUgraphicsResource *pCudaResource;
|
| 26 |
+
GLuint image;
|
| 27 |
+
GLenum target;
|
| 28 |
+
unsigned int Flags;
|
| 29 |
+
} cuGraphicsGLRegisterImage_params;
|
| 30 |
+
|
| 31 |
+
typedef struct cuGLGetDevices_v2_params_st {
|
| 32 |
+
unsigned int *pCudaDeviceCount;
|
| 33 |
+
CUdevice *pCudaDevices;
|
| 34 |
+
unsigned int cudaDeviceCount;
|
| 35 |
+
CUGLDeviceList deviceList;
|
| 36 |
+
} cuGLGetDevices_v2_params;
|
| 37 |
+
|
| 38 |
+
typedef struct cuGLCtxCreate_v2_params_st {
|
| 39 |
+
CUcontext *pCtx;
|
| 40 |
+
unsigned int Flags;
|
| 41 |
+
CUdevice device;
|
| 42 |
+
} cuGLCtxCreate_v2_params;
|
| 43 |
+
|
| 44 |
+
typedef struct cuGLRegisterBufferObject_params_st {
|
| 45 |
+
GLuint buffer;
|
| 46 |
+
} cuGLRegisterBufferObject_params;
|
| 47 |
+
|
| 48 |
+
typedef struct cuGLMapBufferObject_v2_ptds_params_st {
|
| 49 |
+
CUdeviceptr *dptr;
|
| 50 |
+
size_t *size;
|
| 51 |
+
GLuint buffer;
|
| 52 |
+
} cuGLMapBufferObject_v2_ptds_params;
|
| 53 |
+
|
| 54 |
+
typedef struct cuGLUnmapBufferObject_params_st {
|
| 55 |
+
GLuint buffer;
|
| 56 |
+
} cuGLUnmapBufferObject_params;
|
| 57 |
+
|
| 58 |
+
typedef struct cuGLUnregisterBufferObject_params_st {
|
| 59 |
+
GLuint buffer;
|
| 60 |
+
} cuGLUnregisterBufferObject_params;
|
| 61 |
+
|
| 62 |
+
typedef struct cuGLSetBufferObjectMapFlags_params_st {
|
| 63 |
+
GLuint buffer;
|
| 64 |
+
unsigned int Flags;
|
| 65 |
+
} cuGLSetBufferObjectMapFlags_params;
|
| 66 |
+
|
| 67 |
+
typedef struct cuGLMapBufferObjectAsync_v2_ptsz_params_st {
|
| 68 |
+
CUdeviceptr *dptr;
|
| 69 |
+
size_t *size;
|
| 70 |
+
GLuint buffer;
|
| 71 |
+
CUstream hStream;
|
| 72 |
+
} cuGLMapBufferObjectAsync_v2_ptsz_params;
|
| 73 |
+
|
| 74 |
+
typedef struct cuGLUnmapBufferObjectAsync_params_st {
|
| 75 |
+
GLuint buffer;
|
| 76 |
+
CUstream hStream;
|
| 77 |
+
} cuGLUnmapBufferObjectAsync_params;
|
| 78 |
+
|
| 79 |
+
typedef struct cuGLGetDevices_params_st {
|
| 80 |
+
unsigned int *pCudaDeviceCount;
|
| 81 |
+
CUdevice *pCudaDevices;
|
| 82 |
+
unsigned int cudaDeviceCount;
|
| 83 |
+
CUGLDeviceList deviceList;
|
| 84 |
+
} cuGLGetDevices_params;
|
| 85 |
+
|
| 86 |
+
typedef struct cuGLMapBufferObject_v2_params_st {
|
| 87 |
+
CUdeviceptr *dptr;
|
| 88 |
+
size_t *size;
|
| 89 |
+
GLuint buffer;
|
| 90 |
+
} cuGLMapBufferObject_v2_params;
|
| 91 |
+
|
| 92 |
+
typedef struct cuGLMapBufferObjectAsync_v2_params_st {
|
| 93 |
+
CUdeviceptr *dptr;
|
| 94 |
+
size_t *size;
|
| 95 |
+
GLuint buffer;
|
| 96 |
+
CUstream hStream;
|
| 97 |
+
} cuGLMapBufferObjectAsync_v2_params;
|
| 98 |
+
|
| 99 |
+
typedef struct cuGLCtxCreate_params_st {
|
| 100 |
+
CUcontext *pCtx;
|
| 101 |
+
unsigned int Flags;
|
| 102 |
+
CUdevice device;
|
| 103 |
+
} cuGLCtxCreate_params;
|
| 104 |
+
|
| 105 |
+
typedef struct cuGLMapBufferObject_params_st {
|
| 106 |
+
CUdeviceptr_v1 *dptr;
|
| 107 |
+
unsigned int *size;
|
| 108 |
+
GLuint buffer;
|
| 109 |
+
} cuGLMapBufferObject_params;
|
| 110 |
+
|
| 111 |
+
typedef struct cuGLMapBufferObjectAsync_params_st {
|
| 112 |
+
CUdeviceptr_v1 *dptr;
|
| 113 |
+
unsigned int *size;
|
| 114 |
+
GLuint buffer;
|
| 115 |
+
CUstream hStream;
|
| 116 |
+
} cuGLMapBufferObjectAsync_params;
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// This file is generated. Any changes you make will be lost during the next clean build.
|
| 2 |
+
|
| 3 |
+
// Dependent includes
|
| 4 |
+
#include <vdpau/vdpau.h>
|
| 5 |
+
|
| 6 |
+
// CUDA public interface, for type definitions and cu* function prototypes
|
| 7 |
+
#include "cudaVDPAU.h"
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
// *************************************************************************
|
| 11 |
+
// Definitions of structs to hold parameters for each function
|
| 12 |
+
// *************************************************************************
|
| 13 |
+
|
| 14 |
+
typedef struct cuVDPAUGetDevice_params_st {
|
| 15 |
+
CUdevice *pDevice;
|
| 16 |
+
VdpDevice vdpDevice;
|
| 17 |
+
VdpGetProcAddress *vdpGetProcAddress;
|
| 18 |
+
} cuVDPAUGetDevice_params;
|
| 19 |
+
|
| 20 |
+
typedef struct cuVDPAUCtxCreate_v2_params_st {
|
| 21 |
+
CUcontext *pCtx;
|
| 22 |
+
unsigned int flags;
|
| 23 |
+
CUdevice device;
|
| 24 |
+
VdpDevice vdpDevice;
|
| 25 |
+
VdpGetProcAddress *vdpGetProcAddress;
|
| 26 |
+
} cuVDPAUCtxCreate_v2_params;
|
| 27 |
+
|
| 28 |
+
typedef struct cuGraphicsVDPAURegisterVideoSurface_params_st {
|
| 29 |
+
CUgraphicsResource *pCudaResource;
|
| 30 |
+
VdpVideoSurface vdpSurface;
|
| 31 |
+
unsigned int flags;
|
| 32 |
+
} cuGraphicsVDPAURegisterVideoSurface_params;
|
| 33 |
+
|
| 34 |
+
typedef struct cuGraphicsVDPAURegisterOutputSurface_params_st {
|
| 35 |
+
CUgraphicsResource *pCudaResource;
|
| 36 |
+
VdpOutputSurface vdpSurface;
|
| 37 |
+
unsigned int flags;
|
| 38 |
+
} cuGraphicsVDPAURegisterOutputSurface_params;
|
| 39 |
+
|
| 40 |
+
typedef struct cuVDPAUCtxCreate_params_st {
|
| 41 |
+
CUcontext *pCtx;
|
| 42 |
+
unsigned int flags;
|
| 43 |
+
CUdevice device;
|
| 44 |
+
VdpDevice vdpDevice;
|
| 45 |
+
VdpGetProcAddress *vdpGetProcAddress;
|
| 46 |
+
} cuVDPAUCtxCreate_params;
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// This file is generated. Any changes you make will be lost during the next clean build.
|
| 2 |
+
|
| 3 |
+
// CUDA public interface, for type definitions and api function prototypes
|
| 4 |
+
#include "cudart_removed.h"
|
| 5 |
+
|
| 6 |
+
// *************************************************************************
|
| 7 |
+
// Definitions of structs to hold parameters for each function
|
| 8 |
+
// *************************************************************************
|
| 9 |
+
|
| 10 |
+
// Currently used parameter trace structures
|
| 11 |
+
typedef struct cudaStreamDestroy_v3020_params_st {
|
| 12 |
+
cudaStream_t stream;
|
| 13 |
+
} cudaStreamDestroy_v3020_params;
|
| 14 |
+
|
| 15 |
+
typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params_st {
|
| 16 |
+
int *numBlocks;
|
| 17 |
+
const void *func;
|
| 18 |
+
size_t numDynamicSmemBytes;
|
| 19 |
+
} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params;
|
| 20 |
+
|
| 21 |
+
typedef struct cudaConfigureCall_v3020_params_st {
|
| 22 |
+
dim3 gridDim;
|
| 23 |
+
dim3 blockDim;
|
| 24 |
+
size_t sharedMem __dv;
|
| 25 |
+
cudaStream_t stream __dv;
|
| 26 |
+
} cudaConfigureCall_v3020_params;
|
| 27 |
+
|
| 28 |
+
typedef struct cudaSetupArgument_v3020_params_st {
|
| 29 |
+
const void *arg;
|
| 30 |
+
size_t size;
|
| 31 |
+
size_t offset;
|
| 32 |
+
} cudaSetupArgument_v3020_params;
|
| 33 |
+
|
| 34 |
+
typedef struct cudaLaunch_v3020_params_st {
|
| 35 |
+
const void *func;
|
| 36 |
+
} cudaLaunch_v3020_params;
|
| 37 |
+
|
| 38 |
+
typedef struct cudaLaunch_ptsz_v7000_params_st {
|
| 39 |
+
const void *func;
|
| 40 |
+
} cudaLaunch_ptsz_v7000_params;
|
| 41 |
+
|
| 42 |
+
typedef struct cudaStreamSetFlags_v10200_params_st {
|
| 43 |
+
cudaStream_t hStream;
|
| 44 |
+
unsigned int flags;
|
| 45 |
+
} cudaStreamSetFlags_v10200_params;
|
| 46 |
+
|
| 47 |
+
typedef struct cudaStreamSetFlags_ptsz_v10200_params_st {
|
| 48 |
+
cudaStream_t hStream;
|
| 49 |
+
unsigned int flags;
|
| 50 |
+
} cudaStreamSetFlags_ptsz_v10200_params;
|
| 51 |
+
|
| 52 |
+
// Parameter trace structures for removed functions
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
// End of parameter trace structures
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h
ADDED
|
@@ -0,0 +1,570 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef NVPERF_TARGET_H
|
| 2 |
+
#define NVPERF_TARGET_H
|
| 3 |
+
|
| 4 |
+
/*
|
| 5 |
+
* Copyright 2014-2022 NVIDIA Corporation. All rights reserved.
|
| 6 |
+
*
|
| 7 |
+
* NOTICE TO USER:
|
| 8 |
+
*
|
| 9 |
+
* This source code is subject to NVIDIA ownership rights under U.S. and
|
| 10 |
+
* international Copyright laws.
|
| 11 |
+
*
|
| 12 |
+
* This software and the information contained herein is PROPRIETARY and
|
| 13 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
|
| 14 |
+
* of a form of NVIDIA software license agreement.
|
| 15 |
+
*
|
| 16 |
+
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
| 17 |
+
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
| 18 |
+
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
| 19 |
+
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
| 20 |
+
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 21 |
+
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
| 22 |
+
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
| 23 |
+
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
| 24 |
+
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
| 25 |
+
* OR PERFORMANCE OF THIS SOURCE CODE.
|
| 26 |
+
*
|
| 27 |
+
* U.S. Government End Users. This source code is a "commercial item" as
|
| 28 |
+
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
| 29 |
+
* "commercial computer software" and "commercial computer software
|
| 30 |
+
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
| 31 |
+
* and is provided to the U.S. Government only as a commercial end item.
|
| 32 |
+
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
| 33 |
+
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
| 34 |
+
* source code with only those rights set forth herein.
|
| 35 |
+
*
|
| 36 |
+
* Any use of this source code in individual and commercial software must
|
| 37 |
+
* include, in the user documentation and internal comments to the code,
|
| 38 |
+
* the above Disclaimer and U.S. Government End Users Notice.
|
| 39 |
+
*/
|
| 40 |
+
|
| 41 |
+
#include <stddef.h>
|
| 42 |
+
#include <stdint.h>
|
| 43 |
+
#include "nvperf_common.h"
|
| 44 |
+
|
| 45 |
+
#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
|
| 46 |
+
#pragma GCC visibility push(default)
|
| 47 |
+
#if !defined(NVPW_LOCAL)
|
| 48 |
+
#define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
|
| 49 |
+
#endif
|
| 50 |
+
#else
|
| 51 |
+
#if !defined(NVPW_LOCAL)
|
| 52 |
+
#define NVPW_LOCAL
|
| 53 |
+
#endif
|
| 54 |
+
#endif
|
| 55 |
+
|
| 56 |
+
#ifdef __cplusplus
|
| 57 |
+
extern "C" {
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
/**
|
| 61 |
+
* @file nvperf_target.h
|
| 62 |
+
*/
|
| 63 |
+
|
| 64 |
+
#ifndef NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
|
| 65 |
+
#define NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
|
| 66 |
+
/// GPU architecture support level
|
| 67 |
+
typedef enum NVPW_GpuArchitectureSupportLevel
|
| 68 |
+
{
|
| 69 |
+
NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNKNOWN = 0,
|
| 70 |
+
NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNSUPPORTED,
|
| 71 |
+
NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_SUPPORTED
|
| 72 |
+
} NVPW_GpuArchitectureSupportLevel;
|
| 73 |
+
#endif //NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
|
| 74 |
+
|
| 75 |
+
#ifndef NVPW_SLI_SUPPORT_LEVEL_DEFINED
|
| 76 |
+
#define NVPW_SLI_SUPPORT_LEVEL_DEFINED
|
| 77 |
+
/// SLI configuration support level
|
| 78 |
+
typedef enum NVPW_SliSupportLevel
|
| 79 |
+
{
|
| 80 |
+
NVPW_SLI_SUPPORT_LEVEL_UNKNOWN = 0,
|
| 81 |
+
NVPW_SLI_SUPPORT_LEVEL_UNSUPPORTED,
|
| 82 |
+
/// Only Non-SLI configurations are supported.
|
| 83 |
+
NVPW_SLI_SUPPORT_LEVEL_SUPPORTED_NON_SLI_CONFIGURATION
|
| 84 |
+
} NVPW_SliSupportLevel;
|
| 85 |
+
#endif //NVPW_SLI_SUPPORT_LEVEL_DEFINED
|
| 86 |
+
|
| 87 |
+
#ifndef NVPW_VGPU_SUPPORT_LEVEL_DEFINED
|
| 88 |
+
#define NVPW_VGPU_SUPPORT_LEVEL_DEFINED
|
| 89 |
+
/// Virtualized GPU configuration support level
|
| 90 |
+
typedef enum NVPW_VGpuSupportLevel
|
| 91 |
+
{
|
| 92 |
+
NVPW_VGPU_SUPPORT_LEVEL_UNKNOWN = 0,
|
| 93 |
+
NVPW_VGPU_SUPPORT_LEVEL_UNSUPPORTED,
|
| 94 |
+
/// Supported but not allowed by system admin.
|
| 95 |
+
NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_DISALLOWED,
|
| 96 |
+
NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_ALLOWED,
|
| 97 |
+
NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_NON_VGPU_CONFIGURATION
|
| 98 |
+
} NVPW_VGpuSupportLevel;
|
| 99 |
+
#endif //NVPW_VGPU_SUPPORT_LEVEL_DEFINED
|
| 100 |
+
|
| 101 |
+
#ifndef NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
|
| 102 |
+
#define NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
|
| 103 |
+
/// Confidential Compute mode support level
|
| 104 |
+
typedef enum NVPW_ConfidentialComputeSupportLevel
|
| 105 |
+
{
|
| 106 |
+
NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNKNOWN = 0,
|
| 107 |
+
NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNSUPPORTED,
|
| 108 |
+
NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_NON_CONF_COMPUTE_CONFIGURATION
|
| 109 |
+
} NVPW_ConfidentialComputeSupportLevel;
|
| 110 |
+
#endif //NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
|
| 111 |
+
|
| 112 |
+
#ifndef NVPW_CMP_SUPPORT_LEVEL_DEFINED
|
| 113 |
+
#define NVPW_CMP_SUPPORT_LEVEL_DEFINED
|
| 114 |
+
/// CMP support level
|
| 115 |
+
typedef enum NVPW_CmpSupportLevel
|
| 116 |
+
{
|
| 117 |
+
NVPW_CMP_SUPPORT_LEVEL_UNKNOWN = 0,
|
| 118 |
+
NVPW_CMP_SUPPORT_LEVEL_UNSUPPORTED,
|
| 119 |
+
NVPW_CMP_SUPPORT_LEVEL_SUPPORTED_NON_CMP_CONFIGURATON
|
| 120 |
+
} NVPW_CmpSupportLevel;
|
| 121 |
+
#endif //NVPW_CMP_SUPPORT_LEVEL_DEFINED
|
| 122 |
+
|
| 123 |
+
#ifndef NVPW_WSL_SUPPORT_LEVEL_DEFINED
|
| 124 |
+
#define NVPW_WSL_SUPPORT_LEVEL_DEFINED
|
| 125 |
+
/// WSL support level
|
| 126 |
+
typedef enum NVPW_WslSupportLevel
|
| 127 |
+
{
|
| 128 |
+
NVPW_WSL_SUPPORT_LEVEL_UNKNOWN = 0,
|
| 129 |
+
NVPW_WSL_SUPPORT_LEVEL_UNSUPPORTED_INSUFFICIENT_DRIVER_VERSION,
|
| 130 |
+
NVPW_WSL_SUPPORT_LEVEL_SUPPORTED,
|
| 131 |
+
NVPW_WSL_SUPPORT_LEVEL_SUPPORTED_NON_WSL_CONFIGURATION
|
| 132 |
+
} NVPW_WslSupportLevel;
|
| 133 |
+
#endif //NVPW_WSL_SUPPORT_LEVEL_DEFINED
|
| 134 |
+
|
| 135 |
+
typedef struct NVPW_InitializeTarget_Params
|
| 136 |
+
{
|
| 137 |
+
/// [in]
|
| 138 |
+
size_t structSize;
|
| 139 |
+
/// [in] assign to NULL
|
| 140 |
+
void* pPriv;
|
| 141 |
+
} NVPW_InitializeTarget_Params;
|
| 142 |
+
#define NVPW_InitializeTarget_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeTarget_Params, pPriv)
|
| 143 |
+
|
| 144 |
+
/// Load the target library.
|
| 145 |
+
NVPA_Status NVPW_InitializeTarget(NVPW_InitializeTarget_Params* pParams);
|
| 146 |
+
|
| 147 |
+
typedef struct NVPW_GetDeviceCount_Params
|
| 148 |
+
{
|
| 149 |
+
/// [in]
|
| 150 |
+
size_t structSize;
|
| 151 |
+
/// [in] assign to NULL
|
| 152 |
+
void* pPriv;
|
| 153 |
+
size_t numDevices;
|
| 154 |
+
} NVPW_GetDeviceCount_Params;
|
| 155 |
+
#define NVPW_GetDeviceCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetDeviceCount_Params, numDevices)
|
| 156 |
+
|
| 157 |
+
NVPA_Status NVPW_GetDeviceCount(NVPW_GetDeviceCount_Params* pParams);
|
| 158 |
+
|
| 159 |
+
typedef struct NVPW_Device_GetNames_Params
|
| 160 |
+
{
|
| 161 |
+
/// [in]
|
| 162 |
+
size_t structSize;
|
| 163 |
+
/// [in] assign to NULL
|
| 164 |
+
void* pPriv;
|
| 165 |
+
size_t deviceIndex;
|
| 166 |
+
const char* pDeviceName;
|
| 167 |
+
const char* pChipName;
|
| 168 |
+
} NVPW_Device_GetNames_Params;
|
| 169 |
+
#define NVPW_Device_GetNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetNames_Params, pChipName)
|
| 170 |
+
|
| 171 |
+
NVPA_Status NVPW_Device_GetNames(NVPW_Device_GetNames_Params* pParams);
|
| 172 |
+
|
| 173 |
+
typedef struct NVPW_PciBusId
|
| 174 |
+
{
|
| 175 |
+
/// The PCI domain on which the device bus resides.
|
| 176 |
+
uint32_t domain;
|
| 177 |
+
/// The bus on which the device resides.
|
| 178 |
+
uint16_t bus;
|
| 179 |
+
/// device ID.
|
| 180 |
+
uint16_t device;
|
| 181 |
+
} NVPW_PciBusId;
|
| 182 |
+
#define NVPW_PciBusId_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PciBusId, device)
|
| 183 |
+
|
| 184 |
+
typedef struct NVPW_Device_GetPciBusIds_Params
|
| 185 |
+
{
|
| 186 |
+
/// [in]
|
| 187 |
+
size_t structSize;
|
| 188 |
+
/// [in] assign to NULL
|
| 189 |
+
void* pPriv;
|
| 190 |
+
/// [in] caller-allocated array of NVPW_PciBusId, indexed by NVPW deviceIndex
|
| 191 |
+
NVPW_PciBusId* pBusIds;
|
| 192 |
+
/// [in] size of the pBusIDs array; use result from NVPW_GetDeviceCount
|
| 193 |
+
size_t numDevices;
|
| 194 |
+
} NVPW_Device_GetPciBusIds_Params;
|
| 195 |
+
#define NVPW_Device_GetPciBusIds_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetPciBusIds_Params, numDevices)
|
| 196 |
+
|
| 197 |
+
NVPA_Status NVPW_Device_GetPciBusIds(NVPW_Device_GetPciBusIds_Params* pParams);
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_INVALID 0xFFFFFFFFu
|
| 201 |
+
#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_FULLCHIP 0xFFFFFFFEu
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
typedef struct NVPW_Device_GetMigAttributes_Params
|
| 205 |
+
{
|
| 206 |
+
/// [in]
|
| 207 |
+
size_t structSize;
|
| 208 |
+
/// [in] assign to NULL
|
| 209 |
+
void* pPriv;
|
| 210 |
+
/// [in]
|
| 211 |
+
size_t deviceIndex;
|
| 212 |
+
/// [out]
|
| 213 |
+
NVPA_Bool isMigPartition;
|
| 214 |
+
/// [out]
|
| 215 |
+
uint32_t gpuInstanceId;
|
| 216 |
+
/// [out]
|
| 217 |
+
uint32_t computeInstanceId;
|
| 218 |
+
} NVPW_Device_GetMigAttributes_Params;
|
| 219 |
+
#define NVPW_Device_GetMigAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetMigAttributes_Params, computeInstanceId)
|
| 220 |
+
|
| 221 |
+
NVPA_Status NVPW_Device_GetMigAttributes(NVPW_Device_GetMigAttributes_Params* pParams);
|
| 222 |
+
|
| 223 |
+
typedef struct NVPW_Adapter_GetDeviceIndex_Params
|
| 224 |
+
{
|
| 225 |
+
/// [in]
|
| 226 |
+
size_t structSize;
|
| 227 |
+
/// [in] assign to NULL
|
| 228 |
+
void* pPriv;
|
| 229 |
+
/// [in]
|
| 230 |
+
struct IDXGIAdapter* pAdapter;
|
| 231 |
+
/// [in]
|
| 232 |
+
size_t sliIndex;
|
| 233 |
+
/// [out]
|
| 234 |
+
size_t deviceIndex;
|
| 235 |
+
} NVPW_Adapter_GetDeviceIndex_Params;
|
| 236 |
+
#define NVPW_Adapter_GetDeviceIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Adapter_GetDeviceIndex_Params, deviceIndex)
|
| 237 |
+
|
| 238 |
+
NVPA_Status NVPW_Adapter_GetDeviceIndex(NVPW_Adapter_GetDeviceIndex_Params* pParams);
|
| 239 |
+
|
| 240 |
+
typedef struct NVPW_CounterData_GetNumRanges_Params
|
| 241 |
+
{
|
| 242 |
+
/// [in]
|
| 243 |
+
size_t structSize;
|
| 244 |
+
/// [in] assign to NULL
|
| 245 |
+
void* pPriv;
|
| 246 |
+
const uint8_t* pCounterDataImage;
|
| 247 |
+
size_t numRanges;
|
| 248 |
+
} NVPW_CounterData_GetNumRanges_Params;
|
| 249 |
+
#define NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetNumRanges_Params, numRanges)
|
| 250 |
+
|
| 251 |
+
NVPA_Status NVPW_CounterData_GetNumRanges(NVPW_CounterData_GetNumRanges_Params* pParams);
|
| 252 |
+
|
| 253 |
+
typedef struct NVPW_CounterData_GetChipName_Params
|
| 254 |
+
{
|
| 255 |
+
/// [in]
|
| 256 |
+
size_t structSize;
|
| 257 |
+
/// [in] assign to NULL
|
| 258 |
+
void* pPriv;
|
| 259 |
+
/// [in]
|
| 260 |
+
const uint8_t* pCounterDataImage;
|
| 261 |
+
/// [in]
|
| 262 |
+
size_t counterDataImageSize;
|
| 263 |
+
/// [out]
|
| 264 |
+
const char* pChipName;
|
| 265 |
+
} NVPW_CounterData_GetChipName_Params;
|
| 266 |
+
#define NVPW_CounterData_GetChipName_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetChipName_Params, pChipName)
|
| 267 |
+
|
| 268 |
+
NVPA_Status NVPW_CounterData_GetChipName(NVPW_CounterData_GetChipName_Params* pParams);
|
| 269 |
+
|
| 270 |
+
typedef struct NVPW_Config_GetNumPasses_Params
|
| 271 |
+
{
|
| 272 |
+
/// [in]
|
| 273 |
+
size_t structSize;
|
| 274 |
+
/// [in] assign to NULL
|
| 275 |
+
void* pPriv;
|
| 276 |
+
/// [in]
|
| 277 |
+
const uint8_t* pConfig;
|
| 278 |
+
/// [out]
|
| 279 |
+
size_t numPipelinedPasses;
|
| 280 |
+
/// [out]
|
| 281 |
+
size_t numIsolatedPasses;
|
| 282 |
+
} NVPW_Config_GetNumPasses_Params;
|
| 283 |
+
#define NVPW_Config_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_Params, numIsolatedPasses)
|
| 284 |
+
|
| 285 |
+
/// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
|
| 286 |
+
NVPA_Status NVPW_Config_GetNumPasses(NVPW_Config_GetNumPasses_Params* pParams);
|
| 287 |
+
|
| 288 |
+
typedef struct NVPW_Config_GetNumPasses_V2_Params
|
| 289 |
+
{
|
| 290 |
+
/// [in]
|
| 291 |
+
size_t structSize;
|
| 292 |
+
/// [in] assign to NULL
|
| 293 |
+
void* pPriv;
|
| 294 |
+
/// [in]
|
| 295 |
+
const uint8_t* pConfig;
|
| 296 |
+
/// [out]
|
| 297 |
+
size_t numPasses;
|
| 298 |
+
} NVPW_Config_GetNumPasses_V2_Params;
|
| 299 |
+
#define NVPW_Config_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_V2_Params, numPasses)
|
| 300 |
+
|
| 301 |
+
/// Total num passes = numPasses * numNestingLevels
|
| 302 |
+
NVPA_Status NVPW_Config_GetNumPasses_V2(NVPW_Config_GetNumPasses_V2_Params* pParams);
|
| 303 |
+
|
| 304 |
+
#define NVPW_API_SET_CUDA_PROFILER 0x18209d0775b2f89dULL
|
| 305 |
+
|
| 306 |
+
#define NVPW_API_SET_D3D11_PROFILER 0xca55c6738445db2bULL
|
| 307 |
+
|
| 308 |
+
#define NVPW_API_SET_D3D12_PROFILER 0xc0c2d46dd7c7ad78ULL
|
| 309 |
+
|
| 310 |
+
#define NVPW_API_SET_EGL_PROFILER 0x3c3747dae1f9565cULL
|
| 311 |
+
|
| 312 |
+
#define NVPW_API_SET_GPU_PERIODICSAMPLER 0x9f4c2571fc0b2e8aULL
|
| 313 |
+
|
| 314 |
+
#define NVPW_API_SET_METRICSCONTEXT 0x7c8579f6f2144beaULL
|
| 315 |
+
|
| 316 |
+
#define NVPW_API_SET_METRICSEVALUATOR 0x0368a8768d811af9ULL
|
| 317 |
+
|
| 318 |
+
#define NVPW_API_SET_METRICS_GA100_COMP 0x16b7d8c20d8b4915ULL
|
| 319 |
+
|
| 320 |
+
#define NVPW_API_SET_METRICS_GA100_GRFX 0xc94eaabec04a94faULL
|
| 321 |
+
|
| 322 |
+
#define NVPW_API_SET_METRICS_GA10X_COMP 0xb5d6391c2e299ab5ULL
|
| 323 |
+
|
| 324 |
+
#define NVPW_API_SET_METRICS_GA10X_GRFX 0x6ebc121178b5ce0bULL
|
| 325 |
+
|
| 326 |
+
#define NVPW_API_SET_METRICS_GV100_COMP 0x863705cc57919f72ULL
|
| 327 |
+
|
| 328 |
+
#define NVPW_API_SET_METRICS_GV100_GRFX 0x9900da75d164fecfULL
|
| 329 |
+
|
| 330 |
+
#define NVPW_API_SET_METRICS_GV11B_COMP 0xd3f79a859235848fULL
|
| 331 |
+
|
| 332 |
+
#define NVPW_API_SET_METRICS_GV11B_GRFX 0xeb8e26220106e227ULL
|
| 333 |
+
|
| 334 |
+
#define NVPW_API_SET_METRICS_TU10X_COMP 0x70f40be0afd35da8ULL
|
| 335 |
+
|
| 336 |
+
#define NVPW_API_SET_METRICS_TU10X_GRFX 0xdf219cb838db6968ULL
|
| 337 |
+
|
| 338 |
+
#define NVPW_API_SET_METRICS_TU11X_COMP 0xeb0069d7d0956678ULL
|
| 339 |
+
|
| 340 |
+
#define NVPW_API_SET_METRICS_TU11X_GRFX 0x0977d9342bd62743ULL
|
| 341 |
+
|
| 342 |
+
#define NVPW_API_SET_OPENGL_PROFILER 0xe4cd9ea40f2ee777ULL
|
| 343 |
+
|
| 344 |
+
#define NVPW_API_SET_VULKAN_PROFILER 0x8c56b6a03d779689ULL
|
| 345 |
+
|
| 346 |
+
typedef struct NVPW_QueryVersionNumber_Params
|
| 347 |
+
{
|
| 348 |
+
/// [in]
|
| 349 |
+
size_t structSize;
|
| 350 |
+
/// [in] assign to NULL
|
| 351 |
+
void* pPriv;
|
| 352 |
+
/// [in]
|
| 353 |
+
uint64_t apiSet;
|
| 354 |
+
/// [out]
|
| 355 |
+
uint32_t major;
|
| 356 |
+
/// [out]
|
| 357 |
+
uint32_t minor;
|
| 358 |
+
/// [out]
|
| 359 |
+
uint32_t patch;
|
| 360 |
+
/// [out]
|
| 361 |
+
uint32_t relMajor;
|
| 362 |
+
/// [out]
|
| 363 |
+
uint32_t relMinor;
|
| 364 |
+
/// [out]
|
| 365 |
+
uint32_t relPatch;
|
| 366 |
+
} NVPW_QueryVersionNumber_Params;
|
| 367 |
+
#define NVPW_QueryVersionNumber_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_QueryVersionNumber_Params, relPatch)
|
| 368 |
+
|
| 369 |
+
/// Query version number of an API set
|
| 370 |
+
NVPA_Status NVPW_QueryVersionNumber(NVPW_QueryVersionNumber_Params* pParams);
|
| 371 |
+
|
| 372 |
+
typedef enum NVPW_Device_ClockStatus
|
| 373 |
+
{
|
| 374 |
+
/// clock status is unknown
|
| 375 |
+
NVPW_DEVICE_CLOCK_STATUS_UNKNOWN,
|
| 376 |
+
/// clocks are locked to rated tdp values
|
| 377 |
+
NVPW_DEVICE_CLOCK_STATUS_LOCKED_TO_RATED_TDP,
|
| 378 |
+
/// clocks are not locked and can boost above rated tdp
|
| 379 |
+
NVPW_DEVICE_CLOCK_STATUS_BOOST_ENABLED,
|
| 380 |
+
/// clocks are not locked and will not go above rated tdp
|
| 381 |
+
NVPW_DEVICE_CLOCK_STATUS_BOOST_DISABLED,
|
| 382 |
+
NVPW_DEVICE_CLOCK_STATUS__COUNT
|
| 383 |
+
} NVPW_Device_ClockStatus;
|
| 384 |
+
|
| 385 |
+
typedef struct NVPW_Device_GetClockStatus_Params
|
| 386 |
+
{
|
| 387 |
+
/// [in]
|
| 388 |
+
size_t structSize;
|
| 389 |
+
/// [in] assign to NULL
|
| 390 |
+
void* pPriv;
|
| 391 |
+
size_t deviceIndex;
|
| 392 |
+
/// [in]
|
| 393 |
+
NVPW_Device_ClockStatus clockStatus;
|
| 394 |
+
} NVPW_Device_GetClockStatus_Params;
|
| 395 |
+
#define NVPW_Device_GetClockStatus_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetClockStatus_Params, clockStatus)
|
| 396 |
+
|
| 397 |
+
NVPA_Status NVPW_Device_GetClockStatus(NVPW_Device_GetClockStatus_Params* pParams);
|
| 398 |
+
|
| 399 |
+
typedef enum NVPW_Device_ClockSetting
|
| 400 |
+
{
|
| 401 |
+
/// invalid op, specify valid clocks operation during profiling
|
| 402 |
+
NVPW_DEVICE_CLOCK_SETTING_INVALID,
|
| 403 |
+
/// default to driver/application config (normally unlocked and not boosted, but could be unlocked boosted, or
|
| 404 |
+
/// locked to rated TDP)
|
| 405 |
+
NVPW_DEVICE_CLOCK_SETTING_DEFAULT,
|
| 406 |
+
/// lock clocks at rated tdp base values
|
| 407 |
+
NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_RATED_TDP,
|
| 408 |
+
NVPW_DEVICE_CLOCK_SETTING__COUNT
|
| 409 |
+
} NVPW_Device_ClockSetting;
|
| 410 |
+
|
| 411 |
+
typedef struct NVPW_Device_SetClockSetting_Params
|
| 412 |
+
{
|
| 413 |
+
/// [in]
|
| 414 |
+
size_t structSize;
|
| 415 |
+
/// [in] assign to NULL
|
| 416 |
+
void* pPriv;
|
| 417 |
+
size_t deviceIndex;
|
| 418 |
+
/// [in]
|
| 419 |
+
NVPW_Device_ClockSetting clockSetting;
|
| 420 |
+
} NVPW_Device_SetClockSetting_Params;
|
| 421 |
+
#define NVPW_Device_SetClockSetting_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_SetClockSetting_Params, clockSetting)
|
| 422 |
+
|
| 423 |
+
NVPA_Status NVPW_Device_SetClockSetting(NVPW_Device_SetClockSetting_Params* pParams);
|
| 424 |
+
|
| 425 |
+
typedef struct NVPW_CounterData_GetRangeDescriptions_Params
|
| 426 |
+
{
|
| 427 |
+
/// [in]
|
| 428 |
+
size_t structSize;
|
| 429 |
+
/// [in] assign to NULL
|
| 430 |
+
void* pPriv;
|
| 431 |
+
const uint8_t* pCounterDataImage;
|
| 432 |
+
size_t rangeIndex;
|
| 433 |
+
/// [inout] Number of descriptions allocated in ppDescriptions
|
| 434 |
+
size_t numDescriptions;
|
| 435 |
+
const char** ppDescriptions;
|
| 436 |
+
} NVPW_CounterData_GetRangeDescriptions_Params;
|
| 437 |
+
#define NVPW_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetRangeDescriptions_Params, ppDescriptions)
|
| 438 |
+
|
| 439 |
+
NVPA_Status NVPW_CounterData_GetRangeDescriptions(NVPW_CounterData_GetRangeDescriptions_Params* pParams);
|
| 440 |
+
|
| 441 |
+
typedef struct NVPW_Profiler_CounterData_GetRangeDescriptions_Params
|
| 442 |
+
{
|
| 443 |
+
/// [in]
|
| 444 |
+
size_t structSize;
|
| 445 |
+
/// [in] assign to NULL
|
| 446 |
+
void* pPriv;
|
| 447 |
+
const uint8_t* pCounterDataImage;
|
| 448 |
+
size_t rangeIndex;
|
| 449 |
+
/// [inout] Number of descriptions allocated in ppDescriptions
|
| 450 |
+
size_t numDescriptions;
|
| 451 |
+
const char** ppDescriptions;
|
| 452 |
+
} NVPW_Profiler_CounterData_GetRangeDescriptions_Params;
|
| 453 |
+
#define NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Profiler_CounterData_GetRangeDescriptions_Params, ppDescriptions)
|
| 454 |
+
|
| 455 |
+
NVPA_Status NVPW_Profiler_CounterData_GetRangeDescriptions(NVPW_Profiler_CounterData_GetRangeDescriptions_Params* pParams);
|
| 456 |
+
|
| 457 |
+
#ifndef NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
|
| 458 |
+
#define NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
|
| 459 |
+
typedef enum NVPW_PeriodicSampler_CounterData_AppendMode
|
| 460 |
+
{
|
| 461 |
+
NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_LINEAR = 0,
|
| 462 |
+
NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_CIRCULAR = 1,
|
| 463 |
+
NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE__COUNT
|
| 464 |
+
} NVPW_PeriodicSampler_CounterData_AppendMode;
|
| 465 |
+
#endif //NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
|
| 466 |
+
|
| 467 |
+
typedef struct NVPW_PeriodicSampler_CounterData_GetSampleTime_Params
|
| 468 |
+
{
|
| 469 |
+
/// [in]
|
| 470 |
+
size_t structSize;
|
| 471 |
+
/// [in] assign to NULL
|
| 472 |
+
void* pPriv;
|
| 473 |
+
/// [in]
|
| 474 |
+
const uint8_t* pCounterDataImage;
|
| 475 |
+
/// [in]
|
| 476 |
+
size_t rangeIndex;
|
| 477 |
+
/// [out]
|
| 478 |
+
uint64_t timestampStart;
|
| 479 |
+
/// [out]
|
| 480 |
+
uint64_t timestampEnd;
|
| 481 |
+
} NVPW_PeriodicSampler_CounterData_GetSampleTime_Params;
|
| 482 |
+
#define NVPW_PeriodicSampler_CounterData_GetSampleTime_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params, timestampEnd)
|
| 483 |
+
|
| 484 |
+
NVPA_Status NVPW_PeriodicSampler_CounterData_GetSampleTime(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params* pParams);
|
| 485 |
+
|
| 486 |
+
typedef struct NVPW_PeriodicSampler_CounterData_TrimInPlace_Params
|
| 487 |
+
{
|
| 488 |
+
/// [in]
|
| 489 |
+
size_t structSize;
|
| 490 |
+
/// [in] assign to NULL
|
| 491 |
+
void* pPriv;
|
| 492 |
+
/// [in]
|
| 493 |
+
uint8_t* pCounterDataImage;
|
| 494 |
+
/// [in]
|
| 495 |
+
size_t counterDataImageSize;
|
| 496 |
+
/// [out]
|
| 497 |
+
size_t counterDataImageTrimmedSize;
|
| 498 |
+
} NVPW_PeriodicSampler_CounterData_TrimInPlace_Params;
|
| 499 |
+
#define NVPW_PeriodicSampler_CounterData_TrimInPlace_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params, counterDataImageTrimmedSize)
|
| 500 |
+
|
| 501 |
+
NVPA_Status NVPW_PeriodicSampler_CounterData_TrimInPlace(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params* pParams);
|
| 502 |
+
|
| 503 |
+
typedef struct NVPW_PeriodicSampler_CounterData_GetInfo_Params
|
| 504 |
+
{
|
| 505 |
+
/// [in]
|
| 506 |
+
size_t structSize;
|
| 507 |
+
/// [in] assign to NULL
|
| 508 |
+
void* pPriv;
|
| 509 |
+
/// [in]
|
| 510 |
+
const uint8_t* pCounterDataImage;
|
| 511 |
+
/// [in]
|
| 512 |
+
size_t counterDataImageSize;
|
| 513 |
+
/// [out] total number of ranges in the counter data
|
| 514 |
+
size_t numTotalRanges;
|
| 515 |
+
/// [out] if in "linear" mode, this API returns the number of "populated" ranges; if it's in "circular" mode,
|
| 516 |
+
/// then it returns the last "populated" range index + 1, when there is no such range, it returns 0.
|
| 517 |
+
size_t numPopulatedRanges;
|
| 518 |
+
/// [out] if in "linear" mode, this API returns the number of "completed" ranges; if it's in "circular" mode,
|
| 519 |
+
/// then it returns the last "completed" range index + 1, when there is no such range, it returns 0.
|
| 520 |
+
size_t numCompletedRanges;
|
| 521 |
+
} NVPW_PeriodicSampler_CounterData_GetInfo_Params;
|
| 522 |
+
#define NVPW_PeriodicSampler_CounterData_GetInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetInfo_Params, numCompletedRanges)
|
| 523 |
+
|
| 524 |
+
/// In periodic sampler, a range in counter data stores exactly one sample's data. For better performance, periodic
|
| 525 |
+
/// sampler may operate in an out-of-order fashion when populating sample data, i.e. it may not fully populate all
|
| 526 |
+
/// counters of a sample/range before starting to populate the next sample/range. As a result, we have two concepts
|
| 527 |
+
/// here, "populated" & "completed": a range is considered "populated" even if only partial counters have been
|
| 528 |
+
/// written; on the other hand, a range is only considered "completed" if all the collecting counters have been
|
| 529 |
+
/// written.
|
| 530 |
+
NVPA_Status NVPW_PeriodicSampler_CounterData_GetInfo(NVPW_PeriodicSampler_CounterData_GetInfo_Params* pParams);
|
| 531 |
+
|
| 532 |
+
typedef struct NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params
|
| 533 |
+
{
|
| 534 |
+
/// [in]
|
| 535 |
+
size_t structSize;
|
| 536 |
+
/// [in] assign to NULL
|
| 537 |
+
void* pPriv;
|
| 538 |
+
/// [in]
|
| 539 |
+
const uint8_t* pCounterDataImage;
|
| 540 |
+
/// [in]
|
| 541 |
+
size_t counterDataImageSize;
|
| 542 |
+
/// [in]
|
| 543 |
+
size_t rangeIndex;
|
| 544 |
+
/// [out]
|
| 545 |
+
uint32_t triggerCount;
|
| 546 |
+
} NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params;
|
| 547 |
+
#define NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params, triggerCount)
|
| 548 |
+
|
| 549 |
+
NVPA_Status NVPW_PeriodicSampler_CounterData_GetTriggerCount(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params* pParams);
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
typedef struct NVPW_TimestampReport
|
| 553 |
+
{
|
| 554 |
+
uint32_t payload;
|
| 555 |
+
uint8_t reserved0004[4];
|
| 556 |
+
uint64_t timestamp;
|
| 557 |
+
} NVPW_TimestampReport;
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
#ifdef __cplusplus
|
| 563 |
+
} // extern "C"
|
| 564 |
+
#endif
|
| 565 |
+
|
| 566 |
+
#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
|
| 567 |
+
#pragma GCC visibility pop
|
| 568 |
+
#endif
|
| 569 |
+
|
| 570 |
+
#endif // NVPERF_TARGET_H
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*******************************************************************************
|
| 2 |
+
* Copyright (c) 2008-2020 The Khronos Group Inc.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
******************************************************************************/
|
| 16 |
+
|
| 17 |
+
#ifndef __OPENCL_CL_EGL_H
|
| 18 |
+
#define __OPENCL_CL_EGL_H
|
| 19 |
+
|
| 20 |
+
#ifdef __APPLE__
|
| 21 |
+
#else
|
| 22 |
+
#include <CL/cl.h>
|
| 23 |
+
#endif
|
| 24 |
+
|
| 25 |
+
#ifdef __cplusplus
|
| 26 |
+
extern "C" {
|
| 27 |
+
#endif
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
|
| 31 |
+
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
|
| 32 |
+
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
|
| 33 |
+
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
|
| 34 |
+
|
| 35 |
+
/* Error type for clCreateFromEGLImageKHR */
|
| 36 |
+
#define CL_INVALID_EGL_OBJECT_KHR -1093
|
| 37 |
+
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
|
| 38 |
+
|
| 39 |
+
/* CLeglImageKHR is an opaque handle to an EGLImage */
|
| 40 |
+
typedef void* CLeglImageKHR;
|
| 41 |
+
|
| 42 |
+
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
|
| 43 |
+
typedef void* CLeglDisplayKHR;
|
| 44 |
+
|
| 45 |
+
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
|
| 46 |
+
typedef void* CLeglSyncKHR;
|
| 47 |
+
|
| 48 |
+
/* properties passed to clCreateFromEGLImageKHR */
|
| 49 |
+
typedef intptr_t cl_egl_image_properties_khr;
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
#define cl_khr_egl_image 1
|
| 53 |
+
|
| 54 |
+
extern CL_API_ENTRY cl_mem CL_API_CALL
|
| 55 |
+
clCreateFromEGLImageKHR(cl_context context,
|
| 56 |
+
CLeglDisplayKHR egldisplay,
|
| 57 |
+
CLeglImageKHR eglimage,
|
| 58 |
+
cl_mem_flags flags,
|
| 59 |
+
const cl_egl_image_properties_khr * properties,
|
| 60 |
+
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
| 61 |
+
|
| 62 |
+
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
|
| 63 |
+
cl_context context,
|
| 64 |
+
CLeglDisplayKHR egldisplay,
|
| 65 |
+
CLeglImageKHR eglimage,
|
| 66 |
+
cl_mem_flags flags,
|
| 67 |
+
const cl_egl_image_properties_khr * properties,
|
| 68 |
+
cl_int * errcode_ret);
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
extern CL_API_ENTRY cl_int CL_API_CALL
|
| 72 |
+
clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
|
| 73 |
+
cl_uint num_objects,
|
| 74 |
+
const cl_mem * mem_objects,
|
| 75 |
+
cl_uint num_events_in_wait_list,
|
| 76 |
+
const cl_event * event_wait_list,
|
| 77 |
+
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
| 78 |
+
|
| 79 |
+
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
|
| 80 |
+
cl_command_queue command_queue,
|
| 81 |
+
cl_uint num_objects,
|
| 82 |
+
const cl_mem * mem_objects,
|
| 83 |
+
cl_uint num_events_in_wait_list,
|
| 84 |
+
const cl_event * event_wait_list,
|
| 85 |
+
cl_event * event);
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
extern CL_API_ENTRY cl_int CL_API_CALL
|
| 89 |
+
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
|
| 90 |
+
cl_uint num_objects,
|
| 91 |
+
const cl_mem * mem_objects,
|
| 92 |
+
cl_uint num_events_in_wait_list,
|
| 93 |
+
const cl_event * event_wait_list,
|
| 94 |
+
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
| 95 |
+
|
| 96 |
+
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
|
| 97 |
+
cl_command_queue command_queue,
|
| 98 |
+
cl_uint num_objects,
|
| 99 |
+
const cl_mem * mem_objects,
|
| 100 |
+
cl_uint num_events_in_wait_list,
|
| 101 |
+
const cl_event * event_wait_list,
|
| 102 |
+
cl_event * event);
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
#define cl_khr_egl_event 1
|
| 106 |
+
|
| 107 |
+
extern CL_API_ENTRY cl_event CL_API_CALL
|
| 108 |
+
clCreateEventFromEGLSyncKHR(cl_context context,
|
| 109 |
+
CLeglSyncKHR sync,
|
| 110 |
+
CLeglDisplayKHR display,
|
| 111 |
+
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
| 112 |
+
|
| 113 |
+
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
|
| 114 |
+
cl_context context,
|
| 115 |
+
CLeglSyncKHR sync,
|
| 116 |
+
CLeglDisplayKHR display,
|
| 117 |
+
cl_int * errcode_ret);
|
| 118 |
+
|
| 119 |
+
#ifdef __cplusplus
|
| 120 |
+
}
|
| 121 |
+
#endif
|
| 122 |
+
|
| 123 |
+
#endif /* __OPENCL_CL_EGL_H */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*******************************************************************************
|
| 2 |
+
* Copyright (c) 2008-2020 The Khronos Group Inc.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
******************************************************************************/
|
| 16 |
+
|
| 17 |
+
#ifndef __OPENCL_CL_GL_H
|
| 18 |
+
#define __OPENCL_CL_GL_H
|
| 19 |
+
|
| 20 |
+
#ifdef __APPLE__
|
| 21 |
+
#include <OpenCL/cl.h>
|
| 22 |
+
#else
|
| 23 |
+
#include <CL/cl.h>
|
| 24 |
+
#endif
|
| 25 |
+
|
| 26 |
+
#ifdef __cplusplus
|
| 27 |
+
extern "C" {
|
| 28 |
+
#endif
|
| 29 |
+
|
| 30 |
+
typedef cl_uint cl_gl_object_type;
|
| 31 |
+
typedef cl_uint cl_gl_texture_info;
|
| 32 |
+
typedef cl_uint cl_gl_platform_info;
|
| 33 |
+
typedef struct __GLsync *cl_GLsync;
|
| 34 |
+
|
| 35 |
+
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
|
| 36 |
+
#define CL_GL_OBJECT_BUFFER 0x2000
|
| 37 |
+
#define CL_GL_OBJECT_TEXTURE2D 0x2001
|
| 38 |
+
#define CL_GL_OBJECT_TEXTURE3D 0x2002
|
| 39 |
+
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
|
| 40 |
+
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
|
| 41 |
+
#define CL_GL_OBJECT_TEXTURE1D 0x200F
|
| 42 |
+
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
|
| 43 |
+
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
|
| 44 |
+
|
| 45 |
+
/* cl_gl_texture_info */
|
| 46 |
+
#define CL_GL_TEXTURE_TARGET 0x2004
|
| 47 |
+
#define CL_GL_MIPMAP_LEVEL 0x2005
|
| 48 |
+
#define CL_GL_NUM_SAMPLES 0x2012
|
| 49 |
+
|
| 50 |
+
extern CL_API_ENTRY cl_mem CL_API_CALL
|
| 51 |
+
clCreateFromGLBuffer(cl_context context,
|
| 52 |
+
cl_mem_flags flags,
|
| 53 |
+
cl_GLuint bufobj,
|
| 54 |
+
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
| 55 |
+
|
| 56 |
+
extern CL_API_ENTRY cl_mem CL_API_CALL
|
| 57 |
+
clCreateFromGLTexture(cl_context context,
|
| 58 |
+
cl_mem_flags flags,
|
| 59 |
+
cl_GLenum target,
|
| 60 |
+
cl_GLint miplevel,
|
| 61 |
+
cl_GLuint texture,
|
| 62 |
+
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
| 63 |
+
|
| 64 |
+
extern CL_API_ENTRY cl_mem CL_API_CALL
|
| 65 |
+
clCreateFromGLRenderbuffer(cl_context context,
|
| 66 |
+
cl_mem_flags flags,
|
| 67 |
+
cl_GLuint renderbuffer,
|
| 68 |
+
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
| 69 |
+
|
| 70 |
+
extern CL_API_ENTRY cl_int CL_API_CALL
|
| 71 |
+
clGetGLObjectInfo(cl_mem memobj,
|
| 72 |
+
cl_gl_object_type * gl_object_type,
|
| 73 |
+
cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
|
| 74 |
+
|
| 75 |
+
extern CL_API_ENTRY cl_int CL_API_CALL
|
| 76 |
+
clGetGLTextureInfo(cl_mem memobj,
|
| 77 |
+
cl_gl_texture_info param_name,
|
| 78 |
+
size_t param_value_size,
|
| 79 |
+
void * param_value,
|
| 80 |
+
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
|
| 81 |
+
|
| 82 |
+
extern CL_API_ENTRY cl_int CL_API_CALL
|
| 83 |
+
clEnqueueAcquireGLObjects(cl_command_queue command_queue,
|
| 84 |
+
cl_uint num_objects,
|
| 85 |
+
const cl_mem * mem_objects,
|
| 86 |
+
cl_uint num_events_in_wait_list,
|
| 87 |
+
const cl_event * event_wait_list,
|
| 88 |
+
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
| 89 |
+
|
| 90 |
+
extern CL_API_ENTRY cl_int CL_API_CALL
|
| 91 |
+
clEnqueueReleaseGLObjects(cl_command_queue command_queue,
|
| 92 |
+
cl_uint num_objects,
|
| 93 |
+
const cl_mem * mem_objects,
|
| 94 |
+
cl_uint num_events_in_wait_list,
|
| 95 |
+
const cl_event * event_wait_list,
|
| 96 |
+
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
/* Deprecated OpenCL 1.1 APIs */
|
| 100 |
+
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
|
| 101 |
+
clCreateFromGLTexture2D(cl_context context,
|
| 102 |
+
cl_mem_flags flags,
|
| 103 |
+
cl_GLenum target,
|
| 104 |
+
cl_GLint miplevel,
|
| 105 |
+
cl_GLuint texture,
|
| 106 |
+
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
|
| 107 |
+
|
| 108 |
+
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
|
| 109 |
+
clCreateFromGLTexture3D(cl_context context,
|
| 110 |
+
cl_mem_flags flags,
|
| 111 |
+
cl_GLenum target,
|
| 112 |
+
cl_GLint miplevel,
|
| 113 |
+
cl_GLuint texture,
|
| 114 |
+
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
|
| 115 |
+
|
| 116 |
+
/* cl_khr_gl_sharing extension */
|
| 117 |
+
|
| 118 |
+
#define cl_khr_gl_sharing 1
|
| 119 |
+
|
| 120 |
+
typedef cl_uint cl_gl_context_info;
|
| 121 |
+
|
| 122 |
+
/* Additional Error Codes */
|
| 123 |
+
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
|
| 124 |
+
|
| 125 |
+
/* cl_gl_context_info */
|
| 126 |
+
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
|
| 127 |
+
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
|
| 128 |
+
|
| 129 |
+
/* Additional cl_context_properties */
|
| 130 |
+
#define CL_GL_CONTEXT_KHR 0x2008
|
| 131 |
+
#define CL_EGL_DISPLAY_KHR 0x2009
|
| 132 |
+
#define CL_GLX_DISPLAY_KHR 0x200A
|
| 133 |
+
#define CL_WGL_HDC_KHR 0x200B
|
| 134 |
+
#define CL_CGL_SHAREGROUP_KHR 0x200C
|
| 135 |
+
|
| 136 |
+
extern CL_API_ENTRY cl_int CL_API_CALL
|
| 137 |
+
clGetGLContextInfoKHR(const cl_context_properties * properties,
|
| 138 |
+
cl_gl_context_info param_name,
|
| 139 |
+
size_t param_value_size,
|
| 140 |
+
void * param_value,
|
| 141 |
+
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
|
| 142 |
+
|
| 143 |
+
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
|
| 144 |
+
const cl_context_properties * properties,
|
| 145 |
+
cl_gl_context_info param_name,
|
| 146 |
+
size_t param_value_size,
|
| 147 |
+
void * param_value,
|
| 148 |
+
size_t * param_value_size_ret);
|
| 149 |
+
|
| 150 |
+
#ifdef __cplusplus
|
| 151 |
+
}
|
| 152 |
+
#endif
|
| 153 |
+
|
| 154 |
+
#endif /* __OPENCL_CL_GL_H */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*******************************************************************************
|
| 2 |
+
* Copyright (c) 2008-2020 The Khronos Group Inc.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
******************************************************************************/
|
| 16 |
+
|
| 17 |
+
#ifndef __OPENCL_CL_GL_EXT_H
|
| 18 |
+
#define __OPENCL_CL_GL_EXT_H
|
| 19 |
+
|
| 20 |
+
#ifdef __cplusplus
|
| 21 |
+
extern "C" {
|
| 22 |
+
#endif
|
| 23 |
+
|
| 24 |
+
#ifdef __APPLE__
|
| 25 |
+
#include <OpenCL/cl_gl.h>
|
| 26 |
+
#else
|
| 27 |
+
#include <CL/cl_gl.h>
|
| 28 |
+
#endif
|
| 29 |
+
|
| 30 |
+
/*
|
| 31 |
+
* cl_khr_gl_event extension
|
| 32 |
+
*/
|
| 33 |
+
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
|
| 34 |
+
|
| 35 |
+
extern CL_API_ENTRY cl_event CL_API_CALL
|
| 36 |
+
clCreateEventFromGLsyncKHR(cl_context context,
|
| 37 |
+
cl_GLsync sync,
|
| 38 |
+
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
|
| 39 |
+
|
| 40 |
+
#ifdef __cplusplus
|
| 41 |
+
}
|
| 42 |
+
#endif
|
| 43 |
+
|
| 44 |
+
#endif /* __OPENCL_CL_GL_EXT_H */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h
ADDED
|
@@ -0,0 +1,1414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*******************************************************************************
|
| 2 |
+
* Copyright (c) 2008-2020 The Khronos Group Inc.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
******************************************************************************/
|
| 16 |
+
|
| 17 |
+
#ifndef __CL_PLATFORM_H
|
| 18 |
+
#define __CL_PLATFORM_H
|
| 19 |
+
|
| 20 |
+
#ifdef __APPLE__
|
| 21 |
+
/* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
|
| 22 |
+
#include <AvailabilityMacros.h>
|
| 23 |
+
#endif
|
| 24 |
+
|
| 25 |
+
#ifdef __cplusplus
|
| 26 |
+
extern "C" {
|
| 27 |
+
#endif
|
| 28 |
+
|
| 29 |
+
#if defined(_WIN32)
|
| 30 |
+
#define CL_API_ENTRY
|
| 31 |
+
#define CL_API_CALL __stdcall
|
| 32 |
+
#define CL_CALLBACK __stdcall
|
| 33 |
+
#else
|
| 34 |
+
#define CL_API_ENTRY
|
| 35 |
+
#define CL_API_CALL
|
| 36 |
+
#define CL_CALLBACK
|
| 37 |
+
#endif
|
| 38 |
+
|
| 39 |
+
/*
|
| 40 |
+
* Deprecation flags refer to the last version of the header in which the
|
| 41 |
+
* feature was not deprecated.
|
| 42 |
+
*
|
| 43 |
+
* E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
|
| 44 |
+
* deprecation but is deprecated in versions later than 1.1.
|
| 45 |
+
*/
|
| 46 |
+
#ifdef __APPLE__
|
| 47 |
+
#define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
|
| 48 |
+
#define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
|
| 49 |
+
#define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
|
| 50 |
+
#define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
| 51 |
+
#define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
| 52 |
+
#define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
| 53 |
+
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
|
| 54 |
+
|
| 55 |
+
#ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
| 56 |
+
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
| 57 |
+
#define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
| 58 |
+
#define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
| 59 |
+
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
|
| 60 |
+
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
|
| 61 |
+
#else
|
| 62 |
+
#warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
|
| 63 |
+
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
| 64 |
+
#define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
| 65 |
+
#define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
| 66 |
+
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
| 67 |
+
#endif
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
#else
|
| 72 |
+
#define CL_EXTENSION_WEAK_LINK
|
| 73 |
+
#define CL_API_SUFFIX__VERSION_1_0
|
| 74 |
+
#define CL_EXT_SUFFIX__VERSION_1_0
|
| 75 |
+
#define CL_API_SUFFIX__VERSION_1_1
|
| 76 |
+
#define CL_EXT_SUFFIX__VERSION_1_1
|
| 77 |
+
#define CL_API_SUFFIX__VERSION_1_2
|
| 78 |
+
#define CL_EXT_SUFFIX__VERSION_1_2
|
| 79 |
+
#define CL_API_SUFFIX__VERSION_2_0
|
| 80 |
+
#define CL_EXT_SUFFIX__VERSION_2_0
|
| 81 |
+
#define CL_API_SUFFIX__VERSION_2_1
|
| 82 |
+
#define CL_EXT_SUFFIX__VERSION_2_1
|
| 83 |
+
#define CL_API_SUFFIX__VERSION_2_2
|
| 84 |
+
#define CL_EXT_SUFFIX__VERSION_2_2
|
| 85 |
+
#define CL_API_SUFFIX__VERSION_3_0
|
| 86 |
+
#define CL_EXT_SUFFIX__VERSION_3_0
|
| 87 |
+
#define CL_API_SUFFIX__EXPERIMENTAL
|
| 88 |
+
#define CL_EXT_SUFFIX__EXPERIMENTAL
|
| 89 |
+
|
| 90 |
+
#ifdef __GNUC__
|
| 91 |
+
#define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
|
| 92 |
+
#define CL_EXT_PREFIX_DEPRECATED
|
| 93 |
+
#elif defined(_WIN32)
|
| 94 |
+
#define CL_EXT_SUFFIX_DEPRECATED
|
| 95 |
+
#define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
|
| 96 |
+
#else
|
| 97 |
+
#define CL_EXT_SUFFIX_DEPRECATED
|
| 98 |
+
#define CL_EXT_PREFIX_DEPRECATED
|
| 99 |
+
#endif
|
| 100 |
+
|
| 101 |
+
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
|
| 102 |
+
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
|
| 103 |
+
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
|
| 104 |
+
#else
|
| 105 |
+
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
|
| 106 |
+
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
|
| 107 |
+
#endif
|
| 108 |
+
|
| 109 |
+
#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
|
| 110 |
+
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
|
| 111 |
+
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
|
| 112 |
+
#else
|
| 113 |
+
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
|
| 114 |
+
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
|
| 115 |
+
#endif
|
| 116 |
+
|
| 117 |
+
#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
| 118 |
+
#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
|
| 119 |
+
#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
|
| 120 |
+
#else
|
| 121 |
+
#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
|
| 122 |
+
#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
|
| 123 |
+
#endif
|
| 124 |
+
|
| 125 |
+
#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
|
| 126 |
+
#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
|
| 127 |
+
#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
|
| 128 |
+
#else
|
| 129 |
+
#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
|
| 130 |
+
#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
|
| 131 |
+
#endif
|
| 132 |
+
|
| 133 |
+
#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
|
| 134 |
+
#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
|
| 135 |
+
#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
|
| 136 |
+
#else
|
| 137 |
+
#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
|
| 138 |
+
#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
|
| 139 |
+
#endif
|
| 140 |
+
|
| 141 |
+
#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
|
| 142 |
+
#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
|
| 143 |
+
#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
|
| 144 |
+
#else
|
| 145 |
+
#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
|
| 146 |
+
#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
|
| 147 |
+
#endif
|
| 148 |
+
#endif
|
| 149 |
+
|
| 150 |
+
#if (defined (_WIN32) && defined(_MSC_VER))
|
| 151 |
+
|
| 152 |
+
/* scalar types */
|
| 153 |
+
typedef signed __int8 cl_char;
|
| 154 |
+
typedef unsigned __int8 cl_uchar;
|
| 155 |
+
typedef signed __int16 cl_short;
|
| 156 |
+
typedef unsigned __int16 cl_ushort;
|
| 157 |
+
typedef signed __int32 cl_int;
|
| 158 |
+
typedef unsigned __int32 cl_uint;
|
| 159 |
+
typedef signed __int64 cl_long;
|
| 160 |
+
typedef unsigned __int64 cl_ulong;
|
| 161 |
+
|
| 162 |
+
typedef unsigned __int16 cl_half;
|
| 163 |
+
typedef float cl_float;
|
| 164 |
+
typedef double cl_double;
|
| 165 |
+
|
| 166 |
+
/* Macro names and corresponding values defined by OpenCL */
|
| 167 |
+
#define CL_CHAR_BIT 8
|
| 168 |
+
#define CL_SCHAR_MAX 127
|
| 169 |
+
#define CL_SCHAR_MIN (-127-1)
|
| 170 |
+
#define CL_CHAR_MAX CL_SCHAR_MAX
|
| 171 |
+
#define CL_CHAR_MIN CL_SCHAR_MIN
|
| 172 |
+
#define CL_UCHAR_MAX 255
|
| 173 |
+
#define CL_SHRT_MAX 32767
|
| 174 |
+
#define CL_SHRT_MIN (-32767-1)
|
| 175 |
+
#define CL_USHRT_MAX 65535
|
| 176 |
+
#define CL_INT_MAX 2147483647
|
| 177 |
+
#define CL_INT_MIN (-2147483647-1)
|
| 178 |
+
#define CL_UINT_MAX 0xffffffffU
|
| 179 |
+
#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
|
| 180 |
+
#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
|
| 181 |
+
#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
|
| 182 |
+
|
| 183 |
+
#define CL_FLT_DIG 6
|
| 184 |
+
#define CL_FLT_MANT_DIG 24
|
| 185 |
+
#define CL_FLT_MAX_10_EXP +38
|
| 186 |
+
#define CL_FLT_MAX_EXP +128
|
| 187 |
+
#define CL_FLT_MIN_10_EXP -37
|
| 188 |
+
#define CL_FLT_MIN_EXP -125
|
| 189 |
+
#define CL_FLT_RADIX 2
|
| 190 |
+
#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
|
| 191 |
+
#define CL_FLT_MIN 1.175494350822287507969e-38f
|
| 192 |
+
#define CL_FLT_EPSILON 1.1920928955078125e-7f
|
| 193 |
+
|
| 194 |
+
#define CL_HALF_DIG 3
|
| 195 |
+
#define CL_HALF_MANT_DIG 11
|
| 196 |
+
#define CL_HALF_MAX_10_EXP +4
|
| 197 |
+
#define CL_HALF_MAX_EXP +16
|
| 198 |
+
#define CL_HALF_MIN_10_EXP -4
|
| 199 |
+
#define CL_HALF_MIN_EXP -13
|
| 200 |
+
#define CL_HALF_RADIX 2
|
| 201 |
+
#define CL_HALF_MAX 65504.0f
|
| 202 |
+
#define CL_HALF_MIN 6.103515625e-05f
|
| 203 |
+
#define CL_HALF_EPSILON 9.765625e-04f
|
| 204 |
+
|
| 205 |
+
#define CL_DBL_DIG 15
|
| 206 |
+
#define CL_DBL_MANT_DIG 53
|
| 207 |
+
#define CL_DBL_MAX_10_EXP +308
|
| 208 |
+
#define CL_DBL_MAX_EXP +1024
|
| 209 |
+
#define CL_DBL_MIN_10_EXP -307
|
| 210 |
+
#define CL_DBL_MIN_EXP -1021
|
| 211 |
+
#define CL_DBL_RADIX 2
|
| 212 |
+
#define CL_DBL_MAX 1.7976931348623158e+308
|
| 213 |
+
#define CL_DBL_MIN 2.225073858507201383090e-308
|
| 214 |
+
#define CL_DBL_EPSILON 2.220446049250313080847e-16
|
| 215 |
+
|
| 216 |
+
#define CL_M_E 2.7182818284590452354
|
| 217 |
+
#define CL_M_LOG2E 1.4426950408889634074
|
| 218 |
+
#define CL_M_LOG10E 0.43429448190325182765
|
| 219 |
+
#define CL_M_LN2 0.69314718055994530942
|
| 220 |
+
#define CL_M_LN10 2.30258509299404568402
|
| 221 |
+
#define CL_M_PI 3.14159265358979323846
|
| 222 |
+
#define CL_M_PI_2 1.57079632679489661923
|
| 223 |
+
#define CL_M_PI_4 0.78539816339744830962
|
| 224 |
+
#define CL_M_1_PI 0.31830988618379067154
|
| 225 |
+
#define CL_M_2_PI 0.63661977236758134308
|
| 226 |
+
#define CL_M_2_SQRTPI 1.12837916709551257390
|
| 227 |
+
#define CL_M_SQRT2 1.41421356237309504880
|
| 228 |
+
#define CL_M_SQRT1_2 0.70710678118654752440
|
| 229 |
+
|
| 230 |
+
#define CL_M_E_F 2.718281828f
|
| 231 |
+
#define CL_M_LOG2E_F 1.442695041f
|
| 232 |
+
#define CL_M_LOG10E_F 0.434294482f
|
| 233 |
+
#define CL_M_LN2_F 0.693147181f
|
| 234 |
+
#define CL_M_LN10_F 2.302585093f
|
| 235 |
+
#define CL_M_PI_F 3.141592654f
|
| 236 |
+
#define CL_M_PI_2_F 1.570796327f
|
| 237 |
+
#define CL_M_PI_4_F 0.785398163f
|
| 238 |
+
#define CL_M_1_PI_F 0.318309886f
|
| 239 |
+
#define CL_M_2_PI_F 0.636619772f
|
| 240 |
+
#define CL_M_2_SQRTPI_F 1.128379167f
|
| 241 |
+
#define CL_M_SQRT2_F 1.414213562f
|
| 242 |
+
#define CL_M_SQRT1_2_F 0.707106781f
|
| 243 |
+
|
| 244 |
+
#define CL_NAN (CL_INFINITY - CL_INFINITY)
|
| 245 |
+
#define CL_HUGE_VALF ((cl_float) 1e50)
|
| 246 |
+
#define CL_HUGE_VAL ((cl_double) 1e500)
|
| 247 |
+
#define CL_MAXFLOAT CL_FLT_MAX
|
| 248 |
+
#define CL_INFINITY CL_HUGE_VALF
|
| 249 |
+
|
| 250 |
+
#else
|
| 251 |
+
|
| 252 |
+
#include <stdint.h>
|
| 253 |
+
|
| 254 |
+
/* scalar types */
|
| 255 |
+
typedef int8_t cl_char;
|
| 256 |
+
typedef uint8_t cl_uchar;
|
| 257 |
+
typedef int16_t cl_short;
|
| 258 |
+
typedef uint16_t cl_ushort;
|
| 259 |
+
typedef int32_t cl_int;
|
| 260 |
+
typedef uint32_t cl_uint;
|
| 261 |
+
typedef int64_t cl_long;
|
| 262 |
+
typedef uint64_t cl_ulong;
|
| 263 |
+
|
| 264 |
+
typedef uint16_t cl_half;
|
| 265 |
+
typedef float cl_float;
|
| 266 |
+
typedef double cl_double;
|
| 267 |
+
|
| 268 |
+
/* Macro names and corresponding values defined by OpenCL */
|
| 269 |
+
#define CL_CHAR_BIT 8
|
| 270 |
+
#define CL_SCHAR_MAX 127
|
| 271 |
+
#define CL_SCHAR_MIN (-127-1)
|
| 272 |
+
#define CL_CHAR_MAX CL_SCHAR_MAX
|
| 273 |
+
#define CL_CHAR_MIN CL_SCHAR_MIN
|
| 274 |
+
#define CL_UCHAR_MAX 255
|
| 275 |
+
#define CL_SHRT_MAX 32767
|
| 276 |
+
#define CL_SHRT_MIN (-32767-1)
|
| 277 |
+
#define CL_USHRT_MAX 65535
|
| 278 |
+
#define CL_INT_MAX 2147483647
|
| 279 |
+
#define CL_INT_MIN (-2147483647-1)
|
| 280 |
+
#define CL_UINT_MAX 0xffffffffU
|
| 281 |
+
#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
|
| 282 |
+
#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
|
| 283 |
+
#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
|
| 284 |
+
|
| 285 |
+
#define CL_FLT_DIG 6
|
| 286 |
+
#define CL_FLT_MANT_DIG 24
|
| 287 |
+
#define CL_FLT_MAX_10_EXP +38
|
| 288 |
+
#define CL_FLT_MAX_EXP +128
|
| 289 |
+
#define CL_FLT_MIN_10_EXP -37
|
| 290 |
+
#define CL_FLT_MIN_EXP -125
|
| 291 |
+
#define CL_FLT_RADIX 2
|
| 292 |
+
#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
|
| 293 |
+
#define CL_FLT_MIN 1.175494350822287507969e-38f
|
| 294 |
+
#define CL_FLT_EPSILON 1.1920928955078125e-7f
|
| 295 |
+
|
| 296 |
+
#define CL_HALF_DIG 3
|
| 297 |
+
#define CL_HALF_MANT_DIG 11
|
| 298 |
+
#define CL_HALF_MAX_10_EXP +4
|
| 299 |
+
#define CL_HALF_MAX_EXP +16
|
| 300 |
+
#define CL_HALF_MIN_10_EXP -4
|
| 301 |
+
#define CL_HALF_MIN_EXP -13
|
| 302 |
+
#define CL_HALF_RADIX 2
|
| 303 |
+
#define CL_HALF_MAX 65504.0f
|
| 304 |
+
#define CL_HALF_MIN 6.103515625e-05f
|
| 305 |
+
#define CL_HALF_EPSILON 9.765625e-04f
|
| 306 |
+
|
| 307 |
+
#define CL_DBL_DIG 15
|
| 308 |
+
#define CL_DBL_MANT_DIG 53
|
| 309 |
+
#define CL_DBL_MAX_10_EXP +308
|
| 310 |
+
#define CL_DBL_MAX_EXP +1024
|
| 311 |
+
#define CL_DBL_MIN_10_EXP -307
|
| 312 |
+
#define CL_DBL_MIN_EXP -1021
|
| 313 |
+
#define CL_DBL_RADIX 2
|
| 314 |
+
#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
|
| 315 |
+
#define CL_DBL_MIN 2.225073858507201383090e-308
|
| 316 |
+
#define CL_DBL_EPSILON 2.220446049250313080847e-16
|
| 317 |
+
|
| 318 |
+
#define CL_M_E 2.7182818284590452354
|
| 319 |
+
#define CL_M_LOG2E 1.4426950408889634074
|
| 320 |
+
#define CL_M_LOG10E 0.43429448190325182765
|
| 321 |
+
#define CL_M_LN2 0.69314718055994530942
|
| 322 |
+
#define CL_M_LN10 2.30258509299404568402
|
| 323 |
+
#define CL_M_PI 3.14159265358979323846
|
| 324 |
+
#define CL_M_PI_2 1.57079632679489661923
|
| 325 |
+
#define CL_M_PI_4 0.78539816339744830962
|
| 326 |
+
#define CL_M_1_PI 0.31830988618379067154
|
| 327 |
+
#define CL_M_2_PI 0.63661977236758134308
|
| 328 |
+
#define CL_M_2_SQRTPI 1.12837916709551257390
|
| 329 |
+
#define CL_M_SQRT2 1.41421356237309504880
|
| 330 |
+
#define CL_M_SQRT1_2 0.70710678118654752440
|
| 331 |
+
|
| 332 |
+
#define CL_M_E_F 2.718281828f
|
| 333 |
+
#define CL_M_LOG2E_F 1.442695041f
|
| 334 |
+
#define CL_M_LOG10E_F 0.434294482f
|
| 335 |
+
#define CL_M_LN2_F 0.693147181f
|
| 336 |
+
#define CL_M_LN10_F 2.302585093f
|
| 337 |
+
#define CL_M_PI_F 3.141592654f
|
| 338 |
+
#define CL_M_PI_2_F 1.570796327f
|
| 339 |
+
#define CL_M_PI_4_F 0.785398163f
|
| 340 |
+
#define CL_M_1_PI_F 0.318309886f
|
| 341 |
+
#define CL_M_2_PI_F 0.636619772f
|
| 342 |
+
#define CL_M_2_SQRTPI_F 1.128379167f
|
| 343 |
+
#define CL_M_SQRT2_F 1.414213562f
|
| 344 |
+
#define CL_M_SQRT1_2_F 0.707106781f
|
| 345 |
+
|
| 346 |
+
#if defined( __GNUC__ )
|
| 347 |
+
#define CL_HUGE_VALF __builtin_huge_valf()
|
| 348 |
+
#define CL_HUGE_VAL __builtin_huge_val()
|
| 349 |
+
#define CL_NAN __builtin_nanf( "" )
|
| 350 |
+
#else
|
| 351 |
+
#define CL_HUGE_VALF ((cl_float) 1e50)
|
| 352 |
+
#define CL_HUGE_VAL ((cl_double) 1e500)
|
| 353 |
+
float nanf( const char * );
|
| 354 |
+
#define CL_NAN nanf( "" )
|
| 355 |
+
#endif
|
| 356 |
+
#define CL_MAXFLOAT CL_FLT_MAX
|
| 357 |
+
#define CL_INFINITY CL_HUGE_VALF
|
| 358 |
+
|
| 359 |
+
#endif
|
| 360 |
+
|
| 361 |
+
#include <stddef.h>
|
| 362 |
+
|
| 363 |
+
/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */
|
| 364 |
+
typedef unsigned int cl_GLuint;
|
| 365 |
+
typedef int cl_GLint;
|
| 366 |
+
typedef unsigned int cl_GLenum;
|
| 367 |
+
|
| 368 |
+
/*
|
| 369 |
+
* Vector types
|
| 370 |
+
*
|
| 371 |
+
* Note: OpenCL requires that all types be naturally aligned.
|
| 372 |
+
* This means that vector types must be naturally aligned.
|
| 373 |
+
* For example, a vector of four floats must be aligned to
|
| 374 |
+
* a 16 byte boundary (calculated as 4 * the natural 4-byte
|
| 375 |
+
* alignment of the float). The alignment qualifiers here
|
| 376 |
+
* will only function properly if your compiler supports them
|
| 377 |
+
* and if you don't actively work to defeat them. For example,
|
| 378 |
+
* in order for a cl_float4 to be 16 byte aligned in a struct,
|
| 379 |
+
* the start of the struct must itself be 16-byte aligned.
|
| 380 |
+
*
|
| 381 |
+
* Maintaining proper alignment is the user's responsibility.
|
| 382 |
+
*/
|
| 383 |
+
|
| 384 |
+
/* Define basic vector types */
|
| 385 |
+
#if defined( __VEC__ )
|
| 386 |
+
#if !defined(__clang__)
|
| 387 |
+
#include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
|
| 388 |
+
#endif
|
| 389 |
+
typedef __vector unsigned char __cl_uchar16;
|
| 390 |
+
typedef __vector signed char __cl_char16;
|
| 391 |
+
typedef __vector unsigned short __cl_ushort8;
|
| 392 |
+
typedef __vector signed short __cl_short8;
|
| 393 |
+
typedef __vector unsigned int __cl_uint4;
|
| 394 |
+
typedef __vector signed int __cl_int4;
|
| 395 |
+
typedef __vector float __cl_float4;
|
| 396 |
+
#define __CL_UCHAR16__ 1
|
| 397 |
+
#define __CL_CHAR16__ 1
|
| 398 |
+
#define __CL_USHORT8__ 1
|
| 399 |
+
#define __CL_SHORT8__ 1
|
| 400 |
+
#define __CL_UINT4__ 1
|
| 401 |
+
#define __CL_INT4__ 1
|
| 402 |
+
#define __CL_FLOAT4__ 1
|
| 403 |
+
#endif
|
| 404 |
+
|
| 405 |
+
#if defined( __SSE__ )
|
| 406 |
+
#if defined( __MINGW64__ )
|
| 407 |
+
#include <intrin.h>
|
| 408 |
+
#else
|
| 409 |
+
#include <xmmintrin.h>
|
| 410 |
+
#endif
|
| 411 |
+
#if defined( __GNUC__ )
|
| 412 |
+
typedef float __cl_float4 __attribute__((vector_size(16)));
|
| 413 |
+
#else
|
| 414 |
+
typedef __m128 __cl_float4;
|
| 415 |
+
#endif
|
| 416 |
+
#define __CL_FLOAT4__ 1
|
| 417 |
+
#endif
|
| 418 |
+
|
| 419 |
+
#if defined( __SSE2__ )
|
| 420 |
+
#if defined( __MINGW64__ )
|
| 421 |
+
#include <intrin.h>
|
| 422 |
+
#else
|
| 423 |
+
#include <emmintrin.h>
|
| 424 |
+
#endif
|
| 425 |
+
#if defined( __GNUC__ )
|
| 426 |
+
typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
|
| 427 |
+
typedef cl_char __cl_char16 __attribute__((vector_size(16)));
|
| 428 |
+
typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
|
| 429 |
+
typedef cl_short __cl_short8 __attribute__((vector_size(16)));
|
| 430 |
+
typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
|
| 431 |
+
typedef cl_int __cl_int4 __attribute__((vector_size(16)));
|
| 432 |
+
typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
|
| 433 |
+
typedef cl_long __cl_long2 __attribute__((vector_size(16)));
|
| 434 |
+
typedef cl_double __cl_double2 __attribute__((vector_size(16)));
|
| 435 |
+
#else
|
| 436 |
+
typedef __m128i __cl_uchar16;
|
| 437 |
+
typedef __m128i __cl_char16;
|
| 438 |
+
typedef __m128i __cl_ushort8;
|
| 439 |
+
typedef __m128i __cl_short8;
|
| 440 |
+
typedef __m128i __cl_uint4;
|
| 441 |
+
typedef __m128i __cl_int4;
|
| 442 |
+
typedef __m128i __cl_ulong2;
|
| 443 |
+
typedef __m128i __cl_long2;
|
| 444 |
+
typedef __m128d __cl_double2;
|
| 445 |
+
#endif
|
| 446 |
+
#define __CL_UCHAR16__ 1
|
| 447 |
+
#define __CL_CHAR16__ 1
|
| 448 |
+
#define __CL_USHORT8__ 1
|
| 449 |
+
#define __CL_SHORT8__ 1
|
| 450 |
+
#define __CL_INT4__ 1
|
| 451 |
+
#define __CL_UINT4__ 1
|
| 452 |
+
#define __CL_ULONG2__ 1
|
| 453 |
+
#define __CL_LONG2__ 1
|
| 454 |
+
#define __CL_DOUBLE2__ 1
|
| 455 |
+
#endif
|
| 456 |
+
|
| 457 |
+
#if defined( __MMX__ )
|
| 458 |
+
#include <mmintrin.h>
|
| 459 |
+
#if defined( __GNUC__ )
|
| 460 |
+
typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
|
| 461 |
+
typedef cl_char __cl_char8 __attribute__((vector_size(8)));
|
| 462 |
+
typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
|
| 463 |
+
typedef cl_short __cl_short4 __attribute__((vector_size(8)));
|
| 464 |
+
typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
|
| 465 |
+
typedef cl_int __cl_int2 __attribute__((vector_size(8)));
|
| 466 |
+
typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
|
| 467 |
+
typedef cl_long __cl_long1 __attribute__((vector_size(8)));
|
| 468 |
+
typedef cl_float __cl_float2 __attribute__((vector_size(8)));
|
| 469 |
+
#else
|
| 470 |
+
typedef __m64 __cl_uchar8;
|
| 471 |
+
typedef __m64 __cl_char8;
|
| 472 |
+
typedef __m64 __cl_ushort4;
|
| 473 |
+
typedef __m64 __cl_short4;
|
| 474 |
+
typedef __m64 __cl_uint2;
|
| 475 |
+
typedef __m64 __cl_int2;
|
| 476 |
+
typedef __m64 __cl_ulong1;
|
| 477 |
+
typedef __m64 __cl_long1;
|
| 478 |
+
typedef __m64 __cl_float2;
|
| 479 |
+
#endif
|
| 480 |
+
#define __CL_UCHAR8__ 1
|
| 481 |
+
#define __CL_CHAR8__ 1
|
| 482 |
+
#define __CL_USHORT4__ 1
|
| 483 |
+
#define __CL_SHORT4__ 1
|
| 484 |
+
#define __CL_INT2__ 1
|
| 485 |
+
#define __CL_UINT2__ 1
|
| 486 |
+
#define __CL_ULONG1__ 1
|
| 487 |
+
#define __CL_LONG1__ 1
|
| 488 |
+
#define __CL_FLOAT2__ 1
|
| 489 |
+
#endif
|
| 490 |
+
|
| 491 |
+
#if defined( __AVX__ )
|
| 492 |
+
#if defined( __MINGW64__ )
|
| 493 |
+
#include <intrin.h>
|
| 494 |
+
#else
|
| 495 |
+
#include <immintrin.h>
|
| 496 |
+
#endif
|
| 497 |
+
#if defined( __GNUC__ )
|
| 498 |
+
typedef cl_float __cl_float8 __attribute__((vector_size(32)));
|
| 499 |
+
typedef cl_double __cl_double4 __attribute__((vector_size(32)));
|
| 500 |
+
#else
|
| 501 |
+
typedef __m256 __cl_float8;
|
| 502 |
+
typedef __m256d __cl_double4;
|
| 503 |
+
#endif
|
| 504 |
+
#define __CL_FLOAT8__ 1
|
| 505 |
+
#define __CL_DOUBLE4__ 1
|
| 506 |
+
#endif
|
| 507 |
+
|
| 508 |
+
/* Define capabilities for anonymous struct members. */
|
| 509 |
+
#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
|
| 510 |
+
#define __CL_HAS_ANON_STRUCT__ 1
|
| 511 |
+
#define __CL_ANON_STRUCT__
|
| 512 |
+
#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
|
| 513 |
+
#define __CL_HAS_ANON_STRUCT__ 1
|
| 514 |
+
#define __CL_ANON_STRUCT__ __extension__
|
| 515 |
+
#elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
|
| 516 |
+
#if _MSC_VER >= 1500
|
| 517 |
+
/* Microsoft Developer Studio 2008 supports anonymous structs, but
|
| 518 |
+
* complains by default. */
|
| 519 |
+
#define __CL_HAS_ANON_STRUCT__ 1
|
| 520 |
+
#define __CL_ANON_STRUCT__
|
| 521 |
+
/* Disable warning C4201: nonstandard extension used : nameless
|
| 522 |
+
* struct/union */
|
| 523 |
+
#pragma warning( push )
|
| 524 |
+
#pragma warning( disable : 4201 )
|
| 525 |
+
#endif
|
| 526 |
+
#else
|
| 527 |
+
#define __CL_HAS_ANON_STRUCT__ 0
|
| 528 |
+
#define __CL_ANON_STRUCT__
|
| 529 |
+
#endif
|
| 530 |
+
|
| 531 |
+
/* Define alignment keys */
|
| 532 |
+
#if defined( __GNUC__ )
|
| 533 |
+
#define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
|
| 534 |
+
#elif defined( _WIN32) && (_MSC_VER)
|
| 535 |
+
/* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
|
| 536 |
+
/* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
|
| 537 |
+
/* #include <crtdefs.h> */
|
| 538 |
+
/* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
|
| 539 |
+
#define CL_ALIGNED(_x)
|
| 540 |
+
#else
|
| 541 |
+
#warning Need to implement some method to align data here
|
| 542 |
+
#define CL_ALIGNED(_x)
|
| 543 |
+
#endif
|
| 544 |
+
|
| 545 |
+
/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
|
| 546 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 547 |
+
/* .xyzw and .s0123...{f|F} are supported */
|
| 548 |
+
#define CL_HAS_NAMED_VECTOR_FIELDS 1
|
| 549 |
+
/* .hi and .lo are supported */
|
| 550 |
+
#define CL_HAS_HI_LO_VECTOR_FIELDS 1
|
| 551 |
+
#endif
|
| 552 |
+
|
| 553 |
+
/* Define cl_vector types */
|
| 554 |
+
|
| 555 |
+
/* ---- cl_charn ---- */
|
| 556 |
+
typedef union
|
| 557 |
+
{
|
| 558 |
+
cl_char CL_ALIGNED(2) s[2];
|
| 559 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 560 |
+
__CL_ANON_STRUCT__ struct{ cl_char x, y; };
|
| 561 |
+
__CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
|
| 562 |
+
__CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
|
| 563 |
+
#endif
|
| 564 |
+
#if defined( __CL_CHAR2__)
|
| 565 |
+
__cl_char2 v2;
|
| 566 |
+
#endif
|
| 567 |
+
}cl_char2;
|
| 568 |
+
|
| 569 |
+
typedef union
|
| 570 |
+
{
|
| 571 |
+
cl_char CL_ALIGNED(4) s[4];
|
| 572 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 573 |
+
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
|
| 574 |
+
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
|
| 575 |
+
__CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
|
| 576 |
+
#endif
|
| 577 |
+
#if defined( __CL_CHAR2__)
|
| 578 |
+
__cl_char2 v2[2];
|
| 579 |
+
#endif
|
| 580 |
+
#if defined( __CL_CHAR4__)
|
| 581 |
+
__cl_char4 v4;
|
| 582 |
+
#endif
|
| 583 |
+
}cl_char4;
|
| 584 |
+
|
| 585 |
+
/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
|
| 586 |
+
typedef cl_char4 cl_char3;
|
| 587 |
+
|
| 588 |
+
typedef union
|
| 589 |
+
{
|
| 590 |
+
cl_char CL_ALIGNED(8) s[8];
|
| 591 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 592 |
+
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
|
| 593 |
+
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 594 |
+
__CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
|
| 595 |
+
#endif
|
| 596 |
+
#if defined( __CL_CHAR2__)
|
| 597 |
+
__cl_char2 v2[4];
|
| 598 |
+
#endif
|
| 599 |
+
#if defined( __CL_CHAR4__)
|
| 600 |
+
__cl_char4 v4[2];
|
| 601 |
+
#endif
|
| 602 |
+
#if defined( __CL_CHAR8__ )
|
| 603 |
+
__cl_char8 v8;
|
| 604 |
+
#endif
|
| 605 |
+
}cl_char8;
|
| 606 |
+
|
| 607 |
+
typedef union
|
| 608 |
+
{
|
| 609 |
+
cl_char CL_ALIGNED(16) s[16];
|
| 610 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 611 |
+
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 612 |
+
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 613 |
+
__CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
|
| 614 |
+
#endif
|
| 615 |
+
#if defined( __CL_CHAR2__)
|
| 616 |
+
__cl_char2 v2[8];
|
| 617 |
+
#endif
|
| 618 |
+
#if defined( __CL_CHAR4__)
|
| 619 |
+
__cl_char4 v4[4];
|
| 620 |
+
#endif
|
| 621 |
+
#if defined( __CL_CHAR8__ )
|
| 622 |
+
__cl_char8 v8[2];
|
| 623 |
+
#endif
|
| 624 |
+
#if defined( __CL_CHAR16__ )
|
| 625 |
+
__cl_char16 v16;
|
| 626 |
+
#endif
|
| 627 |
+
}cl_char16;
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
/* ---- cl_ucharn ---- */
|
| 631 |
+
typedef union
|
| 632 |
+
{
|
| 633 |
+
cl_uchar CL_ALIGNED(2) s[2];
|
| 634 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 635 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
|
| 636 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
|
| 637 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
|
| 638 |
+
#endif
|
| 639 |
+
#if defined( __cl_uchar2__)
|
| 640 |
+
__cl_uchar2 v2;
|
| 641 |
+
#endif
|
| 642 |
+
}cl_uchar2;
|
| 643 |
+
|
| 644 |
+
typedef union
|
| 645 |
+
{
|
| 646 |
+
cl_uchar CL_ALIGNED(4) s[4];
|
| 647 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 648 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
|
| 649 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
|
| 650 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
|
| 651 |
+
#endif
|
| 652 |
+
#if defined( __CL_UCHAR2__)
|
| 653 |
+
__cl_uchar2 v2[2];
|
| 654 |
+
#endif
|
| 655 |
+
#if defined( __CL_UCHAR4__)
|
| 656 |
+
__cl_uchar4 v4;
|
| 657 |
+
#endif
|
| 658 |
+
}cl_uchar4;
|
| 659 |
+
|
| 660 |
+
/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
|
| 661 |
+
typedef cl_uchar4 cl_uchar3;
|
| 662 |
+
|
| 663 |
+
typedef union
|
| 664 |
+
{
|
| 665 |
+
cl_uchar CL_ALIGNED(8) s[8];
|
| 666 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 667 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
|
| 668 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 669 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
|
| 670 |
+
#endif
|
| 671 |
+
#if defined( __CL_UCHAR2__)
|
| 672 |
+
__cl_uchar2 v2[4];
|
| 673 |
+
#endif
|
| 674 |
+
#if defined( __CL_UCHAR4__)
|
| 675 |
+
__cl_uchar4 v4[2];
|
| 676 |
+
#endif
|
| 677 |
+
#if defined( __CL_UCHAR8__ )
|
| 678 |
+
__cl_uchar8 v8;
|
| 679 |
+
#endif
|
| 680 |
+
}cl_uchar8;
|
| 681 |
+
|
| 682 |
+
typedef union
|
| 683 |
+
{
|
| 684 |
+
cl_uchar CL_ALIGNED(16) s[16];
|
| 685 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 686 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 687 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 688 |
+
__CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
|
| 689 |
+
#endif
|
| 690 |
+
#if defined( __CL_UCHAR2__)
|
| 691 |
+
__cl_uchar2 v2[8];
|
| 692 |
+
#endif
|
| 693 |
+
#if defined( __CL_UCHAR4__)
|
| 694 |
+
__cl_uchar4 v4[4];
|
| 695 |
+
#endif
|
| 696 |
+
#if defined( __CL_UCHAR8__ )
|
| 697 |
+
__cl_uchar8 v8[2];
|
| 698 |
+
#endif
|
| 699 |
+
#if defined( __CL_UCHAR16__ )
|
| 700 |
+
__cl_uchar16 v16;
|
| 701 |
+
#endif
|
| 702 |
+
}cl_uchar16;
|
| 703 |
+
|
| 704 |
+
|
| 705 |
+
/* ---- cl_shortn ---- */
|
| 706 |
+
typedef union
|
| 707 |
+
{
|
| 708 |
+
cl_short CL_ALIGNED(4) s[2];
|
| 709 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 710 |
+
__CL_ANON_STRUCT__ struct{ cl_short x, y; };
|
| 711 |
+
__CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
|
| 712 |
+
__CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
|
| 713 |
+
#endif
|
| 714 |
+
#if defined( __CL_SHORT2__)
|
| 715 |
+
__cl_short2 v2;
|
| 716 |
+
#endif
|
| 717 |
+
}cl_short2;
|
| 718 |
+
|
| 719 |
+
typedef union
|
| 720 |
+
{
|
| 721 |
+
cl_short CL_ALIGNED(8) s[4];
|
| 722 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 723 |
+
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
|
| 724 |
+
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
|
| 725 |
+
__CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
|
| 726 |
+
#endif
|
| 727 |
+
#if defined( __CL_SHORT2__)
|
| 728 |
+
__cl_short2 v2[2];
|
| 729 |
+
#endif
|
| 730 |
+
#if defined( __CL_SHORT4__)
|
| 731 |
+
__cl_short4 v4;
|
| 732 |
+
#endif
|
| 733 |
+
}cl_short4;
|
| 734 |
+
|
| 735 |
+
/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
|
| 736 |
+
typedef cl_short4 cl_short3;
|
| 737 |
+
|
| 738 |
+
typedef union
|
| 739 |
+
{
|
| 740 |
+
cl_short CL_ALIGNED(16) s[8];
|
| 741 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 742 |
+
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
|
| 743 |
+
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 744 |
+
__CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
|
| 745 |
+
#endif
|
| 746 |
+
#if defined( __CL_SHORT2__)
|
| 747 |
+
__cl_short2 v2[4];
|
| 748 |
+
#endif
|
| 749 |
+
#if defined( __CL_SHORT4__)
|
| 750 |
+
__cl_short4 v4[2];
|
| 751 |
+
#endif
|
| 752 |
+
#if defined( __CL_SHORT8__ )
|
| 753 |
+
__cl_short8 v8;
|
| 754 |
+
#endif
|
| 755 |
+
}cl_short8;
|
| 756 |
+
|
| 757 |
+
typedef union
|
| 758 |
+
{
|
| 759 |
+
cl_short CL_ALIGNED(32) s[16];
|
| 760 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 761 |
+
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 762 |
+
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 763 |
+
__CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
|
| 764 |
+
#endif
|
| 765 |
+
#if defined( __CL_SHORT2__)
|
| 766 |
+
__cl_short2 v2[8];
|
| 767 |
+
#endif
|
| 768 |
+
#if defined( __CL_SHORT4__)
|
| 769 |
+
__cl_short4 v4[4];
|
| 770 |
+
#endif
|
| 771 |
+
#if defined( __CL_SHORT8__ )
|
| 772 |
+
__cl_short8 v8[2];
|
| 773 |
+
#endif
|
| 774 |
+
#if defined( __CL_SHORT16__ )
|
| 775 |
+
__cl_short16 v16;
|
| 776 |
+
#endif
|
| 777 |
+
}cl_short16;
|
| 778 |
+
|
| 779 |
+
|
| 780 |
+
/* ---- cl_ushortn ---- */
|
| 781 |
+
typedef union
|
| 782 |
+
{
|
| 783 |
+
cl_ushort CL_ALIGNED(4) s[2];
|
| 784 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 785 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
|
| 786 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
|
| 787 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
|
| 788 |
+
#endif
|
| 789 |
+
#if defined( __CL_USHORT2__)
|
| 790 |
+
__cl_ushort2 v2;
|
| 791 |
+
#endif
|
| 792 |
+
}cl_ushort2;
|
| 793 |
+
|
| 794 |
+
typedef union
|
| 795 |
+
{
|
| 796 |
+
cl_ushort CL_ALIGNED(8) s[4];
|
| 797 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 798 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
|
| 799 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
|
| 800 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
|
| 801 |
+
#endif
|
| 802 |
+
#if defined( __CL_USHORT2__)
|
| 803 |
+
__cl_ushort2 v2[2];
|
| 804 |
+
#endif
|
| 805 |
+
#if defined( __CL_USHORT4__)
|
| 806 |
+
__cl_ushort4 v4;
|
| 807 |
+
#endif
|
| 808 |
+
}cl_ushort4;
|
| 809 |
+
|
| 810 |
+
/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
|
| 811 |
+
typedef cl_ushort4 cl_ushort3;
|
| 812 |
+
|
| 813 |
+
typedef union
|
| 814 |
+
{
|
| 815 |
+
cl_ushort CL_ALIGNED(16) s[8];
|
| 816 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 817 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
|
| 818 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 819 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
|
| 820 |
+
#endif
|
| 821 |
+
#if defined( __CL_USHORT2__)
|
| 822 |
+
__cl_ushort2 v2[4];
|
| 823 |
+
#endif
|
| 824 |
+
#if defined( __CL_USHORT4__)
|
| 825 |
+
__cl_ushort4 v4[2];
|
| 826 |
+
#endif
|
| 827 |
+
#if defined( __CL_USHORT8__ )
|
| 828 |
+
__cl_ushort8 v8;
|
| 829 |
+
#endif
|
| 830 |
+
}cl_ushort8;
|
| 831 |
+
|
| 832 |
+
typedef union
|
| 833 |
+
{
|
| 834 |
+
cl_ushort CL_ALIGNED(32) s[16];
|
| 835 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 836 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 837 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 838 |
+
__CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
|
| 839 |
+
#endif
|
| 840 |
+
#if defined( __CL_USHORT2__)
|
| 841 |
+
__cl_ushort2 v2[8];
|
| 842 |
+
#endif
|
| 843 |
+
#if defined( __CL_USHORT4__)
|
| 844 |
+
__cl_ushort4 v4[4];
|
| 845 |
+
#endif
|
| 846 |
+
#if defined( __CL_USHORT8__ )
|
| 847 |
+
__cl_ushort8 v8[2];
|
| 848 |
+
#endif
|
| 849 |
+
#if defined( __CL_USHORT16__ )
|
| 850 |
+
__cl_ushort16 v16;
|
| 851 |
+
#endif
|
| 852 |
+
}cl_ushort16;
|
| 853 |
+
|
| 854 |
+
|
| 855 |
+
/* ---- cl_halfn ---- */
|
| 856 |
+
typedef union
|
| 857 |
+
{
|
| 858 |
+
cl_half CL_ALIGNED(4) s[2];
|
| 859 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 860 |
+
__CL_ANON_STRUCT__ struct{ cl_half x, y; };
|
| 861 |
+
__CL_ANON_STRUCT__ struct{ cl_half s0, s1; };
|
| 862 |
+
__CL_ANON_STRUCT__ struct{ cl_half lo, hi; };
|
| 863 |
+
#endif
|
| 864 |
+
#if defined( __CL_HALF2__)
|
| 865 |
+
__cl_half2 v2;
|
| 866 |
+
#endif
|
| 867 |
+
}cl_half2;
|
| 868 |
+
|
| 869 |
+
typedef union
|
| 870 |
+
{
|
| 871 |
+
cl_half CL_ALIGNED(8) s[4];
|
| 872 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 873 |
+
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
|
| 874 |
+
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; };
|
| 875 |
+
__CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
|
| 876 |
+
#endif
|
| 877 |
+
#if defined( __CL_HALF2__)
|
| 878 |
+
__cl_half2 v2[2];
|
| 879 |
+
#endif
|
| 880 |
+
#if defined( __CL_HALF4__)
|
| 881 |
+
__cl_half4 v4;
|
| 882 |
+
#endif
|
| 883 |
+
}cl_half4;
|
| 884 |
+
|
| 885 |
+
/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
|
| 886 |
+
typedef cl_half4 cl_half3;
|
| 887 |
+
|
| 888 |
+
typedef union
|
| 889 |
+
{
|
| 890 |
+
cl_half CL_ALIGNED(16) s[8];
|
| 891 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 892 |
+
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
|
| 893 |
+
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 894 |
+
__CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
|
| 895 |
+
#endif
|
| 896 |
+
#if defined( __CL_HALF2__)
|
| 897 |
+
__cl_half2 v2[4];
|
| 898 |
+
#endif
|
| 899 |
+
#if defined( __CL_HALF4__)
|
| 900 |
+
__cl_half4 v4[2];
|
| 901 |
+
#endif
|
| 902 |
+
#if defined( __CL_HALF8__ )
|
| 903 |
+
__cl_half8 v8;
|
| 904 |
+
#endif
|
| 905 |
+
}cl_half8;
|
| 906 |
+
|
| 907 |
+
typedef union
|
| 908 |
+
{
|
| 909 |
+
cl_half CL_ALIGNED(32) s[16];
|
| 910 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 911 |
+
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 912 |
+
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 913 |
+
__CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
|
| 914 |
+
#endif
|
| 915 |
+
#if defined( __CL_HALF2__)
|
| 916 |
+
__cl_half2 v2[8];
|
| 917 |
+
#endif
|
| 918 |
+
#if defined( __CL_HALF4__)
|
| 919 |
+
__cl_half4 v4[4];
|
| 920 |
+
#endif
|
| 921 |
+
#if defined( __CL_HALF8__ )
|
| 922 |
+
__cl_half8 v8[2];
|
| 923 |
+
#endif
|
| 924 |
+
#if defined( __CL_HALF16__ )
|
| 925 |
+
__cl_half16 v16;
|
| 926 |
+
#endif
|
| 927 |
+
}cl_half16;
|
| 928 |
+
|
| 929 |
+
/* ---- cl_intn ---- */
|
| 930 |
+
typedef union
|
| 931 |
+
{
|
| 932 |
+
cl_int CL_ALIGNED(8) s[2];
|
| 933 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 934 |
+
__CL_ANON_STRUCT__ struct{ cl_int x, y; };
|
| 935 |
+
__CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
|
| 936 |
+
__CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
|
| 937 |
+
#endif
|
| 938 |
+
#if defined( __CL_INT2__)
|
| 939 |
+
__cl_int2 v2;
|
| 940 |
+
#endif
|
| 941 |
+
}cl_int2;
|
| 942 |
+
|
| 943 |
+
typedef union
|
| 944 |
+
{
|
| 945 |
+
cl_int CL_ALIGNED(16) s[4];
|
| 946 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 947 |
+
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
|
| 948 |
+
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
|
| 949 |
+
__CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
|
| 950 |
+
#endif
|
| 951 |
+
#if defined( __CL_INT2__)
|
| 952 |
+
__cl_int2 v2[2];
|
| 953 |
+
#endif
|
| 954 |
+
#if defined( __CL_INT4__)
|
| 955 |
+
__cl_int4 v4;
|
| 956 |
+
#endif
|
| 957 |
+
}cl_int4;
|
| 958 |
+
|
| 959 |
+
/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
|
| 960 |
+
typedef cl_int4 cl_int3;
|
| 961 |
+
|
| 962 |
+
typedef union
|
| 963 |
+
{
|
| 964 |
+
cl_int CL_ALIGNED(32) s[8];
|
| 965 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 966 |
+
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
|
| 967 |
+
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 968 |
+
__CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
|
| 969 |
+
#endif
|
| 970 |
+
#if defined( __CL_INT2__)
|
| 971 |
+
__cl_int2 v2[4];
|
| 972 |
+
#endif
|
| 973 |
+
#if defined( __CL_INT4__)
|
| 974 |
+
__cl_int4 v4[2];
|
| 975 |
+
#endif
|
| 976 |
+
#if defined( __CL_INT8__ )
|
| 977 |
+
__cl_int8 v8;
|
| 978 |
+
#endif
|
| 979 |
+
}cl_int8;
|
| 980 |
+
|
| 981 |
+
typedef union
|
| 982 |
+
{
|
| 983 |
+
cl_int CL_ALIGNED(64) s[16];
|
| 984 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 985 |
+
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 986 |
+
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 987 |
+
__CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
|
| 988 |
+
#endif
|
| 989 |
+
#if defined( __CL_INT2__)
|
| 990 |
+
__cl_int2 v2[8];
|
| 991 |
+
#endif
|
| 992 |
+
#if defined( __CL_INT4__)
|
| 993 |
+
__cl_int4 v4[4];
|
| 994 |
+
#endif
|
| 995 |
+
#if defined( __CL_INT8__ )
|
| 996 |
+
__cl_int8 v8[2];
|
| 997 |
+
#endif
|
| 998 |
+
#if defined( __CL_INT16__ )
|
| 999 |
+
__cl_int16 v16;
|
| 1000 |
+
#endif
|
| 1001 |
+
}cl_int16;
|
| 1002 |
+
|
| 1003 |
+
|
| 1004 |
+
/* ---- cl_uintn ---- */
|
| 1005 |
+
typedef union
|
| 1006 |
+
{
|
| 1007 |
+
cl_uint CL_ALIGNED(8) s[2];
|
| 1008 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1009 |
+
__CL_ANON_STRUCT__ struct{ cl_uint x, y; };
|
| 1010 |
+
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
|
| 1011 |
+
__CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
|
| 1012 |
+
#endif
|
| 1013 |
+
#if defined( __CL_UINT2__)
|
| 1014 |
+
__cl_uint2 v2;
|
| 1015 |
+
#endif
|
| 1016 |
+
}cl_uint2;
|
| 1017 |
+
|
| 1018 |
+
typedef union
|
| 1019 |
+
{
|
| 1020 |
+
cl_uint CL_ALIGNED(16) s[4];
|
| 1021 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1022 |
+
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
|
| 1023 |
+
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
|
| 1024 |
+
__CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
|
| 1025 |
+
#endif
|
| 1026 |
+
#if defined( __CL_UINT2__)
|
| 1027 |
+
__cl_uint2 v2[2];
|
| 1028 |
+
#endif
|
| 1029 |
+
#if defined( __CL_UINT4__)
|
| 1030 |
+
__cl_uint4 v4;
|
| 1031 |
+
#endif
|
| 1032 |
+
}cl_uint4;
|
| 1033 |
+
|
| 1034 |
+
/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
|
| 1035 |
+
typedef cl_uint4 cl_uint3;
|
| 1036 |
+
|
| 1037 |
+
typedef union
|
| 1038 |
+
{
|
| 1039 |
+
cl_uint CL_ALIGNED(32) s[8];
|
| 1040 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1041 |
+
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
|
| 1042 |
+
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 1043 |
+
__CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
|
| 1044 |
+
#endif
|
| 1045 |
+
#if defined( __CL_UINT2__)
|
| 1046 |
+
__cl_uint2 v2[4];
|
| 1047 |
+
#endif
|
| 1048 |
+
#if defined( __CL_UINT4__)
|
| 1049 |
+
__cl_uint4 v4[2];
|
| 1050 |
+
#endif
|
| 1051 |
+
#if defined( __CL_UINT8__ )
|
| 1052 |
+
__cl_uint8 v8;
|
| 1053 |
+
#endif
|
| 1054 |
+
}cl_uint8;
|
| 1055 |
+
|
| 1056 |
+
typedef union
|
| 1057 |
+
{
|
| 1058 |
+
cl_uint CL_ALIGNED(64) s[16];
|
| 1059 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1060 |
+
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 1061 |
+
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 1062 |
+
__CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
|
| 1063 |
+
#endif
|
| 1064 |
+
#if defined( __CL_UINT2__)
|
| 1065 |
+
__cl_uint2 v2[8];
|
| 1066 |
+
#endif
|
| 1067 |
+
#if defined( __CL_UINT4__)
|
| 1068 |
+
__cl_uint4 v4[4];
|
| 1069 |
+
#endif
|
| 1070 |
+
#if defined( __CL_UINT8__ )
|
| 1071 |
+
__cl_uint8 v8[2];
|
| 1072 |
+
#endif
|
| 1073 |
+
#if defined( __CL_UINT16__ )
|
| 1074 |
+
__cl_uint16 v16;
|
| 1075 |
+
#endif
|
| 1076 |
+
}cl_uint16;
|
| 1077 |
+
|
| 1078 |
+
/* ---- cl_longn ---- */
|
| 1079 |
+
typedef union
|
| 1080 |
+
{
|
| 1081 |
+
cl_long CL_ALIGNED(16) s[2];
|
| 1082 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1083 |
+
__CL_ANON_STRUCT__ struct{ cl_long x, y; };
|
| 1084 |
+
__CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
|
| 1085 |
+
__CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
|
| 1086 |
+
#endif
|
| 1087 |
+
#if defined( __CL_LONG2__)
|
| 1088 |
+
__cl_long2 v2;
|
| 1089 |
+
#endif
|
| 1090 |
+
}cl_long2;
|
| 1091 |
+
|
| 1092 |
+
typedef union
|
| 1093 |
+
{
|
| 1094 |
+
cl_long CL_ALIGNED(32) s[4];
|
| 1095 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1096 |
+
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
|
| 1097 |
+
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
|
| 1098 |
+
__CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
|
| 1099 |
+
#endif
|
| 1100 |
+
#if defined( __CL_LONG2__)
|
| 1101 |
+
__cl_long2 v2[2];
|
| 1102 |
+
#endif
|
| 1103 |
+
#if defined( __CL_LONG4__)
|
| 1104 |
+
__cl_long4 v4;
|
| 1105 |
+
#endif
|
| 1106 |
+
}cl_long4;
|
| 1107 |
+
|
| 1108 |
+
/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
|
| 1109 |
+
typedef cl_long4 cl_long3;
|
| 1110 |
+
|
| 1111 |
+
typedef union
|
| 1112 |
+
{
|
| 1113 |
+
cl_long CL_ALIGNED(64) s[8];
|
| 1114 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1115 |
+
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
|
| 1116 |
+
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 1117 |
+
__CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
|
| 1118 |
+
#endif
|
| 1119 |
+
#if defined( __CL_LONG2__)
|
| 1120 |
+
__cl_long2 v2[4];
|
| 1121 |
+
#endif
|
| 1122 |
+
#if defined( __CL_LONG4__)
|
| 1123 |
+
__cl_long4 v4[2];
|
| 1124 |
+
#endif
|
| 1125 |
+
#if defined( __CL_LONG8__ )
|
| 1126 |
+
__cl_long8 v8;
|
| 1127 |
+
#endif
|
| 1128 |
+
}cl_long8;
|
| 1129 |
+
|
| 1130 |
+
typedef union
|
| 1131 |
+
{
|
| 1132 |
+
cl_long CL_ALIGNED(128) s[16];
|
| 1133 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1134 |
+
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 1135 |
+
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 1136 |
+
__CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
|
| 1137 |
+
#endif
|
| 1138 |
+
#if defined( __CL_LONG2__)
|
| 1139 |
+
__cl_long2 v2[8];
|
| 1140 |
+
#endif
|
| 1141 |
+
#if defined( __CL_LONG4__)
|
| 1142 |
+
__cl_long4 v4[4];
|
| 1143 |
+
#endif
|
| 1144 |
+
#if defined( __CL_LONG8__ )
|
| 1145 |
+
__cl_long8 v8[2];
|
| 1146 |
+
#endif
|
| 1147 |
+
#if defined( __CL_LONG16__ )
|
| 1148 |
+
__cl_long16 v16;
|
| 1149 |
+
#endif
|
| 1150 |
+
}cl_long16;
|
| 1151 |
+
|
| 1152 |
+
|
| 1153 |
+
/* ---- cl_ulongn ---- */
|
| 1154 |
+
typedef union
|
| 1155 |
+
{
|
| 1156 |
+
cl_ulong CL_ALIGNED(16) s[2];
|
| 1157 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1158 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
|
| 1159 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
|
| 1160 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
|
| 1161 |
+
#endif
|
| 1162 |
+
#if defined( __CL_ULONG2__)
|
| 1163 |
+
__cl_ulong2 v2;
|
| 1164 |
+
#endif
|
| 1165 |
+
}cl_ulong2;
|
| 1166 |
+
|
| 1167 |
+
typedef union
|
| 1168 |
+
{
|
| 1169 |
+
cl_ulong CL_ALIGNED(32) s[4];
|
| 1170 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1171 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
|
| 1172 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
|
| 1173 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
|
| 1174 |
+
#endif
|
| 1175 |
+
#if defined( __CL_ULONG2__)
|
| 1176 |
+
__cl_ulong2 v2[2];
|
| 1177 |
+
#endif
|
| 1178 |
+
#if defined( __CL_ULONG4__)
|
| 1179 |
+
__cl_ulong4 v4;
|
| 1180 |
+
#endif
|
| 1181 |
+
}cl_ulong4;
|
| 1182 |
+
|
| 1183 |
+
/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
|
| 1184 |
+
typedef cl_ulong4 cl_ulong3;
|
| 1185 |
+
|
| 1186 |
+
typedef union
|
| 1187 |
+
{
|
| 1188 |
+
cl_ulong CL_ALIGNED(64) s[8];
|
| 1189 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1190 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
|
| 1191 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 1192 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
|
| 1193 |
+
#endif
|
| 1194 |
+
#if defined( __CL_ULONG2__)
|
| 1195 |
+
__cl_ulong2 v2[4];
|
| 1196 |
+
#endif
|
| 1197 |
+
#if defined( __CL_ULONG4__)
|
| 1198 |
+
__cl_ulong4 v4[2];
|
| 1199 |
+
#endif
|
| 1200 |
+
#if defined( __CL_ULONG8__ )
|
| 1201 |
+
__cl_ulong8 v8;
|
| 1202 |
+
#endif
|
| 1203 |
+
}cl_ulong8;
|
| 1204 |
+
|
| 1205 |
+
typedef union
|
| 1206 |
+
{
|
| 1207 |
+
cl_ulong CL_ALIGNED(128) s[16];
|
| 1208 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1209 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 1210 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 1211 |
+
__CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
|
| 1212 |
+
#endif
|
| 1213 |
+
#if defined( __CL_ULONG2__)
|
| 1214 |
+
__cl_ulong2 v2[8];
|
| 1215 |
+
#endif
|
| 1216 |
+
#if defined( __CL_ULONG4__)
|
| 1217 |
+
__cl_ulong4 v4[4];
|
| 1218 |
+
#endif
|
| 1219 |
+
#if defined( __CL_ULONG8__ )
|
| 1220 |
+
__cl_ulong8 v8[2];
|
| 1221 |
+
#endif
|
| 1222 |
+
#if defined( __CL_ULONG16__ )
|
| 1223 |
+
__cl_ulong16 v16;
|
| 1224 |
+
#endif
|
| 1225 |
+
}cl_ulong16;
|
| 1226 |
+
|
| 1227 |
+
|
| 1228 |
+
/* --- cl_floatn ---- */
|
| 1229 |
+
|
| 1230 |
+
typedef union
|
| 1231 |
+
{
|
| 1232 |
+
cl_float CL_ALIGNED(8) s[2];
|
| 1233 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1234 |
+
__CL_ANON_STRUCT__ struct{ cl_float x, y; };
|
| 1235 |
+
__CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
|
| 1236 |
+
__CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
|
| 1237 |
+
#endif
|
| 1238 |
+
#if defined( __CL_FLOAT2__)
|
| 1239 |
+
__cl_float2 v2;
|
| 1240 |
+
#endif
|
| 1241 |
+
}cl_float2;
|
| 1242 |
+
|
| 1243 |
+
typedef union
|
| 1244 |
+
{
|
| 1245 |
+
cl_float CL_ALIGNED(16) s[4];
|
| 1246 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1247 |
+
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
|
| 1248 |
+
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
|
| 1249 |
+
__CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
|
| 1250 |
+
#endif
|
| 1251 |
+
#if defined( __CL_FLOAT2__)
|
| 1252 |
+
__cl_float2 v2[2];
|
| 1253 |
+
#endif
|
| 1254 |
+
#if defined( __CL_FLOAT4__)
|
| 1255 |
+
__cl_float4 v4;
|
| 1256 |
+
#endif
|
| 1257 |
+
}cl_float4;
|
| 1258 |
+
|
| 1259 |
+
/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
|
| 1260 |
+
typedef cl_float4 cl_float3;
|
| 1261 |
+
|
| 1262 |
+
typedef union
|
| 1263 |
+
{
|
| 1264 |
+
cl_float CL_ALIGNED(32) s[8];
|
| 1265 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1266 |
+
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
|
| 1267 |
+
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 1268 |
+
__CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
|
| 1269 |
+
#endif
|
| 1270 |
+
#if defined( __CL_FLOAT2__)
|
| 1271 |
+
__cl_float2 v2[4];
|
| 1272 |
+
#endif
|
| 1273 |
+
#if defined( __CL_FLOAT4__)
|
| 1274 |
+
__cl_float4 v4[2];
|
| 1275 |
+
#endif
|
| 1276 |
+
#if defined( __CL_FLOAT8__ )
|
| 1277 |
+
__cl_float8 v8;
|
| 1278 |
+
#endif
|
| 1279 |
+
}cl_float8;
|
| 1280 |
+
|
| 1281 |
+
typedef union
|
| 1282 |
+
{
|
| 1283 |
+
cl_float CL_ALIGNED(64) s[16];
|
| 1284 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1285 |
+
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 1286 |
+
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 1287 |
+
__CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
|
| 1288 |
+
#endif
|
| 1289 |
+
#if defined( __CL_FLOAT2__)
|
| 1290 |
+
__cl_float2 v2[8];
|
| 1291 |
+
#endif
|
| 1292 |
+
#if defined( __CL_FLOAT4__)
|
| 1293 |
+
__cl_float4 v4[4];
|
| 1294 |
+
#endif
|
| 1295 |
+
#if defined( __CL_FLOAT8__ )
|
| 1296 |
+
__cl_float8 v8[2];
|
| 1297 |
+
#endif
|
| 1298 |
+
#if defined( __CL_FLOAT16__ )
|
| 1299 |
+
__cl_float16 v16;
|
| 1300 |
+
#endif
|
| 1301 |
+
}cl_float16;
|
| 1302 |
+
|
| 1303 |
+
/* --- cl_doublen ---- */
|
| 1304 |
+
|
| 1305 |
+
typedef union
|
| 1306 |
+
{
|
| 1307 |
+
cl_double CL_ALIGNED(16) s[2];
|
| 1308 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1309 |
+
__CL_ANON_STRUCT__ struct{ cl_double x, y; };
|
| 1310 |
+
__CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
|
| 1311 |
+
__CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
|
| 1312 |
+
#endif
|
| 1313 |
+
#if defined( __CL_DOUBLE2__)
|
| 1314 |
+
__cl_double2 v2;
|
| 1315 |
+
#endif
|
| 1316 |
+
}cl_double2;
|
| 1317 |
+
|
| 1318 |
+
typedef union
|
| 1319 |
+
{
|
| 1320 |
+
cl_double CL_ALIGNED(32) s[4];
|
| 1321 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1322 |
+
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
|
| 1323 |
+
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
|
| 1324 |
+
__CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
|
| 1325 |
+
#endif
|
| 1326 |
+
#if defined( __CL_DOUBLE2__)
|
| 1327 |
+
__cl_double2 v2[2];
|
| 1328 |
+
#endif
|
| 1329 |
+
#if defined( __CL_DOUBLE4__)
|
| 1330 |
+
__cl_double4 v4;
|
| 1331 |
+
#endif
|
| 1332 |
+
}cl_double4;
|
| 1333 |
+
|
| 1334 |
+
/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
|
| 1335 |
+
typedef cl_double4 cl_double3;
|
| 1336 |
+
|
| 1337 |
+
typedef union
|
| 1338 |
+
{
|
| 1339 |
+
cl_double CL_ALIGNED(64) s[8];
|
| 1340 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1341 |
+
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
|
| 1342 |
+
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
|
| 1343 |
+
__CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
|
| 1344 |
+
#endif
|
| 1345 |
+
#if defined( __CL_DOUBLE2__)
|
| 1346 |
+
__cl_double2 v2[4];
|
| 1347 |
+
#endif
|
| 1348 |
+
#if defined( __CL_DOUBLE4__)
|
| 1349 |
+
__cl_double4 v4[2];
|
| 1350 |
+
#endif
|
| 1351 |
+
#if defined( __CL_DOUBLE8__ )
|
| 1352 |
+
__cl_double8 v8;
|
| 1353 |
+
#endif
|
| 1354 |
+
}cl_double8;
|
| 1355 |
+
|
| 1356 |
+
typedef union
|
| 1357 |
+
{
|
| 1358 |
+
cl_double CL_ALIGNED(128) s[16];
|
| 1359 |
+
#if __CL_HAS_ANON_STRUCT__
|
| 1360 |
+
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
|
| 1361 |
+
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
|
| 1362 |
+
__CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
|
| 1363 |
+
#endif
|
| 1364 |
+
#if defined( __CL_DOUBLE2__)
|
| 1365 |
+
__cl_double2 v2[8];
|
| 1366 |
+
#endif
|
| 1367 |
+
#if defined( __CL_DOUBLE4__)
|
| 1368 |
+
__cl_double4 v4[4];
|
| 1369 |
+
#endif
|
| 1370 |
+
#if defined( __CL_DOUBLE8__ )
|
| 1371 |
+
__cl_double8 v8[2];
|
| 1372 |
+
#endif
|
| 1373 |
+
#if defined( __CL_DOUBLE16__ )
|
| 1374 |
+
__cl_double16 v16;
|
| 1375 |
+
#endif
|
| 1376 |
+
}cl_double16;
|
| 1377 |
+
|
| 1378 |
+
/* Macro to facilitate debugging
|
| 1379 |
+
* Usage:
|
| 1380 |
+
* Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
|
| 1381 |
+
* The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
|
| 1382 |
+
* Each line thereafter of OpenCL C source must end with: \n\
|
| 1383 |
+
* The last line ends in ";
|
| 1384 |
+
*
|
| 1385 |
+
* Example:
|
| 1386 |
+
*
|
| 1387 |
+
* const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
|
| 1388 |
+
* kernel void foo( int a, float * b ) \n\
|
| 1389 |
+
* { \n\
|
| 1390 |
+
* // my comment \n\
|
| 1391 |
+
* *b[ get_global_id(0)] = a; \n\
|
| 1392 |
+
* } \n\
|
| 1393 |
+
* ";
|
| 1394 |
+
*
|
| 1395 |
+
* This should correctly set up the line, (column) and file information for your source
|
| 1396 |
+
* string so you can do source level debugging.
|
| 1397 |
+
*/
|
| 1398 |
+
#define __CL_STRINGIFY( _x ) # _x
|
| 1399 |
+
#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
|
| 1400 |
+
#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
|
| 1401 |
+
|
| 1402 |
+
#ifdef __cplusplus
|
| 1403 |
+
}
|
| 1404 |
+
#endif
|
| 1405 |
+
|
| 1406 |
+
#undef __CL_HAS_ANON_STRUCT__
|
| 1407 |
+
#undef __CL_ANON_STRUCT__
|
| 1408 |
+
#if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
|
| 1409 |
+
#if _MSC_VER >=1500
|
| 1410 |
+
#pragma warning( pop )
|
| 1411 |
+
#endif
|
| 1412 |
+
#endif
|
| 1413 |
+
|
| 1414 |
+
#endif /* __CL_PLATFORM_H */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*******************************************************************************
|
| 2 |
+
* Copyright (c) 2008-2020 The Khronos Group Inc.
|
| 3 |
+
*
|
| 4 |
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
* you may not use this file except in compliance with the License.
|
| 6 |
+
* You may obtain a copy of the License at
|
| 7 |
+
*
|
| 8 |
+
* http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
*
|
| 10 |
+
* Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
* See the License for the specific language governing permissions and
|
| 14 |
+
* limitations under the License.
|
| 15 |
+
******************************************************************************/
|
| 16 |
+
|
| 17 |
+
#ifndef __OPENCL_H
|
| 18 |
+
#define __OPENCL_H
|
| 19 |
+
|
| 20 |
+
#ifdef __cplusplus
|
| 21 |
+
extern "C" {
|
| 22 |
+
#endif
|
| 23 |
+
|
| 24 |
+
#ifdef __APPLE__
|
| 25 |
+
#include <OpenCL/cl.h>
|
| 26 |
+
#include <OpenCL/cl_gl.h>
|
| 27 |
+
#include <OpenCL/cl_gl_ext.h>
|
| 28 |
+
#include <OpenCL/cl_ext.h>
|
| 29 |
+
#else
|
| 30 |
+
#include <CL/cl.h>
|
| 31 |
+
#include <CL/cl_gl.h>
|
| 32 |
+
#include <CL/cl_gl_ext.h>
|
| 33 |
+
#include <CL/cl_ext.h>
|
| 34 |
+
#endif
|
| 35 |
+
|
| 36 |
+
#ifdef __cplusplus
|
| 37 |
+
}
|
| 38 |
+
#endif
|
| 39 |
+
|
| 40 |
+
#endif /* __OPENCL_H */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (228 Bytes). View file
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
/*******************************************************************************
|
| 51 |
+
* *
|
| 52 |
+
* *
|
| 53 |
+
* *
|
| 54 |
+
*******************************************************************************/
|
| 55 |
+
|
| 56 |
+
#include "device_types.h"
|
| 57 |
+
#if !defined(__CUDACC_RTC__)
|
| 58 |
+
#define EXCLUDE_FROM_RTC
|
| 59 |
+
#include "driver_types.h"
|
| 60 |
+
#undef EXCLUDE_FROM_RTC
|
| 61 |
+
#endif /* !__CUDACC_RTC__ */
|
| 62 |
+
#include "surface_types.h"
|
| 63 |
+
#include "texture_types.h"
|
| 64 |
+
#include "vector_types.h"
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h
ADDED
|
@@ -0,0 +1,595 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CHANNEL_DESCRIPTOR_H__)
|
| 51 |
+
#define __CHANNEL_DESCRIPTOR_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus)
|
| 54 |
+
|
| 55 |
+
/*******************************************************************************
|
| 56 |
+
* *
|
| 57 |
+
* *
|
| 58 |
+
* *
|
| 59 |
+
*******************************************************************************/
|
| 60 |
+
|
| 61 |
+
#include "cuda_runtime_api.h"
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
/**
|
| 70 |
+
* \addtogroup CUDART_HIGHLEVEL
|
| 71 |
+
*
|
| 72 |
+
* @{
|
| 73 |
+
*/
|
| 74 |
+
|
| 75 |
+
/**
|
| 76 |
+
* \brief \hl Returns a channel descriptor using the specified format
|
| 77 |
+
*
|
| 78 |
+
* Returns a channel descriptor with format \p f and number of bits of each
|
| 79 |
+
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
|
| 80 |
+
* defined as:
|
| 81 |
+
* \code
|
| 82 |
+
struct cudaChannelFormatDesc {
|
| 83 |
+
int x, y, z, w;
|
| 84 |
+
enum cudaChannelFormatKind f;
|
| 85 |
+
};
|
| 86 |
+
* \endcode
|
| 87 |
+
*
|
| 88 |
+
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
|
| 89 |
+
* ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
|
| 90 |
+
* ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
|
| 91 |
+
* ::cudaChannelFormatKindSignedNormalized8X4,
|
| 92 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
|
| 93 |
+
* ::cudaChannelFormatKindUnsignedNormalized8X4,
|
| 94 |
+
* ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
|
| 95 |
+
* ::cudaChannelFormatKindSignedNormalized16X4,
|
| 96 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
|
| 97 |
+
* ::cudaChannelFormatKindUnsignedNormalized16X4
|
| 98 |
+
* or ::cudaChannelFormatKindNV12.
|
| 99 |
+
*
|
| 100 |
+
* The format is specified by the template specialization.
|
| 101 |
+
*
|
| 102 |
+
* The template function specializes for the following scalar types:
|
| 103 |
+
* char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
|
| 104 |
+
* The template function specializes for the following vector types:
|
| 105 |
+
* char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
|
| 106 |
+
* The template function specializes for following cudaChannelFormatKind enum values:
|
| 107 |
+
* ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
|
| 108 |
+
*
|
| 109 |
+
* Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
|
| 110 |
+
*
|
| 111 |
+
* \return
|
| 112 |
+
* Channel descriptor with format \p f
|
| 113 |
+
*
|
| 114 |
+
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
|
| 115 |
+
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
|
| 116 |
+
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
|
| 117 |
+
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
|
| 118 |
+
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
|
| 119 |
+
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
|
| 120 |
+
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
|
| 121 |
+
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
|
| 122 |
+
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
|
| 123 |
+
*/
|
| 124 |
+
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
| 125 |
+
{
|
| 126 |
+
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
|
| 130 |
+
{
|
| 131 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 132 |
+
|
| 133 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
|
| 137 |
+
{
|
| 138 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 139 |
+
|
| 140 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
|
| 144 |
+
{
|
| 145 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 146 |
+
|
| 147 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
|
| 151 |
+
{
|
| 152 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 153 |
+
|
| 154 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
|
| 158 |
+
{
|
| 159 |
+
int e = (int)sizeof(char) * 8;
|
| 160 |
+
|
| 161 |
+
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
|
| 162 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 163 |
+
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
| 164 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 165 |
+
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
|
| 169 |
+
{
|
| 170 |
+
int e = (int)sizeof(signed char) * 8;
|
| 171 |
+
|
| 172 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
|
| 176 |
+
{
|
| 177 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 178 |
+
|
| 179 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
|
| 183 |
+
{
|
| 184 |
+
int e = (int)sizeof(signed char) * 8;
|
| 185 |
+
|
| 186 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
|
| 190 |
+
{
|
| 191 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 192 |
+
|
| 193 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
|
| 197 |
+
{
|
| 198 |
+
int e = (int)sizeof(signed char) * 8;
|
| 199 |
+
|
| 200 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
|
| 204 |
+
{
|
| 205 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 206 |
+
|
| 207 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
|
| 211 |
+
{
|
| 212 |
+
int e = (int)sizeof(signed char) * 8;
|
| 213 |
+
|
| 214 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
|
| 218 |
+
{
|
| 219 |
+
int e = (int)sizeof(unsigned char) * 8;
|
| 220 |
+
|
| 221 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
|
| 225 |
+
{
|
| 226 |
+
int e = (int)sizeof(short) * 8;
|
| 227 |
+
|
| 228 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
|
| 232 |
+
{
|
| 233 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 234 |
+
|
| 235 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
|
| 239 |
+
{
|
| 240 |
+
int e = (int)sizeof(short) * 8;
|
| 241 |
+
|
| 242 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
|
| 246 |
+
{
|
| 247 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 248 |
+
|
| 249 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
|
| 253 |
+
{
|
| 254 |
+
int e = (int)sizeof(short) * 8;
|
| 255 |
+
|
| 256 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
|
| 260 |
+
{
|
| 261 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 262 |
+
|
| 263 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
|
| 267 |
+
{
|
| 268 |
+
int e = (int)sizeof(short) * 8;
|
| 269 |
+
|
| 270 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
|
| 274 |
+
{
|
| 275 |
+
int e = (int)sizeof(unsigned short) * 8;
|
| 276 |
+
|
| 277 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
|
| 281 |
+
{
|
| 282 |
+
int e = (int)sizeof(int) * 8;
|
| 283 |
+
|
| 284 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
|
| 288 |
+
{
|
| 289 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 290 |
+
|
| 291 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
|
| 295 |
+
{
|
| 296 |
+
int e = (int)sizeof(int) * 8;
|
| 297 |
+
|
| 298 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
|
| 302 |
+
{
|
| 303 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 304 |
+
|
| 305 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
|
| 309 |
+
{
|
| 310 |
+
int e = (int)sizeof(int) * 8;
|
| 311 |
+
|
| 312 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
|
| 316 |
+
{
|
| 317 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 318 |
+
|
| 319 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
|
| 323 |
+
{
|
| 324 |
+
int e = (int)sizeof(int) * 8;
|
| 325 |
+
|
| 326 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
|
| 330 |
+
{
|
| 331 |
+
int e = (int)sizeof(unsigned int) * 8;
|
| 332 |
+
|
| 333 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
#if !defined(__LP64__)
|
| 337 |
+
|
| 338 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
|
| 339 |
+
{
|
| 340 |
+
int e = (int)sizeof(long) * 8;
|
| 341 |
+
|
| 342 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
|
| 346 |
+
{
|
| 347 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 348 |
+
|
| 349 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
|
| 353 |
+
{
|
| 354 |
+
int e = (int)sizeof(long) * 8;
|
| 355 |
+
|
| 356 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
|
| 360 |
+
{
|
| 361 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 362 |
+
|
| 363 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
|
| 367 |
+
{
|
| 368 |
+
int e = (int)sizeof(long) * 8;
|
| 369 |
+
|
| 370 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
|
| 374 |
+
{
|
| 375 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 376 |
+
|
| 377 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
|
| 381 |
+
{
|
| 382 |
+
int e = (int)sizeof(long) * 8;
|
| 383 |
+
|
| 384 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
|
| 388 |
+
{
|
| 389 |
+
int e = (int)sizeof(unsigned long) * 8;
|
| 390 |
+
|
| 391 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
#endif /* !__LP64__ */
|
| 395 |
+
|
| 396 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
|
| 397 |
+
{
|
| 398 |
+
int e = (int)sizeof(float) * 8;
|
| 399 |
+
|
| 400 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
|
| 404 |
+
{
|
| 405 |
+
int e = (int)sizeof(float) * 8;
|
| 406 |
+
|
| 407 |
+
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
|
| 411 |
+
{
|
| 412 |
+
int e = (int)sizeof(float) * 8;
|
| 413 |
+
|
| 414 |
+
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
|
| 418 |
+
{
|
| 419 |
+
int e = (int)sizeof(float) * 8;
|
| 420 |
+
|
| 421 |
+
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
|
| 425 |
+
{
|
| 426 |
+
int e = (int)sizeof(char) * 8;
|
| 427 |
+
|
| 428 |
+
return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
|
| 432 |
+
{
|
| 433 |
+
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
/* Signed 8-bit normalized integer formats */
|
| 437 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
|
| 438 |
+
{
|
| 439 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
|
| 443 |
+
{
|
| 444 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
|
| 445 |
+
}
|
| 446 |
+
|
| 447 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
|
| 448 |
+
{
|
| 449 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
/* Unsigned 8-bit normalized integer formats */
|
| 453 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
|
| 454 |
+
{
|
| 455 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
|
| 459 |
+
{
|
| 460 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
|
| 464 |
+
{
|
| 465 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
/* Signed 16-bit normalized integer formats */
|
| 469 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
|
| 470 |
+
{
|
| 471 |
+
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
|
| 472 |
+
}
|
| 473 |
+
|
| 474 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
|
| 475 |
+
{
|
| 476 |
+
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
|
| 480 |
+
{
|
| 481 |
+
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
/* Unsigned 16-bit normalized integer formats */
|
| 485 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
|
| 486 |
+
{
|
| 487 |
+
return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
|
| 488 |
+
}
|
| 489 |
+
|
| 490 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
|
| 491 |
+
{
|
| 492 |
+
return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
|
| 493 |
+
}
|
| 494 |
+
|
| 495 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
|
| 496 |
+
{
|
| 497 |
+
return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
|
| 498 |
+
}
|
| 499 |
+
|
| 500 |
+
/* NV12 format */
|
| 501 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
|
| 502 |
+
{
|
| 503 |
+
return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
/* BC1 format */
|
| 507 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
|
| 508 |
+
{
|
| 509 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
|
| 510 |
+
}
|
| 511 |
+
|
| 512 |
+
/* BC1sRGB format */
|
| 513 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
|
| 514 |
+
{
|
| 515 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
|
| 516 |
+
}
|
| 517 |
+
|
| 518 |
+
/* BC2 format */
|
| 519 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
|
| 520 |
+
{
|
| 521 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
/* BC2sRGB format */
|
| 525 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
|
| 526 |
+
{
|
| 527 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
/* BC3 format */
|
| 531 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
|
| 532 |
+
{
|
| 533 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
/* BC3sRGB format */
|
| 537 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
|
| 538 |
+
{
|
| 539 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
/* BC4 unsigned format */
|
| 543 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
|
| 544 |
+
{
|
| 545 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
|
| 546 |
+
}
|
| 547 |
+
|
| 548 |
+
/* BC4 signed format */
|
| 549 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
|
| 550 |
+
{
|
| 551 |
+
return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
/* BC5 unsigned format */
|
| 555 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
|
| 556 |
+
{
|
| 557 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
|
| 558 |
+
}
|
| 559 |
+
|
| 560 |
+
/* BC5 signed format */
|
| 561 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
|
| 562 |
+
{
|
| 563 |
+
return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
/* BC6H unsigned format */
|
| 567 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
|
| 568 |
+
{
|
| 569 |
+
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
|
| 570 |
+
}
|
| 571 |
+
|
| 572 |
+
/* BC6H signed format */
|
| 573 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
|
| 574 |
+
{
|
| 575 |
+
return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
|
| 576 |
+
}
|
| 577 |
+
|
| 578 |
+
/* BC7 format */
|
| 579 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
|
| 580 |
+
{
|
| 581 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
|
| 582 |
+
}
|
| 583 |
+
|
| 584 |
+
/* BC7sRGB format */
|
| 585 |
+
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
|
| 586 |
+
{
|
| 587 |
+
return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
|
| 588 |
+
}
|
| 589 |
+
|
| 590 |
+
#endif /* __cplusplus */
|
| 591 |
+
|
| 592 |
+
/** @} */
|
| 593 |
+
/** @} */ /* END CUDART_TEXTURE_HL */
|
| 594 |
+
|
| 595 |
+
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h
ADDED
|
@@ -0,0 +1,1828 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _COOPERATIVE_GROUPS_H_
|
| 51 |
+
#define _COOPERATIVE_GROUPS_H_
|
| 52 |
+
|
| 53 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 54 |
+
|
| 55 |
+
#include "cooperative_groups/details/info.h"
|
| 56 |
+
#include "cooperative_groups/details/driver_abi.h"
|
| 57 |
+
#include "cooperative_groups/details/helpers.h"
|
| 58 |
+
|
| 59 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 60 |
+
#include <cuda/atomic>
|
| 61 |
+
#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
|
| 62 |
+
#else
|
| 63 |
+
#define _CG_THREAD_SCOPE(scope)
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
_CG_BEGIN_NAMESPACE
|
| 67 |
+
|
| 68 |
+
namespace details {
|
| 69 |
+
_CG_CONST_DECL unsigned int coalesced_group_id = 1;
|
| 70 |
+
_CG_CONST_DECL unsigned int multi_grid_group_id = 2;
|
| 71 |
+
_CG_CONST_DECL unsigned int grid_group_id = 3;
|
| 72 |
+
_CG_CONST_DECL unsigned int thread_block_id = 4;
|
| 73 |
+
_CG_CONST_DECL unsigned int multi_tile_group_id = 5;
|
| 74 |
+
_CG_CONST_DECL unsigned int cluster_group_id = 6;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/**
|
| 78 |
+
* class thread_group;
|
| 79 |
+
*
|
| 80 |
+
* Generic thread group type, into which all groups are convertible.
|
| 81 |
+
* It acts as a container for all storage necessary for the derived groups,
|
| 82 |
+
* and will dispatch the API calls to the correct derived group. This means
|
| 83 |
+
* that all derived groups must implement the same interface as thread_group.
|
| 84 |
+
*/
|
| 85 |
+
class thread_group
|
| 86 |
+
{
|
| 87 |
+
protected:
|
| 88 |
+
struct group_data {
|
| 89 |
+
unsigned int _unused : 1;
|
| 90 |
+
unsigned int type : 7, : 0;
|
| 91 |
+
};
|
| 92 |
+
|
| 93 |
+
struct gg_data {
|
| 94 |
+
details::grid_workspace *gridWs;
|
| 95 |
+
};
|
| 96 |
+
|
| 97 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 98 |
+
struct mg_data {
|
| 99 |
+
unsigned long long _unused : 1;
|
| 100 |
+
unsigned long long type : 7;
|
| 101 |
+
unsigned long long handle : 56;
|
| 102 |
+
const details::multi_grid::multi_grid_functions *functions;
|
| 103 |
+
};
|
| 104 |
+
#endif
|
| 105 |
+
|
| 106 |
+
struct tg_data {
|
| 107 |
+
unsigned int is_tiled : 1;
|
| 108 |
+
unsigned int type : 7;
|
| 109 |
+
unsigned int size : 24;
|
| 110 |
+
// packed to 4b
|
| 111 |
+
unsigned int metaGroupSize : 16;
|
| 112 |
+
unsigned int metaGroupRank : 16;
|
| 113 |
+
// packed to 8b
|
| 114 |
+
unsigned int mask;
|
| 115 |
+
// packed to 12b
|
| 116 |
+
unsigned int _res;
|
| 117 |
+
};
|
| 118 |
+
|
| 119 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 120 |
+
friend class thread_block;
|
| 121 |
+
|
| 122 |
+
union __align__(8) {
|
| 123 |
+
group_data group;
|
| 124 |
+
tg_data coalesced;
|
| 125 |
+
gg_data grid;
|
| 126 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 127 |
+
mg_data multi_grid;
|
| 128 |
+
#endif
|
| 129 |
+
} _data;
|
| 130 |
+
|
| 131 |
+
_CG_QUALIFIER thread_group operator=(const thread_group& src);
|
| 132 |
+
|
| 133 |
+
_CG_QUALIFIER thread_group(unsigned int type) {
|
| 134 |
+
_data.group.type = type;
|
| 135 |
+
_data.group._unused = false;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
#ifdef _CG_CPP11_FEATURES
|
| 139 |
+
static_assert(sizeof(tg_data) <= 16, "Failed size check");
|
| 140 |
+
static_assert(sizeof(gg_data) <= 16, "Failed size check");
|
| 141 |
+
# ifdef _CG_ABI_EXPERIMENTAL
|
| 142 |
+
static_assert(sizeof(mg_data) <= 16, "Failed size check");
|
| 143 |
+
# endif
|
| 144 |
+
#endif
|
| 145 |
+
|
| 146 |
+
public:
|
| 147 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 148 |
+
|
| 149 |
+
_CG_QUALIFIER unsigned long long size() const;
|
| 150 |
+
_CG_QUALIFIER unsigned long long num_threads() const;
|
| 151 |
+
_CG_QUALIFIER unsigned long long thread_rank() const;
|
| 152 |
+
_CG_QUALIFIER void sync() const;
|
| 153 |
+
_CG_QUALIFIER unsigned int get_type() const {
|
| 154 |
+
return _data.group.type;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
};
|
| 158 |
+
|
| 159 |
+
template <unsigned int TyId>
|
| 160 |
+
struct thread_group_base : public thread_group {
|
| 161 |
+
_CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
|
| 162 |
+
_CG_STATIC_CONST_DECL unsigned int id = TyId;
|
| 163 |
+
};
|
| 164 |
+
|
| 165 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)
|
| 166 |
+
|
| 167 |
+
/**
|
| 168 |
+
* class multi_grid_group;
|
| 169 |
+
*
|
| 170 |
+
* Threads within this this group are guaranteed to be co-resident on the
|
| 171 |
+
* same system, on multiple devices within the same launched kernels.
|
| 172 |
+
* To use this group, the kernel must have been launched with
|
| 173 |
+
* cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
|
| 174 |
+
* and the device must support it (queryable device attribute).
|
| 175 |
+
*
|
| 176 |
+
* Constructed via this_multi_grid();
|
| 177 |
+
*/
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 181 |
+
class multi_grid_group;
|
| 182 |
+
|
| 183 |
+
// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
|
| 184 |
+
template <typename = void>
|
| 185 |
+
__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
|
| 186 |
+
|
| 187 |
+
class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
|
| 188 |
+
{
|
| 189 |
+
private:
|
| 190 |
+
template <typename = void>
|
| 191 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 192 |
+
_data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
|
| 193 |
+
_data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
friend multi_grid_group this_multi_grid<void>();
|
| 197 |
+
|
| 198 |
+
public:
|
| 199 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 200 |
+
|
| 201 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 202 |
+
return (_data.multi_grid.handle != 0);
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
_CG_QUALIFIER void sync() const {
|
| 206 |
+
if (!is_valid()) {
|
| 207 |
+
_CG_ABORT();
|
| 208 |
+
}
|
| 209 |
+
_data.multi_grid.functions->sync(_data.multi_grid.handle);
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
_CG_QUALIFIER unsigned long long num_threads() const {
|
| 213 |
+
_CG_ASSERT(is_valid());
|
| 214 |
+
return _data.multi_grid.functions->size(_data.multi_grid.handle);
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
_CG_QUALIFIER unsigned long long size() const {
|
| 218 |
+
return num_threads();
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
_CG_QUALIFIER unsigned long long thread_rank() const {
|
| 222 |
+
_CG_ASSERT(is_valid());
|
| 223 |
+
return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
_CG_QUALIFIER unsigned int grid_rank() const {
|
| 227 |
+
_CG_ASSERT(is_valid());
|
| 228 |
+
return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
_CG_QUALIFIER unsigned int num_grids() const {
|
| 232 |
+
_CG_ASSERT(is_valid());
|
| 233 |
+
return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
|
| 234 |
+
}
|
| 235 |
+
};
|
| 236 |
+
# else
|
| 237 |
+
class multi_grid_group
|
| 238 |
+
{
|
| 239 |
+
private:
|
| 240 |
+
unsigned long long _handle;
|
| 241 |
+
unsigned int _size;
|
| 242 |
+
unsigned int _rank;
|
| 243 |
+
|
| 244 |
+
friend _CG_QUALIFIER multi_grid_group this_multi_grid();
|
| 245 |
+
|
| 246 |
+
_CG_QUALIFIER multi_grid_group() {
|
| 247 |
+
_handle = details::multi_grid::get_intrinsic_handle();
|
| 248 |
+
_size = details::multi_grid::size(_handle);
|
| 249 |
+
_rank = details::multi_grid::thread_rank(_handle);
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
public:
|
| 253 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
|
| 254 |
+
|
| 255 |
+
_CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
|
| 256 |
+
return (_handle != 0);
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
_CG_QUALIFIER _CG_DEPRECATED void sync() const {
|
| 260 |
+
if (!is_valid()) {
|
| 261 |
+
_CG_ABORT();
|
| 262 |
+
}
|
| 263 |
+
details::multi_grid::sync(_handle);
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
|
| 267 |
+
_CG_ASSERT(is_valid());
|
| 268 |
+
return _size;
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
|
| 272 |
+
return num_threads();
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
|
| 276 |
+
_CG_ASSERT(is_valid());
|
| 277 |
+
return _rank;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
|
| 281 |
+
_CG_ASSERT(is_valid());
|
| 282 |
+
return (details::multi_grid::grid_rank(_handle));
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
+
_CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
|
| 286 |
+
_CG_ASSERT(is_valid());
|
| 287 |
+
return (details::multi_grid::num_grids(_handle));
|
| 288 |
+
}
|
| 289 |
+
};
|
| 290 |
+
# endif
|
| 291 |
+
|
| 292 |
+
/**
|
| 293 |
+
* multi_grid_group this_multi_grid()
|
| 294 |
+
*
|
| 295 |
+
* Constructs a multi_grid_group
|
| 296 |
+
*/
|
| 297 |
+
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 298 |
+
template <typename>
|
| 299 |
+
__device__
|
| 300 |
+
#else
|
| 301 |
+
_CG_QUALIFIER
|
| 302 |
+
# endif
|
| 303 |
+
_CG_DEPRECATED
|
| 304 |
+
multi_grid_group this_multi_grid()
|
| 305 |
+
{
|
| 306 |
+
return multi_grid_group();
|
| 307 |
+
}
|
| 308 |
+
#endif
|
| 309 |
+
|
| 310 |
+
/**
|
| 311 |
+
* class grid_group;
|
| 312 |
+
*
|
| 313 |
+
* Threads within this this group are guaranteed to be co-resident on the
|
| 314 |
+
* same device within the same launched kernel. To use this group, the kernel
|
| 315 |
+
* must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
|
| 316 |
+
* and the device must support it (queryable device attribute).
|
| 317 |
+
*
|
| 318 |
+
* Constructed via this_grid();
|
| 319 |
+
*/
|
| 320 |
+
class grid_group : public thread_group_base<details::grid_group_id>
|
| 321 |
+
{
|
| 322 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
|
| 323 |
+
friend _CG_QUALIFIER grid_group this_grid();
|
| 324 |
+
|
| 325 |
+
private:
|
| 326 |
+
_CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
|
| 327 |
+
_data.grid.gridWs = gridWs;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
public:
|
| 331 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
|
| 332 |
+
|
| 333 |
+
_CG_QUALIFIER bool is_valid() const {
|
| 334 |
+
return (_data.grid.gridWs != NULL);
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
_CG_QUALIFIER void sync() const {
|
| 338 |
+
if (!is_valid()) {
|
| 339 |
+
_CG_ABORT();
|
| 340 |
+
}
|
| 341 |
+
details::grid::sync(&_data.grid.gridWs->barrier);
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
_CG_STATIC_QUALIFIER unsigned long long size() {
|
| 345 |
+
return details::grid::size();
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
_CG_STATIC_QUALIFIER unsigned long long thread_rank() {
|
| 349 |
+
return details::grid::thread_rank();
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 353 |
+
return details::grid::grid_dim();
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
_CG_STATIC_QUALIFIER unsigned long long num_threads() {
|
| 357 |
+
return details::grid::num_threads();
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks() {
|
| 361 |
+
return details::grid::dim_blocks();
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
_CG_STATIC_QUALIFIER unsigned long long num_blocks() {
|
| 365 |
+
return details::grid::num_blocks();
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
_CG_STATIC_QUALIFIER dim3 block_index() {
|
| 369 |
+
return details::grid::block_index();
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
_CG_STATIC_QUALIFIER unsigned long long block_rank() {
|
| 373 |
+
return details::grid::block_rank();
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
# if defined(_CG_HAS_CLUSTER_GROUP)
|
| 377 |
+
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
|
| 378 |
+
return details::grid::dim_clusters();
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
|
| 382 |
+
return details::grid::num_clusters();
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
_CG_STATIC_QUALIFIER dim3 cluster_index() {
|
| 386 |
+
return details::grid::cluster_index();
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
|
| 390 |
+
return details::grid::cluster_rank();
|
| 391 |
+
}
|
| 392 |
+
# endif
|
| 393 |
+
};
|
| 394 |
+
|
| 395 |
+
_CG_QUALIFIER grid_group this_grid() {
|
| 396 |
+
// Load a workspace from the driver
|
| 397 |
+
grid_group gg(details::get_grid_workspace());
|
| 398 |
+
#ifdef _CG_DEBUG
|
| 399 |
+
// *all* threads must be available to synchronize
|
| 400 |
+
gg.sync();
|
| 401 |
+
#endif // _CG_DEBUG
|
| 402 |
+
return gg;
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 406 |
+
/**
|
| 407 |
+
* class cluster_group
|
| 408 |
+
*
|
| 409 |
+
* Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
|
| 410 |
+
* divided along all dimensions to form groups of blocks, each group of which is
|
| 411 |
+
* a block cluster. Clustered grids are subject to various restrictions and
|
| 412 |
+
* limitations. Primarily, a cluster consists of at most 8 blocks by default
|
| 413 |
+
* (although the user is allowed to opt-in to non-standard sizes,) and clustered
|
| 414 |
+
* grids are subject to additional occupancy limitations due to per-cluster
|
| 415 |
+
* hardware resource consumption. In exchange, a block cluster is guaranteed to
|
| 416 |
+
* be a cooperative group, with access to all cooperative group capabilities, as
|
| 417 |
+
* well as cluster specific capabilities and accelerations. A cluster_group
|
| 418 |
+
* represents a block cluster.
|
| 419 |
+
*
|
| 420 |
+
* Constructed via this_cluster_group();
|
| 421 |
+
*/
|
| 422 |
+
class cluster_group : public thread_group_base<details::cluster_group_id>
|
| 423 |
+
{
|
| 424 |
+
// Friends
|
| 425 |
+
friend _CG_QUALIFIER cluster_group this_cluster();
|
| 426 |
+
|
| 427 |
+
// Disable constructor
|
| 428 |
+
_CG_QUALIFIER cluster_group()
|
| 429 |
+
{
|
| 430 |
+
}
|
| 431 |
+
|
| 432 |
+
public:
|
| 433 |
+
//_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
|
| 434 |
+
|
| 435 |
+
// Functionality exposed by the group
|
| 436 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 437 |
+
{
|
| 438 |
+
return details::cluster::sync();
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
_CG_STATIC_QUALIFIER void barrier_arrive()
|
| 442 |
+
{
|
| 443 |
+
return details::cluster::barrier_arrive();
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
_CG_STATIC_QUALIFIER void barrier_wait()
|
| 447 |
+
{
|
| 448 |
+
return details::cluster::barrier_wait();
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
|
| 452 |
+
{
|
| 453 |
+
return details::cluster::query_shared_rank(addr);
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
template <typename T>
|
| 457 |
+
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
|
| 458 |
+
{
|
| 459 |
+
return details::cluster::map_shared_rank(addr, rank);
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 463 |
+
{
|
| 464 |
+
return details::cluster::block_index();
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
_CG_STATIC_QUALIFIER unsigned int block_rank()
|
| 468 |
+
{
|
| 469 |
+
return details::cluster::block_rank();
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 473 |
+
{
|
| 474 |
+
return details::cluster::thread_rank();
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 478 |
+
{
|
| 479 |
+
return details::cluster::dim_blocks();
|
| 480 |
+
}
|
| 481 |
+
|
| 482 |
+
_CG_STATIC_QUALIFIER unsigned int num_blocks()
|
| 483 |
+
{
|
| 484 |
+
return details::cluster::num_blocks();
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 488 |
+
{
|
| 489 |
+
return details::cluster::dim_threads();
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 493 |
+
{
|
| 494 |
+
return details::cluster::num_threads();
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
// Legacy aliases
|
| 498 |
+
_CG_STATIC_QUALIFIER unsigned int size()
|
| 499 |
+
{
|
| 500 |
+
return num_threads();
|
| 501 |
+
}
|
| 502 |
+
};
|
| 503 |
+
|
| 504 |
+
/*
|
| 505 |
+
* cluster_group this_cluster()
|
| 506 |
+
*
|
| 507 |
+
* Constructs a cluster_group
|
| 508 |
+
*/
|
| 509 |
+
_CG_QUALIFIER cluster_group this_cluster()
|
| 510 |
+
{
|
| 511 |
+
cluster_group cg;
|
| 512 |
+
#ifdef _CG_DEBUG
|
| 513 |
+
cg.sync();
|
| 514 |
+
#endif
|
| 515 |
+
return cg;
|
| 516 |
+
}
|
| 517 |
+
#endif
|
| 518 |
+
|
| 519 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 520 |
+
namespace details {
|
| 521 |
+
|
| 522 |
+
_CG_CONSTEXPR_QUALIFIER unsigned int scratch_sync_memory_size(unsigned int max_block_size) {
|
| 523 |
+
// One barrier per possible size of the group rounded up to multiple of 4.
|
| 524 |
+
return 8 * sizeof(details::barrier_t);
|
| 525 |
+
}
|
| 526 |
+
|
| 527 |
+
_CG_CONSTEXPR_QUALIFIER unsigned int scratch_collectives_memory_size(unsigned int communication_size, unsigned int max_block_size) {
|
| 528 |
+
// One slot of collectives memory per warp.
|
| 529 |
+
return max_block_size / 32 * communication_size;
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
_CG_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int communication_size, unsigned int max_block_size) {
|
| 533 |
+
return scratch_sync_memory_size(max_block_size) + scratch_collectives_memory_size(communication_size, max_block_size);
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
_CG_CONSTEXPR_QUALIFIER size_t scratch_alignment(unsigned int communication_size) {
|
| 537 |
+
return ((communication_size & (communication_size - 1) == 0) && communication_size > 8) ?
|
| 538 |
+
communication_size : 8;
|
| 539 |
+
}
|
| 540 |
+
|
| 541 |
+
_CG_CONST_DECL unsigned int default_tile_communication_size = 8;
|
| 542 |
+
_CG_CONST_DECL unsigned int default_max_block_size = 1024;
|
| 543 |
+
|
| 544 |
+
struct multi_warp_scratch {
|
| 545 |
+
char memory[1];
|
| 546 |
+
};
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
class thread_block;
|
| 550 |
+
namespace experimental {
|
| 551 |
+
template <unsigned int TileCommunicationSize = details::default_tile_communication_size,
|
| 552 |
+
unsigned int MaxBlockSize = details::default_max_block_size>
|
| 553 |
+
struct __align__(details::scratch_alignment(TileCommunicationSize)) block_tile_memory {
|
| 554 |
+
private:
|
| 555 |
+
char scratch[details::scratch_size_needed(TileCommunicationSize, MaxBlockSize)];
|
| 556 |
+
|
| 557 |
+
public:
|
| 558 |
+
_CG_QUALIFIER void* get_memory() {
|
| 559 |
+
return static_cast<void*>(scratch);
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
+
_CG_STATIC_QUALIFIER unsigned int get_size() {
|
| 563 |
+
return details::scratch_size_needed(TileCommunicationSize, MaxBlockSize);
|
| 564 |
+
}
|
| 565 |
+
};
|
| 566 |
+
|
| 567 |
+
template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
|
| 568 |
+
_CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
|
| 569 |
+
}
|
| 570 |
+
#endif
|
| 571 |
+
|
| 572 |
+
/**
|
| 573 |
+
* class thread_block
|
| 574 |
+
*
|
| 575 |
+
* Every GPU kernel is executed by a grid of thread blocks, and threads within
|
| 576 |
+
* each block are guaranteed to reside on the same streaming multiprocessor.
|
| 577 |
+
* A thread_block represents a thread block whose dimensions are not known until runtime.
|
| 578 |
+
*
|
| 579 |
+
* Constructed via this_thread_block();
|
| 580 |
+
*/
|
| 581 |
+
class thread_block : public thread_group_base<details::thread_block_id>
|
| 582 |
+
{
|
| 583 |
+
// Friends
|
| 584 |
+
friend _CG_QUALIFIER thread_block this_thread_block();
|
| 585 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 586 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
|
| 587 |
+
|
| 588 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 589 |
+
template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
|
| 590 |
+
friend _CG_QUALIFIER thread_block experimental::this_thread_block(
|
| 591 |
+
experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
|
| 592 |
+
|
| 593 |
+
const unsigned short communication_size;
|
| 594 |
+
const unsigned short max_block_size;
|
| 595 |
+
details::multi_warp_scratch* const tile_memory;
|
| 596 |
+
|
| 597 |
+
template <unsigned int Size>
|
| 598 |
+
friend class __static_size_multi_warp_tile_base;
|
| 599 |
+
|
| 600 |
+
template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
|
| 601 |
+
_CG_QUALIFIER thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) :
|
| 602 |
+
tile_memory(reinterpret_cast<details::multi_warp_scratch*>(&scratch)),
|
| 603 |
+
communication_size(TileCommunicationSize), max_block_size(MaxBlockSize) {
|
| 604 |
+
if (thread_rank() < details::scratch_sync_memory_size(MaxBlockSize) / sizeof(details::barrier_t)) {
|
| 605 |
+
details::barrier_t* barriers = reinterpret_cast<details::barrier_t*>(&tile_memory->memory);
|
| 606 |
+
barriers[thread_rank()] = 0;
|
| 607 |
+
}
|
| 608 |
+
sync();
|
| 609 |
+
}
|
| 610 |
+
#endif
|
| 611 |
+
|
| 612 |
+
// Disable constructor
|
| 613 |
+
_CG_QUALIFIER thread_block()
|
| 614 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 615 |
+
: tile_memory(NULL), communication_size(0), max_block_size(0)
|
| 616 |
+
#endif
|
| 617 |
+
{ }
|
| 618 |
+
|
| 619 |
+
// Internal Use
|
| 620 |
+
_CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
|
| 621 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 622 |
+
|
| 623 |
+
// Invalid, immediately fail
|
| 624 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 625 |
+
details::abort();
|
| 626 |
+
return (thread_block());
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
unsigned int mask;
|
| 630 |
+
unsigned int base_offset = thread_rank() & (~(tilesz - 1));
|
| 631 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 632 |
+
|
| 633 |
+
mask = (unsigned int)(-1) >> (32 - masklength);
|
| 634 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 635 |
+
thread_group tile = thread_group(details::coalesced_group_id);
|
| 636 |
+
tile._data.coalesced.mask = mask;
|
| 637 |
+
tile._data.coalesced.size = __popc(mask);
|
| 638 |
+
tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
|
| 639 |
+
tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
|
| 640 |
+
tile._data.coalesced.is_tiled = true;
|
| 641 |
+
return (tile);
|
| 642 |
+
}
|
| 643 |
+
|
| 644 |
+
public:
|
| 645 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
|
| 646 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 647 |
+
|
| 648 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 649 |
+
details::cta::sync();
|
| 650 |
+
}
|
| 651 |
+
|
| 652 |
+
_CG_STATIC_QUALIFIER unsigned int size() {
|
| 653 |
+
return details::cta::size();
|
| 654 |
+
}
|
| 655 |
+
|
| 656 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 657 |
+
return details::cta::thread_rank();
|
| 658 |
+
}
|
| 659 |
+
|
| 660 |
+
// Additional functionality exposed by the group
|
| 661 |
+
_CG_STATIC_QUALIFIER dim3 group_index() {
|
| 662 |
+
return details::cta::group_index();
|
| 663 |
+
}
|
| 664 |
+
|
| 665 |
+
_CG_STATIC_QUALIFIER dim3 thread_index() {
|
| 666 |
+
return details::cta::thread_index();
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
_CG_STATIC_QUALIFIER dim3 group_dim() {
|
| 670 |
+
return details::cta::block_dim();
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads() {
|
| 674 |
+
return details::cta::dim_threads();
|
| 675 |
+
}
|
| 676 |
+
|
| 677 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads() {
|
| 678 |
+
return details::cta::num_threads();
|
| 679 |
+
}
|
| 680 |
+
|
| 681 |
+
};
|
| 682 |
+
|
| 683 |
+
/**
|
| 684 |
+
* thread_block this_thread_block()
|
| 685 |
+
*
|
| 686 |
+
* Constructs a thread_block group
|
| 687 |
+
*/
|
| 688 |
+
_CG_QUALIFIER thread_block this_thread_block()
|
| 689 |
+
{
|
| 690 |
+
return (thread_block());
|
| 691 |
+
}
|
| 692 |
+
|
| 693 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 694 |
+
namespace experimental {
|
| 695 |
+
template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
|
| 696 |
+
_CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) {
|
| 697 |
+
return (thread_block(scratch));
|
| 698 |
+
}
|
| 699 |
+
}
|
| 700 |
+
#endif
|
| 701 |
+
|
| 702 |
+
/**
|
| 703 |
+
* class coalesced_group
|
| 704 |
+
*
|
| 705 |
+
* A group representing the current set of converged threads in a warp.
|
| 706 |
+
* The size of the group is not guaranteed and it may return a group of
|
| 707 |
+
* only one thread (itself).
|
| 708 |
+
*
|
| 709 |
+
* This group exposes warp-synchronous builtins.
|
| 710 |
+
* Constructed via coalesced_threads();
|
| 711 |
+
*/
|
| 712 |
+
class coalesced_group : public thread_group_base<details::coalesced_group_id>
|
| 713 |
+
{
|
| 714 |
+
private:
|
| 715 |
+
friend _CG_QUALIFIER coalesced_group coalesced_threads();
|
| 716 |
+
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
|
| 717 |
+
friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
|
| 718 |
+
friend class details::_coalesced_group_data_access;
|
| 719 |
+
|
| 720 |
+
_CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
|
| 721 |
+
unsigned int member_pack = 0;
|
| 722 |
+
unsigned int member_rank = 0;
|
| 723 |
+
for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 724 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 725 |
+
if (lane_bit) {
|
| 726 |
+
if (laneMask & lane_bit)
|
| 727 |
+
member_pack |= 1 << member_rank;
|
| 728 |
+
member_rank++;
|
| 729 |
+
}
|
| 730 |
+
}
|
| 731 |
+
return (member_pack);
|
| 732 |
+
}
|
| 733 |
+
|
| 734 |
+
// Internal Use
|
| 735 |
+
_CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
|
| 736 |
+
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
|
| 737 |
+
|
| 738 |
+
// Invalid, immediately fail
|
| 739 |
+
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
|
| 740 |
+
details::abort();
|
| 741 |
+
return (coalesced_group(0));
|
| 742 |
+
}
|
| 743 |
+
if (size() <= tilesz) {
|
| 744 |
+
return (*this);
|
| 745 |
+
}
|
| 746 |
+
|
| 747 |
+
if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
|
| 748 |
+
unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
|
| 749 |
+
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
|
| 750 |
+
unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
|
| 751 |
+
|
| 752 |
+
mask <<= (details::laneid() & ~(tilesz - 1));
|
| 753 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 754 |
+
coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
|
| 755 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 756 |
+
coalesced_tile._data.coalesced.is_tiled = true;
|
| 757 |
+
return (coalesced_tile);
|
| 758 |
+
}
|
| 759 |
+
else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
|
| 760 |
+
unsigned int mask = 0;
|
| 761 |
+
unsigned int member_rank = 0;
|
| 762 |
+
int seen_lanes = (thread_rank() / tilesz) * tilesz;
|
| 763 |
+
for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
|
| 764 |
+
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
|
| 765 |
+
if (lane_bit) {
|
| 766 |
+
if (seen_lanes <= 0 && member_rank < tilesz) {
|
| 767 |
+
mask |= lane_bit;
|
| 768 |
+
member_rank++;
|
| 769 |
+
}
|
| 770 |
+
seen_lanes--;
|
| 771 |
+
}
|
| 772 |
+
}
|
| 773 |
+
coalesced_group coalesced_tile = coalesced_group(mask);
|
| 774 |
+
// Override parent with the size of this group
|
| 775 |
+
coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
|
| 776 |
+
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
|
| 777 |
+
return coalesced_tile;
|
| 778 |
+
}
|
| 779 |
+
else {
|
| 780 |
+
// None in _CG_VERSION 1000
|
| 781 |
+
details::abort();
|
| 782 |
+
}
|
| 783 |
+
|
| 784 |
+
return (coalesced_group(0));
|
| 785 |
+
}
|
| 786 |
+
|
| 787 |
+
protected:
|
| 788 |
+
_CG_QUALIFIER coalesced_group(unsigned int mask) {
|
| 789 |
+
_data.coalesced.mask = mask;
|
| 790 |
+
_data.coalesced.size = __popc(mask);
|
| 791 |
+
_data.coalesced.metaGroupRank = 0;
|
| 792 |
+
_data.coalesced.metaGroupSize = 1;
|
| 793 |
+
_data.coalesced.is_tiled = false;
|
| 794 |
+
}
|
| 795 |
+
|
| 796 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 797 |
+
return (_data.coalesced.mask);
|
| 798 |
+
}
|
| 799 |
+
|
| 800 |
+
public:
|
| 801 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 802 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 803 |
+
|
| 804 |
+
_CG_QUALIFIER unsigned int num_threads() const {
|
| 805 |
+
return _data.coalesced.size;
|
| 806 |
+
}
|
| 807 |
+
|
| 808 |
+
_CG_QUALIFIER unsigned int size() const {
|
| 809 |
+
return num_threads();
|
| 810 |
+
}
|
| 811 |
+
|
| 812 |
+
_CG_QUALIFIER unsigned int thread_rank() const {
|
| 813 |
+
return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
|
| 814 |
+
}
|
| 815 |
+
|
| 816 |
+
// Rank of this group in the upper level of the hierarchy
|
| 817 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 818 |
+
return _data.coalesced.metaGroupRank;
|
| 819 |
+
}
|
| 820 |
+
|
| 821 |
+
// Total num partitions created out of all CTAs when the group was created
|
| 822 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 823 |
+
return _data.coalesced.metaGroupSize;
|
| 824 |
+
}
|
| 825 |
+
|
| 826 |
+
_CG_QUALIFIER void sync() const {
|
| 827 |
+
__syncwarp(_data.coalesced.mask);
|
| 828 |
+
}
|
| 829 |
+
|
| 830 |
+
#ifdef _CG_CPP11_FEATURES
|
| 831 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 832 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 833 |
+
unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 834 |
+
(size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
|
| 835 |
+
|
| 836 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 837 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 838 |
+
}
|
| 839 |
+
|
| 840 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 841 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 842 |
+
if (size() == 32) {
|
| 843 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 844 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 845 |
+
}
|
| 846 |
+
|
| 847 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 848 |
+
|
| 849 |
+
if (lane >= 32)
|
| 850 |
+
lane = details::laneid();
|
| 851 |
+
|
| 852 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 853 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 854 |
+
}
|
| 855 |
+
|
| 856 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 857 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
|
| 858 |
+
if (size() == 32) {
|
| 859 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 860 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
|
| 861 |
+
}
|
| 862 |
+
|
| 863 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 864 |
+
if (lane >= 32)
|
| 865 |
+
lane = details::laneid();
|
| 866 |
+
|
| 867 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 868 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
|
| 869 |
+
}
|
| 870 |
+
#else
|
| 871 |
+
template <typename TyIntegral>
|
| 872 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
|
| 873 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 874 |
+
unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
|
| 875 |
+
(size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
|
| 876 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 877 |
+
}
|
| 878 |
+
|
| 879 |
+
template <typename TyIntegral>
|
| 880 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
|
| 881 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 882 |
+
if (size() == 32) {
|
| 883 |
+
return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
|
| 884 |
+
}
|
| 885 |
+
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
|
| 886 |
+
if (lane >= 32) lane = details::laneid();
|
| 887 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 888 |
+
}
|
| 889 |
+
|
| 890 |
+
template <typename TyIntegral>
|
| 891 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
|
| 892 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 893 |
+
if (size() == 32) {
|
| 894 |
+
return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
|
| 895 |
+
}
|
| 896 |
+
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
|
| 897 |
+
if (lane >= 32) lane = details::laneid();
|
| 898 |
+
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
|
| 899 |
+
}
|
| 900 |
+
#endif
|
| 901 |
+
|
| 902 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 903 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
|
| 904 |
+
}
|
| 905 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 906 |
+
return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
|
| 907 |
+
}
|
| 908 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 909 |
+
if (size() == 32) {
|
| 910 |
+
return (__ballot_sync(0xFFFFFFFF, predicate));
|
| 911 |
+
}
|
| 912 |
+
unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
|
| 913 |
+
return (_packLanes(lane_ballot));
|
| 914 |
+
}
|
| 915 |
+
|
| 916 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 917 |
+
|
| 918 |
+
template <typename TyIntegral>
|
| 919 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 920 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 921 |
+
if (size() == 32) {
|
| 922 |
+
return (__match_any_sync(0xFFFFFFFF, val));
|
| 923 |
+
}
|
| 924 |
+
unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
|
| 925 |
+
return (_packLanes(lane_match));
|
| 926 |
+
}
|
| 927 |
+
|
| 928 |
+
template <typename TyIntegral>
|
| 929 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 930 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 931 |
+
if (size() == 32) {
|
| 932 |
+
return (__match_all_sync(0xFFFFFFFF, val, &pred));
|
| 933 |
+
}
|
| 934 |
+
unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
|
| 935 |
+
return (_packLanes(lane_match));
|
| 936 |
+
}
|
| 937 |
+
|
| 938 |
+
#endif /* !_CG_HAS_MATCH_COLLECTIVE */
|
| 939 |
+
|
| 940 |
+
};
|
| 941 |
+
|
| 942 |
+
_CG_QUALIFIER coalesced_group coalesced_threads()
|
| 943 |
+
{
|
| 944 |
+
return (coalesced_group(__activemask()));
|
| 945 |
+
}
|
| 946 |
+
|
| 947 |
+
namespace details {
|
| 948 |
+
template <unsigned int Size> struct verify_thread_block_tile_size;
|
| 949 |
+
template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
|
| 950 |
+
template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
|
| 951 |
+
template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
|
| 952 |
+
template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
|
| 953 |
+
template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
|
| 954 |
+
template <> struct verify_thread_block_tile_size<1> { typedef void OK; };
|
| 955 |
+
|
| 956 |
+
#ifdef _CG_CPP11_FEATURES
|
| 957 |
+
template <unsigned int Size>
|
| 958 |
+
using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
|
| 959 |
+
|
| 960 |
+
template <unsigned int Size>
|
| 961 |
+
using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
|
| 962 |
+
template <unsigned int Size>
|
| 963 |
+
using _is_multi_warp =
|
| 964 |
+
_CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
|
| 965 |
+
|
| 966 |
+
template <unsigned int Size>
|
| 967 |
+
using _is_valid_single_warp_tile =
|
| 968 |
+
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
|
| 969 |
+
template <unsigned int Size>
|
| 970 |
+
using _is_valid_multi_warp_tile =
|
| 971 |
+
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
|
| 972 |
+
#else
|
| 973 |
+
template <unsigned int Size>
|
| 974 |
+
struct _is_multi_warp {
|
| 975 |
+
static const bool value = false;
|
| 976 |
+
};
|
| 977 |
+
#endif
|
| 978 |
+
}
|
| 979 |
+
|
| 980 |
+
template <unsigned int Size>
|
| 981 |
+
class __static_size_tile_base
|
| 982 |
+
{
|
| 983 |
+
protected:
|
| 984 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 985 |
+
|
| 986 |
+
public:
|
| 987 |
+
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
|
| 988 |
+
|
| 989 |
+
// Rank of thread within tile
|
| 990 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
|
| 991 |
+
return (details::cta::thread_rank() & (numThreads - 1));
|
| 992 |
+
}
|
| 993 |
+
|
| 994 |
+
// Number of threads within tile
|
| 995 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
|
| 996 |
+
return numThreads;
|
| 997 |
+
}
|
| 998 |
+
|
| 999 |
+
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
|
| 1000 |
+
return num_threads();
|
| 1001 |
+
}
|
| 1002 |
+
};
|
| 1003 |
+
|
| 1004 |
+
template <unsigned int Size>
|
| 1005 |
+
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
|
| 1006 |
+
{
|
| 1007 |
+
friend class details::_coalesced_group_data_access;
|
| 1008 |
+
typedef details::tile::tile_helpers<Size> th;
|
| 1009 |
+
|
| 1010 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1011 |
+
static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
|
| 1012 |
+
#else
|
| 1013 |
+
typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
|
| 1014 |
+
#endif
|
| 1015 |
+
using __static_size_tile_base<Size>::numThreads;
|
| 1016 |
+
_CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
|
| 1017 |
+
|
| 1018 |
+
protected:
|
| 1019 |
+
_CG_STATIC_QUALIFIER unsigned int build_mask() {
|
| 1020 |
+
unsigned int mask = fullMask;
|
| 1021 |
+
if (numThreads != 32) {
|
| 1022 |
+
// [0,31] representing the current active thread in the warp
|
| 1023 |
+
unsigned int laneId = details::laneid();
|
| 1024 |
+
// shift mask according to the partition it belongs to
|
| 1025 |
+
mask = th::tileMask << (laneId & ~(th::laneMask));
|
| 1026 |
+
}
|
| 1027 |
+
return (mask);
|
| 1028 |
+
}
|
| 1029 |
+
|
| 1030 |
+
public:
|
| 1031 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
|
| 1032 |
+
|
| 1033 |
+
_CG_STATIC_QUALIFIER void sync() {
|
| 1034 |
+
__syncwarp(build_mask());
|
| 1035 |
+
}
|
| 1036 |
+
|
| 1037 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1038 |
+
// PTX supported collectives
|
| 1039 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1040 |
+
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
|
| 1041 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl(
|
| 1042 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
|
| 1043 |
+
}
|
| 1044 |
+
|
| 1045 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1046 |
+
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
|
| 1047 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
|
| 1048 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1049 |
+
}
|
| 1050 |
+
|
| 1051 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1052 |
+
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
|
| 1053 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
|
| 1054 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
|
| 1055 |
+
}
|
| 1056 |
+
|
| 1057 |
+
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
|
| 1058 |
+
_CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
|
| 1059 |
+
return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
|
| 1060 |
+
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
|
| 1061 |
+
}
|
| 1062 |
+
#else
|
| 1063 |
+
template <typename TyIntegral>
|
| 1064 |
+
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
|
| 1065 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1066 |
+
return (__shfl_sync(build_mask(), var, srcRank, numThreads));
|
| 1067 |
+
}
|
| 1068 |
+
|
| 1069 |
+
template <typename TyIntegral>
|
| 1070 |
+
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
|
| 1071 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1072 |
+
return (__shfl_down_sync(build_mask(), var, delta, numThreads));
|
| 1073 |
+
}
|
| 1074 |
+
|
| 1075 |
+
template <typename TyIntegral>
|
| 1076 |
+
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
|
| 1077 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1078 |
+
return (__shfl_up_sync(build_mask(), var, delta, numThreads));
|
| 1079 |
+
}
|
| 1080 |
+
|
| 1081 |
+
template <typename TyIntegral>
|
| 1082 |
+
_CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
|
| 1083 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1084 |
+
return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
|
| 1085 |
+
}
|
| 1086 |
+
#endif //_CG_CPP11_FEATURES
|
| 1087 |
+
|
| 1088 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 1089 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1090 |
+
return (lane_ballot != 0);
|
| 1091 |
+
}
|
| 1092 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 1093 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1094 |
+
return (lane_ballot == build_mask());
|
| 1095 |
+
}
|
| 1096 |
+
_CG_QUALIFIER unsigned int ballot(int predicate) const {
|
| 1097 |
+
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
|
| 1098 |
+
return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
|
| 1099 |
+
}
|
| 1100 |
+
|
| 1101 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 1102 |
+
template <typename TyIntegral>
|
| 1103 |
+
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
|
| 1104 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1105 |
+
unsigned int lane_match = __match_any_sync(build_mask(), val);
|
| 1106 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1107 |
+
}
|
| 1108 |
+
|
| 1109 |
+
template <typename TyIntegral>
|
| 1110 |
+
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
|
| 1111 |
+
details::assert_if_not_arithmetic<TyIntegral>();
|
| 1112 |
+
unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
|
| 1113 |
+
return (lane_match >> (details::laneid() & (~(th::laneMask))));
|
| 1114 |
+
}
|
| 1115 |
+
#endif
|
| 1116 |
+
|
| 1117 |
+
};
|
| 1118 |
+
|
| 1119 |
+
template <unsigned int Size, typename ParentT>
|
| 1120 |
+
class __static_parent_thread_block_tile_base
|
| 1121 |
+
{
|
| 1122 |
+
public:
|
| 1123 |
+
// Rank of this group in the upper level of the hierarchy
|
| 1124 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
|
| 1125 |
+
return ParentT::thread_rank() / Size;
|
| 1126 |
+
}
|
| 1127 |
+
|
| 1128 |
+
// Total num partitions created out of all CTAs when the group was created
|
| 1129 |
+
_CG_STATIC_QUALIFIER unsigned int meta_group_size() {
|
| 1130 |
+
return (ParentT::size() + Size - 1) / Size;
|
| 1131 |
+
}
|
| 1132 |
+
};
|
| 1133 |
+
|
| 1134 |
+
/**
|
| 1135 |
+
* class thread_block_tile<unsigned int Size, ParentT = void>
|
| 1136 |
+
*
|
| 1137 |
+
* Statically-sized group type, representing one tile of a thread block.
|
| 1138 |
+
* The only specializations currently supported are those with native
|
| 1139 |
+
* hardware support (1/2/4/8/16/32)
|
| 1140 |
+
*
|
| 1141 |
+
* This group exposes warp-synchronous builtins.
|
| 1142 |
+
* Can only be constructed via tiled_partition<Size>(ParentT&)
|
| 1143 |
+
*/
|
| 1144 |
+
|
| 1145 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1146 |
+
class __single_warp_thread_block_tile :
|
| 1147 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1148 |
+
public __static_parent_thread_block_tile_base<Size, ParentT>
|
| 1149 |
+
{
|
| 1150 |
+
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
|
| 1151 |
+
friend class details::_coalesced_group_data_access;
|
| 1152 |
+
|
| 1153 |
+
protected:
|
| 1154 |
+
_CG_QUALIFIER __single_warp_thread_block_tile() { };
|
| 1155 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
|
| 1156 |
+
|
| 1157 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask() {
|
| 1158 |
+
return __static_size_thread_block_tile_base<Size>::build_mask();
|
| 1159 |
+
}
|
| 1160 |
+
};
|
| 1161 |
+
|
| 1162 |
+
template <unsigned int Size>
|
| 1163 |
+
class __single_warp_thread_block_tile<Size, void> :
|
| 1164 |
+
public __static_size_thread_block_tile_base<Size>,
|
| 1165 |
+
public thread_group_base<details::coalesced_group_id>
|
| 1166 |
+
{
|
| 1167 |
+
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
|
| 1168 |
+
|
| 1169 |
+
template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
|
| 1170 |
+
friend class details::_coalesced_group_data_access;
|
| 1171 |
+
|
| 1172 |
+
typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
|
| 1173 |
+
|
| 1174 |
+
protected:
|
| 1175 |
+
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) {
|
| 1176 |
+
_data.coalesced.mask = staticSizeBaseT::build_mask();
|
| 1177 |
+
_data.coalesced.size = numThreads;
|
| 1178 |
+
_data.coalesced.metaGroupRank = meta_group_rank;
|
| 1179 |
+
_data.coalesced.metaGroupSize = meta_group_size;
|
| 1180 |
+
_data.coalesced.is_tiled = true;
|
| 1181 |
+
}
|
| 1182 |
+
|
| 1183 |
+
_CG_QUALIFIER unsigned int get_mask() const {
|
| 1184 |
+
return (_data.coalesced.mask);
|
| 1185 |
+
}
|
| 1186 |
+
|
| 1187 |
+
public:
|
| 1188 |
+
using staticSizeBaseT::sync;
|
| 1189 |
+
using staticSizeBaseT::size;
|
| 1190 |
+
using staticSizeBaseT::num_threads;
|
| 1191 |
+
using staticSizeBaseT::thread_rank;
|
| 1192 |
+
|
| 1193 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 1194 |
+
return _data.coalesced.metaGroupRank;
|
| 1195 |
+
}
|
| 1196 |
+
|
| 1197 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 1198 |
+
return _data.coalesced.metaGroupSize;
|
| 1199 |
+
}
|
| 1200 |
+
};
|
| 1201 |
+
|
| 1202 |
+
/**
|
| 1203 |
+
* Outer level API calls
|
| 1204 |
+
* void sync(GroupT) - see <group_type>.sync()
|
| 1205 |
+
* void thread_rank(GroupT) - see <group_type>.thread_rank()
|
| 1206 |
+
* void group_size(GroupT) - see <group_type>.size()
|
| 1207 |
+
*/
|
| 1208 |
+
template <class GroupT>
|
| 1209 |
+
_CG_QUALIFIER void sync(GroupT const &g)
|
| 1210 |
+
{
|
| 1211 |
+
g.sync();
|
| 1212 |
+
}
|
| 1213 |
+
|
| 1214 |
+
// TODO: Use a static dispatch to determine appropriate return type
|
| 1215 |
+
// C++03 is stuck with unsigned long long for now
|
| 1216 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1217 |
+
template <class GroupT>
|
| 1218 |
+
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
|
| 1219 |
+
return g.thread_rank();
|
| 1220 |
+
}
|
| 1221 |
+
|
| 1222 |
+
|
| 1223 |
+
template <class GroupT>
|
| 1224 |
+
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
|
| 1225 |
+
return g.num_threads();
|
| 1226 |
+
}
|
| 1227 |
+
#else
|
| 1228 |
+
template <class GroupT>
|
| 1229 |
+
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
|
| 1230 |
+
return static_cast<unsigned long long>(g.thread_rank());
|
| 1231 |
+
}
|
| 1232 |
+
|
| 1233 |
+
|
| 1234 |
+
template <class GroupT>
|
| 1235 |
+
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
|
| 1236 |
+
return static_cast<unsigned long long>(g.num_threads());
|
| 1237 |
+
}
|
| 1238 |
+
#endif
|
| 1239 |
+
|
| 1240 |
+
|
| 1241 |
+
/**
|
| 1242 |
+
* tiled_partition
|
| 1243 |
+
*
|
| 1244 |
+
* The tiled_partition(parent, tilesz) method is a collective operation that
|
| 1245 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1246 |
+
*
|
| 1247 |
+
* A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
|
| 1248 |
+
* be created where threads having identical k = (thread_rank(parent)/tilesz)
|
| 1249 |
+
* will be members of the same subgroup.
|
| 1250 |
+
*
|
| 1251 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1252 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1253 |
+
*
|
| 1254 |
+
* Functionality is limited to power-of-two sized subgorup instances of at most
|
| 1255 |
+
* 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
|
| 1256 |
+
* tiled_partition() in _CG_VERSION 1000.
|
| 1257 |
+
*/
|
| 1258 |
+
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
|
| 1259 |
+
{
|
| 1260 |
+
if (parent.get_type() == details::coalesced_group_id) {
|
| 1261 |
+
const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
|
| 1262 |
+
return _cg->_get_tiled_threads(tilesz);
|
| 1263 |
+
}
|
| 1264 |
+
else {
|
| 1265 |
+
const thread_block *_tb = static_cast<const thread_block*>(&parent);
|
| 1266 |
+
return _tb->_get_tiled_threads(tilesz);
|
| 1267 |
+
}
|
| 1268 |
+
}
|
| 1269 |
+
|
| 1270 |
+
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
|
| 1271 |
+
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
|
| 1272 |
+
{
|
| 1273 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1274 |
+
}
|
| 1275 |
+
|
| 1276 |
+
// Coalesced group type overload: retains its ability to stay coalesced
|
| 1277 |
+
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
|
| 1278 |
+
{
|
| 1279 |
+
return (parent._get_tiled_threads(tilesz));
|
| 1280 |
+
}
|
| 1281 |
+
|
| 1282 |
+
namespace details {
|
| 1283 |
+
template <unsigned int Size, typename ParentT>
|
| 1284 |
+
class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
|
| 1285 |
+
|
| 1286 |
+
template <unsigned int Size, typename ParentT>
|
| 1287 |
+
_CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
|
| 1288 |
+
return internal_thread_block_tile<Size, ParentT>();
|
| 1289 |
+
}
|
| 1290 |
+
|
| 1291 |
+
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
|
| 1292 |
+
_CG_QUALIFIER TyVal multi_warp_collectives_helper(
|
| 1293 |
+
const GroupT& group,
|
| 1294 |
+
WarpLambda warp_lambda,
|
| 1295 |
+
InterWarpLambda inter_warp_lambda) {
|
| 1296 |
+
return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
|
| 1297 |
+
}
|
| 1298 |
+
|
| 1299 |
+
template <typename T, typename GroupT>
|
| 1300 |
+
_CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
|
| 1301 |
+
return group.template get_scratch_location<T>(warp_id);
|
| 1302 |
+
}
|
| 1303 |
+
|
| 1304 |
+
template <typename GroupT>
|
| 1305 |
+
_CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
|
| 1306 |
+
return group.get_sync_location();
|
| 1307 |
+
}
|
| 1308 |
+
|
| 1309 |
+
}
|
| 1310 |
+
/**
|
| 1311 |
+
* tiled_partition<tilesz>
|
| 1312 |
+
*
|
| 1313 |
+
* The tiled_partition<tilesz>(parent) method is a collective operation that
|
| 1314 |
+
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
|
| 1315 |
+
*
|
| 1316 |
+
* A total of ((size(parent)/tilesz) subgroups will be created,
|
| 1317 |
+
* therefore the parent group size must be evenly divisible by the tilesz.
|
| 1318 |
+
* The allow parent groups are thread_block or thread_block_tile<size>.
|
| 1319 |
+
*
|
| 1320 |
+
* The implementation may cause the calling thread to wait until all the members
|
| 1321 |
+
* of the parent group have invoked the operation before resuming execution.
|
| 1322 |
+
*
|
| 1323 |
+
* Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
|
| 1324 |
+
* The size(parent) must be greater than the template Size parameter
|
| 1325 |
+
* otherwise the results are undefined.
|
| 1326 |
+
*/
|
| 1327 |
+
|
| 1328 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1329 |
+
template <unsigned int Size>
|
| 1330 |
+
class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
|
| 1331 |
+
{
|
| 1332 |
+
static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
|
| 1333 |
+
|
| 1334 |
+
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
|
| 1335 |
+
friend TyVal details::multi_warp_collectives_helper(
|
| 1336 |
+
const GroupT& group,
|
| 1337 |
+
WarpLambda warp_lambda,
|
| 1338 |
+
InterWarpLambda inter_warp_lambda);
|
| 1339 |
+
template <typename T, typename GroupT>
|
| 1340 |
+
friend T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
|
| 1341 |
+
template <typename GroupT>
|
| 1342 |
+
friend details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
|
| 1343 |
+
template <unsigned int OtherSize>
|
| 1344 |
+
friend class __static_size_multi_warp_tile_base;
|
| 1345 |
+
using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 1346 |
+
using ThisType = __static_size_multi_warp_tile_base<Size>;
|
| 1347 |
+
_CG_STATIC_CONST_DECL int numWarps = Size / 32;
|
| 1348 |
+
const unsigned short communication_size;
|
| 1349 |
+
const unsigned short max_block_size;
|
| 1350 |
+
|
| 1351 |
+
protected:
|
| 1352 |
+
details::multi_warp_scratch* const tile_memory;
|
| 1353 |
+
|
| 1354 |
+
template <typename GroupT>
|
| 1355 |
+
_CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) :
|
| 1356 |
+
tile_memory(g.tile_memory), communication_size(g.communication_size), max_block_size(g.max_block_size) {}
|
| 1357 |
+
|
| 1358 |
+
|
| 1359 |
+
private:
|
| 1360 |
+
_CG_QUALIFIER details::barrier_t* get_sync_location() const {
|
| 1361 |
+
// Different group sizes use different barriers, all groups of a given size share one barrier.
|
| 1362 |
+
unsigned int sync_id = details::log2(Size / 64);
|
| 1363 |
+
return &(reinterpret_cast<details::barrier_t*>(tile_memory->memory)[sync_id]);
|
| 1364 |
+
}
|
| 1365 |
+
|
| 1366 |
+
template <typename T>
|
| 1367 |
+
_CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
|
| 1368 |
+
unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
|
| 1369 |
+
unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
|
| 1370 |
+
return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
|
| 1371 |
+
}
|
| 1372 |
+
|
| 1373 |
+
template <typename T>
|
| 1374 |
+
_CG_QUALIFIER T* get_scratch_location() const {
|
| 1375 |
+
unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
|
| 1376 |
+
unsigned int scratch_id = details::cta::thread_rank() / 32;
|
| 1377 |
+
return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
|
| 1378 |
+
}
|
| 1379 |
+
|
| 1380 |
+
template <typename TyVal>
|
| 1381 |
+
_CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
|
| 1382 |
+
unsigned int src_warp = src / 32;
|
| 1383 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1384 |
+
details::barrier_t* sync_location = get_sync_location();
|
| 1385 |
+
|
| 1386 |
+
// Get warp slot of the source threads warp.
|
| 1387 |
+
TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
|
| 1388 |
+
|
| 1389 |
+
if (warp.meta_group_rank() == src_warp) {
|
| 1390 |
+
// Put shuffled value into my warp slot and let my warp arrive at the barrier.
|
| 1391 |
+
if (thread_rank() == src) {
|
| 1392 |
+
*warp_scratch_location = val;
|
| 1393 |
+
}
|
| 1394 |
+
details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
|
| 1395 |
+
TyVal result = *warp_scratch_location;
|
| 1396 |
+
details::sync_warps_wait(sync_location, details::cta::thread_rank());
|
| 1397 |
+
return result;
|
| 1398 |
+
}
|
| 1399 |
+
else {
|
| 1400 |
+
// Wait for the source warp to arrive on the barrier.
|
| 1401 |
+
details::sync_warps_wait_for_warps<details::wait_for_specific_warp>(
|
| 1402 |
+
(details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp),
|
| 1403 |
+
sync_location, details::cta::thread_rank(),
|
| 1404 |
+
numWarps);
|
| 1405 |
+
TyVal result = *warp_scratch_location;
|
| 1406 |
+
details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
|
| 1407 |
+
return result;
|
| 1408 |
+
}
|
| 1409 |
+
}
|
| 1410 |
+
|
| 1411 |
+
template <typename TyVal>
|
| 1412 |
+
_CG_QUALIFIER TyVal shfl_iterative_impl(TyVal val, unsigned int src) const {
|
| 1413 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1414 |
+
|
| 1415 |
+
details::copy_channel<numWarps> broadcast_channel{
|
| 1416 |
+
get_scratch_location<char>(0),
|
| 1417 |
+
get_sync_location(),
|
| 1418 |
+
(size_t) communication_size * numWarps};
|
| 1419 |
+
|
| 1420 |
+
if (warp.meta_group_rank() == src / 32) {
|
| 1421 |
+
val = warp.shfl(val, src);
|
| 1422 |
+
broadcast_channel.template send_value<
|
| 1423 |
+
TyVal, 32, decltype(broadcast_channel)::send_many_to_many>(
|
| 1424 |
+
val, warp.thread_rank(), details::cta::thread_rank() / 32);
|
| 1425 |
+
}
|
| 1426 |
+
else {
|
| 1427 |
+
broadcast_channel.template receive_value<TyVal>(val, warp.thread_rank() == 0);
|
| 1428 |
+
}
|
| 1429 |
+
sync();
|
| 1430 |
+
return val;
|
| 1431 |
+
}
|
| 1432 |
+
|
| 1433 |
+
template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
|
| 1434 |
+
_CG_QUALIFIER TyVal collectives_scheme_impl(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
|
| 1435 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1436 |
+
details::barrier_t* sync_location = get_sync_location();
|
| 1437 |
+
TyVal* warp_scratch_location = get_scratch_location<TyVal>();
|
| 1438 |
+
|
| 1439 |
+
warp_lambda(warp, warp_scratch_location);
|
| 1440 |
+
|
| 1441 |
+
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
|
| 1442 |
+
auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
|
| 1443 |
+
if (subwarp.meta_group_rank() == 0) {
|
| 1444 |
+
TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
|
| 1445 |
+
inter_warp_lambda(subwarp, thread_scratch_location);
|
| 1446 |
+
}
|
| 1447 |
+
warp.sync();
|
| 1448 |
+
details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
|
| 1449 |
+
}
|
| 1450 |
+
TyVal result = *warp_scratch_location;
|
| 1451 |
+
warp.sync(); // Added warpsync, if all collectives do sync before writing to reduce_location (they does right now),
|
| 1452 |
+
// we could delete it.
|
| 1453 |
+
return result;
|
| 1454 |
+
}
|
| 1455 |
+
|
| 1456 |
+
template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
|
| 1457 |
+
_CG_QUALIFIER TyVal collectives_scheme_iterative_impl(
|
| 1458 |
+
const WarpLambda& warp_lambda,
|
| 1459 |
+
const InterWarpLambda& inter_warp_lambda) const {
|
| 1460 |
+
auto warp = details::tiled_partition_internal<32, ThisType>();
|
| 1461 |
+
details::barrier_t* sync_location = get_sync_location();
|
| 1462 |
+
details::copy_channel<numWarps> final_result_channel{
|
| 1463 |
+
get_scratch_location<char>(0),
|
| 1464 |
+
sync_location,
|
| 1465 |
+
(size_t) communication_size * numWarps};
|
| 1466 |
+
|
| 1467 |
+
TyVal warp_result;
|
| 1468 |
+
warp_lambda(warp, &warp_result);
|
| 1469 |
+
|
| 1470 |
+
if (warp.meta_group_rank() == 0) {
|
| 1471 |
+
auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
|
| 1472 |
+
details::copy_channel<numWarps> partial_results_channel{
|
| 1473 |
+
get_scratch_location<char>(subwarp.thread_rank()),
|
| 1474 |
+
sync_location,
|
| 1475 |
+
(size_t) communication_size};
|
| 1476 |
+
|
| 1477 |
+
// Thread 0 in subwarp set as inactive to not overwrite warp 0 warp_result.
|
| 1478 |
+
partial_results_channel.template receive_value<TyVal>(
|
| 1479 |
+
warp_result,
|
| 1480 |
+
warp.thread_rank() == 0,
|
| 1481 |
+
subwarp.thread_rank() != 0 && subwarp.meta_group_rank() == 0);
|
| 1482 |
+
if (subwarp.meta_group_rank() == 0) {
|
| 1483 |
+
inter_warp_lambda(subwarp, &warp_result);
|
| 1484 |
+
}
|
| 1485 |
+
warp_result = warp.shfl(warp_result, 0);
|
| 1486 |
+
final_result_channel.template send_value<TyVal, 32, decltype(final_result_channel)::send_many_to_many>(
|
| 1487 |
+
warp_result,
|
| 1488 |
+
warp.thread_rank(),
|
| 1489 |
+
details::cta::thread_rank() / 32);
|
| 1490 |
+
}
|
| 1491 |
+
else {
|
| 1492 |
+
details::copy_channel<numWarps> partial_results_channel{get_scratch_location<char>(), sync_location, (size_t) communication_size};
|
| 1493 |
+
partial_results_channel.template send_value<TyVal, 32, decltype(partial_results_channel)::send_many_to_one>(
|
| 1494 |
+
warp_result,
|
| 1495 |
+
warp.thread_rank(),
|
| 1496 |
+
(details::cta::thread_rank() - thread_rank()) / 32);
|
| 1497 |
+
final_result_channel.template receive_value<TyVal>(warp_result, warp.thread_rank() == 0);
|
| 1498 |
+
}
|
| 1499 |
+
sync();
|
| 1500 |
+
return warp_result;
|
| 1501 |
+
}
|
| 1502 |
+
|
| 1503 |
+
template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
|
| 1504 |
+
_CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
|
| 1505 |
+
if (sizeof(TyVal) > communication_size) {
|
| 1506 |
+
return collectives_scheme_iterative_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
|
| 1507 |
+
}
|
| 1508 |
+
else {
|
| 1509 |
+
return collectives_scheme_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
|
| 1510 |
+
}
|
| 1511 |
+
}
|
| 1512 |
+
|
| 1513 |
+
public:
|
| 1514 |
+
_CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
|
| 1515 |
+
|
| 1516 |
+
using __static_size_tile_base<Size>::thread_rank;
|
| 1517 |
+
|
| 1518 |
+
template <typename TyVal>
|
| 1519 |
+
_CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
|
| 1520 |
+
if (sizeof(TyVal) > communication_size) {
|
| 1521 |
+
return shfl_iterative_impl(val, src);
|
| 1522 |
+
}
|
| 1523 |
+
else {
|
| 1524 |
+
return shfl_impl(val, src);
|
| 1525 |
+
}
|
| 1526 |
+
}
|
| 1527 |
+
|
| 1528 |
+
_CG_QUALIFIER void sync() const {
|
| 1529 |
+
details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
|
| 1530 |
+
}
|
| 1531 |
+
|
| 1532 |
+
_CG_QUALIFIER int any(int predicate) const {
|
| 1533 |
+
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
|
| 1534 |
+
*warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
|
| 1535 |
+
};
|
| 1536 |
+
auto inter_warp_lambda =
|
| 1537 |
+
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
|
| 1538 |
+
*thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
|
| 1539 |
+
};
|
| 1540 |
+
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
|
| 1541 |
+
}
|
| 1542 |
+
|
| 1543 |
+
_CG_QUALIFIER int all(int predicate) const {
|
| 1544 |
+
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
|
| 1545 |
+
*warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
|
| 1546 |
+
};
|
| 1547 |
+
auto inter_warp_lambda =
|
| 1548 |
+
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
|
| 1549 |
+
*thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
|
| 1550 |
+
};
|
| 1551 |
+
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
|
| 1552 |
+
}
|
| 1553 |
+
};
|
| 1554 |
+
|
| 1555 |
+
|
| 1556 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1557 |
+
class __multi_warp_thread_block_tile :
|
| 1558 |
+
public __static_size_multi_warp_tile_base<Size>,
|
| 1559 |
+
public __static_parent_thread_block_tile_base<Size, ParentT>
|
| 1560 |
+
{
|
| 1561 |
+
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
|
| 1562 |
+
typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
|
| 1563 |
+
protected:
|
| 1564 |
+
_CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
|
| 1565 |
+
__static_size_multi_warp_tile_base<Size>(g) {}
|
| 1566 |
+
};
|
| 1567 |
+
|
| 1568 |
+
template <unsigned int Size>
|
| 1569 |
+
class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
|
| 1570 |
+
{
|
| 1571 |
+
const unsigned int metaGroupRank;
|
| 1572 |
+
const unsigned int metaGroupSize;
|
| 1573 |
+
|
| 1574 |
+
protected:
|
| 1575 |
+
template <unsigned int OtherSize, typename ParentT>
|
| 1576 |
+
_CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
|
| 1577 |
+
__static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
|
| 1578 |
+
|
| 1579 |
+
public:
|
| 1580 |
+
_CG_QUALIFIER unsigned int meta_group_rank() const {
|
| 1581 |
+
return metaGroupRank;
|
| 1582 |
+
}
|
| 1583 |
+
|
| 1584 |
+
_CG_QUALIFIER unsigned int meta_group_size() const {
|
| 1585 |
+
return metaGroupSize;
|
| 1586 |
+
}
|
| 1587 |
+
};
|
| 1588 |
+
#endif
|
| 1589 |
+
|
| 1590 |
+
template <unsigned int Size, typename ParentT = void>
|
| 1591 |
+
class thread_block_tile;
|
| 1592 |
+
|
| 1593 |
+
namespace details {
|
| 1594 |
+
template <unsigned int Size, typename ParentT, bool IsMultiWarp>
|
| 1595 |
+
class thread_block_tile_impl;
|
| 1596 |
+
|
| 1597 |
+
template <unsigned int Size, typename ParentT>
|
| 1598 |
+
class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
|
| 1599 |
+
{
|
| 1600 |
+
protected:
|
| 1601 |
+
template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
|
| 1602 |
+
_CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
|
| 1603 |
+
__single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
|
| 1604 |
+
|
| 1605 |
+
_CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
|
| 1606 |
+
__single_warp_thread_block_tile<Size, ParentT>() {}
|
| 1607 |
+
};
|
| 1608 |
+
|
| 1609 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1610 |
+
template <unsigned int Size, typename ParentT>
|
| 1611 |
+
class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
|
| 1612 |
+
{
|
| 1613 |
+
protected:
|
| 1614 |
+
template <typename GroupT>
|
| 1615 |
+
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
|
| 1616 |
+
__multi_warp_thread_block_tile<Size, ParentT>(g) {}
|
| 1617 |
+
};
|
| 1618 |
+
#else
|
| 1619 |
+
template <unsigned int Size, typename ParentT>
|
| 1620 |
+
class thread_block_tile_impl<Size, ParentT, true>
|
| 1621 |
+
{
|
| 1622 |
+
protected:
|
| 1623 |
+
template <typename GroupT>
|
| 1624 |
+
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
|
| 1625 |
+
};
|
| 1626 |
+
#endif
|
| 1627 |
+
}
|
| 1628 |
+
|
| 1629 |
+
template <unsigned int Size, typename ParentT>
|
| 1630 |
+
class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
|
| 1631 |
+
{
|
| 1632 |
+
friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
|
| 1633 |
+
|
| 1634 |
+
protected:
|
| 1635 |
+
_CG_QUALIFIER thread_block_tile(const ParentT& g) :
|
| 1636 |
+
details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
|
| 1637 |
+
|
| 1638 |
+
public:
|
| 1639 |
+
_CG_QUALIFIER operator thread_block_tile<Size, void>() const {
|
| 1640 |
+
return thread_block_tile<Size, void>(*this);
|
| 1641 |
+
}
|
| 1642 |
+
};
|
| 1643 |
+
|
| 1644 |
+
template <unsigned int Size>
|
| 1645 |
+
class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
|
| 1646 |
+
{
|
| 1647 |
+
template <unsigned int, typename ParentT>
|
| 1648 |
+
friend class thread_block_tile;
|
| 1649 |
+
|
| 1650 |
+
protected:
|
| 1651 |
+
template <unsigned int OtherSize, typename OtherParentT>
|
| 1652 |
+
_CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
|
| 1653 |
+
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
|
| 1654 |
+
|
| 1655 |
+
public:
|
| 1656 |
+
template <typename ParentT>
|
| 1657 |
+
_CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
|
| 1658 |
+
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
|
| 1659 |
+
};
|
| 1660 |
+
|
| 1661 |
+
namespace details {
|
| 1662 |
+
template <unsigned int Size, typename ParentT>
|
| 1663 |
+
struct tiled_partition_impl;
|
| 1664 |
+
|
| 1665 |
+
template <unsigned int Size>
|
| 1666 |
+
struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
|
| 1667 |
+
_CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
|
| 1668 |
+
thread_block_tile<Size, thread_block>(g) {}
|
| 1669 |
+
};
|
| 1670 |
+
|
| 1671 |
+
// ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
|
| 1672 |
+
template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
|
| 1673 |
+
struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
|
| 1674 |
+
public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
|
| 1675 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1676 |
+
static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
|
| 1677 |
+
#endif
|
| 1678 |
+
_CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
|
| 1679 |
+
thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
|
| 1680 |
+
};
|
| 1681 |
+
|
| 1682 |
+
}
|
| 1683 |
+
|
| 1684 |
+
namespace experimental {
|
| 1685 |
+
template <unsigned int Size, typename ParentT>
|
| 1686 |
+
_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
|
| 1687 |
+
{
|
| 1688 |
+
#if defined(_CG_CPP11_FEATURES) && !defined(_CG_ABI_EXPERIMENTAL)
|
| 1689 |
+
static_assert(details::_is_single_warp<Size>::value, "_CG_ABI_EXPERIMENTAL needs to be defined"
|
| 1690 |
+
" before cooperative_groups header is included to enable experimental features");
|
| 1691 |
+
#endif
|
| 1692 |
+
return details::tiled_partition_impl<Size, ParentT>(g);
|
| 1693 |
+
}
|
| 1694 |
+
|
| 1695 |
+
}
|
| 1696 |
+
|
| 1697 |
+
template <unsigned int Size, typename ParentT>
|
| 1698 |
+
_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
|
| 1699 |
+
{
|
| 1700 |
+
#ifdef _CG_CPP11_FEATURES
|
| 1701 |
+
static_assert(details::_is_single_warp<Size>::value, "Tiled partition with Size > 32 is supported only by"
|
| 1702 |
+
" cooperative_groups::experimental::tiled_partition available with experimental features enabled");
|
| 1703 |
+
#endif
|
| 1704 |
+
return details::tiled_partition_impl<Size, ParentT>(g);
|
| 1705 |
+
}
|
| 1706 |
+
|
| 1707 |
+
/**
|
| 1708 |
+
* thread_group this_thread()
|
| 1709 |
+
*
|
| 1710 |
+
* Constructs a generic thread_group containing only the calling thread
|
| 1711 |
+
*/
|
| 1712 |
+
_CG_QUALIFIER thread_block_tile<1, void> this_thread()
|
| 1713 |
+
{
|
| 1714 |
+
// Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
|
| 1715 |
+
// meta group rank and size set to 0 and 1 respectively.
|
| 1716 |
+
return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
|
| 1717 |
+
}
|
| 1718 |
+
|
| 1719 |
+
/**
|
| 1720 |
+
* <group_type>.sync()
|
| 1721 |
+
*
|
| 1722 |
+
* Executes a barrier across the group
|
| 1723 |
+
*
|
| 1724 |
+
* Implements both a compiler fence and an architectural fence to prevent,
|
| 1725 |
+
* memory reordering around the barrier.
|
| 1726 |
+
*/
|
| 1727 |
+
_CG_QUALIFIER void thread_group::sync() const
|
| 1728 |
+
{
|
| 1729 |
+
switch (_data.group.type) {
|
| 1730 |
+
case details::coalesced_group_id:
|
| 1731 |
+
cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
|
| 1732 |
+
break;
|
| 1733 |
+
case details::thread_block_id:
|
| 1734 |
+
cooperative_groups::sync(*static_cast<const thread_block*>(this));
|
| 1735 |
+
break;
|
| 1736 |
+
case details::grid_group_id:
|
| 1737 |
+
cooperative_groups::sync(*static_cast<const grid_group*>(this));
|
| 1738 |
+
break;
|
| 1739 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1740 |
+
case details::multi_grid_group_id:
|
| 1741 |
+
cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
|
| 1742 |
+
break;
|
| 1743 |
+
#endif
|
| 1744 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 1745 |
+
case details::cluster_group_id:
|
| 1746 |
+
cooperative_groups::sync(*static_cast<const cluster_group*>(this));
|
| 1747 |
+
break;
|
| 1748 |
+
#endif
|
| 1749 |
+
default:
|
| 1750 |
+
break;
|
| 1751 |
+
}
|
| 1752 |
+
}
|
| 1753 |
+
|
| 1754 |
+
/**
|
| 1755 |
+
* <group_type>.size()
|
| 1756 |
+
*
|
| 1757 |
+
* Returns the total number of threads in the group.
|
| 1758 |
+
*/
|
| 1759 |
+
_CG_QUALIFIER unsigned long long thread_group::size() const
|
| 1760 |
+
{
|
| 1761 |
+
unsigned long long size = 0;
|
| 1762 |
+
switch (_data.group.type) {
|
| 1763 |
+
case details::coalesced_group_id:
|
| 1764 |
+
size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
|
| 1765 |
+
break;
|
| 1766 |
+
case details::thread_block_id:
|
| 1767 |
+
size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
|
| 1768 |
+
break;
|
| 1769 |
+
case details::grid_group_id:
|
| 1770 |
+
size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
|
| 1771 |
+
break;
|
| 1772 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1773 |
+
case details::multi_grid_group_id:
|
| 1774 |
+
size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
|
| 1775 |
+
break;
|
| 1776 |
+
#endif
|
| 1777 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 1778 |
+
case details::cluster_group_id:
|
| 1779 |
+
size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
|
| 1780 |
+
break;
|
| 1781 |
+
#endif
|
| 1782 |
+
default:
|
| 1783 |
+
break;
|
| 1784 |
+
}
|
| 1785 |
+
return size;
|
| 1786 |
+
}
|
| 1787 |
+
|
| 1788 |
+
/**
|
| 1789 |
+
* <group_type>.thread_rank()
|
| 1790 |
+
*
|
| 1791 |
+
* Returns the linearized rank of the calling thread along the interval [0, size()).
|
| 1792 |
+
*/
|
| 1793 |
+
_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
|
| 1794 |
+
{
|
| 1795 |
+
unsigned long long rank = 0;
|
| 1796 |
+
switch (_data.group.type) {
|
| 1797 |
+
case details::coalesced_group_id:
|
| 1798 |
+
rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
|
| 1799 |
+
break;
|
| 1800 |
+
case details::thread_block_id:
|
| 1801 |
+
rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
|
| 1802 |
+
break;
|
| 1803 |
+
case details::grid_group_id:
|
| 1804 |
+
rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
|
| 1805 |
+
break;
|
| 1806 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 1807 |
+
case details::multi_grid_group_id:
|
| 1808 |
+
rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
|
| 1809 |
+
break;
|
| 1810 |
+
#endif
|
| 1811 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 1812 |
+
case details::cluster_group_id:
|
| 1813 |
+
rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
|
| 1814 |
+
break;
|
| 1815 |
+
#endif
|
| 1816 |
+
default:
|
| 1817 |
+
break;
|
| 1818 |
+
}
|
| 1819 |
+
return rank;
|
| 1820 |
+
}
|
| 1821 |
+
|
| 1822 |
+
_CG_END_NAMESPACE
|
| 1823 |
+
|
| 1824 |
+
#include <cooperative_groups/details/partitioning.h>
|
| 1825 |
+
|
| 1826 |
+
# endif /* ! (__cplusplus, __CUDACC__) */
|
| 1827 |
+
|
| 1828 |
+
#endif /* !_COOPERATIVE_GROUPS_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_COALESCED_REDUCE_H_
|
| 50 |
+
#define _CG_COALESCED_REDUCE_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "cooperative_groups.h"
|
| 55 |
+
#include "partitioning.h"
|
| 56 |
+
#include "coalesced_scan.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <typename TyVal, typename TyOp>
|
| 63 |
+
_CG_QUALIFIER auto coalesced_reduce_to_one(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 64 |
+
if (group.size() == 32) {
|
| 65 |
+
auto out = val;
|
| 66 |
+
for (int offset = group.size() >> 1; offset > 0; offset >>= 1) {
|
| 67 |
+
out = op(out, group.shfl_up(out, offset));
|
| 68 |
+
}
|
| 69 |
+
return out;
|
| 70 |
+
}
|
| 71 |
+
else {
|
| 72 |
+
auto scan_result =
|
| 73 |
+
inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 74 |
+
return scan_result;
|
| 75 |
+
}
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
template <typename TyVal, typename TyOp>
|
| 79 |
+
_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 80 |
+
auto out = coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 81 |
+
if (group.size() == 32) {
|
| 82 |
+
return group.shfl(out, 31);
|
| 83 |
+
}
|
| 84 |
+
else {
|
| 85 |
+
unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
|
| 86 |
+
unsigned int last_thread_id = 31 - __clz(group_mask);
|
| 87 |
+
return details::tile::shuffle_dispatch<TyVal>::shfl(
|
| 88 |
+
_CG_STL_NAMESPACE::forward<TyVal>(out), group_mask, last_thread_id, 32);
|
| 89 |
+
}
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
|
| 93 |
+
_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
|
| 94 |
+
TyVal&& val,
|
| 95 |
+
TyOp&& op) -> decltype(op(val, val)) {
|
| 96 |
+
auto out = val;
|
| 97 |
+
for (int mask = TySize >> 1; mask > 0; mask >>= 1) {
|
| 98 |
+
out = op(out, group.shfl_xor(out, mask));
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
return out;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
} // details
|
| 105 |
+
|
| 106 |
+
_CG_END_NAMESPACE
|
| 107 |
+
|
| 108 |
+
#endif // _CG_COALESCED_REDUCE_H_
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_FUNCTIONAL_H
|
| 50 |
+
#define _CG_FUNCTIONAL_H
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
#ifdef _CG_USE_CUDA_STL
|
| 57 |
+
# include <cuda/std/functional>
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
_CG_BEGIN_NAMESPACE
|
| 61 |
+
|
| 62 |
+
namespace details {
|
| 63 |
+
#ifdef _CG_USE_CUDA_STL
|
| 64 |
+
using cuda::std::plus;
|
| 65 |
+
using cuda::std::bit_and;
|
| 66 |
+
using cuda::std::bit_xor;
|
| 67 |
+
using cuda::std::bit_or;
|
| 68 |
+
#else
|
| 69 |
+
template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
|
| 70 |
+
template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
|
| 71 |
+
template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
|
| 72 |
+
template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
|
| 73 |
+
#endif // _CG_USE_PLATFORM_STL
|
| 74 |
+
} // details
|
| 75 |
+
|
| 76 |
+
template <typename Ty>
|
| 77 |
+
struct plus : public details::plus<Ty> {};
|
| 78 |
+
|
| 79 |
+
template <typename Ty>
|
| 80 |
+
struct less {
|
| 81 |
+
__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
|
| 82 |
+
return (arg2 < arg1) ? arg2 : arg1;
|
| 83 |
+
}
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
+
template <typename Ty>
|
| 87 |
+
struct greater {
|
| 88 |
+
__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
|
| 89 |
+
return (arg1 < arg2) ? arg2 : arg1;
|
| 90 |
+
}
|
| 91 |
+
};
|
| 92 |
+
|
| 93 |
+
template <typename Ty>
|
| 94 |
+
struct bit_and : public details::bit_and<Ty> {};
|
| 95 |
+
|
| 96 |
+
template <typename Ty>
|
| 97 |
+
struct bit_xor : public details::bit_xor<Ty> {};
|
| 98 |
+
|
| 99 |
+
template <typename Ty>
|
| 100 |
+
struct bit_or : public details::bit_or<Ty> {};
|
| 101 |
+
|
| 102 |
+
#if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
|
| 103 |
+
namespace details {
|
| 104 |
+
template <class Ty>
|
| 105 |
+
using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
|
| 106 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
|
| 107 |
+
|
| 108 |
+
template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 109 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 110 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 111 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 112 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 113 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 114 |
+
template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
|
| 115 |
+
|
| 116 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 117 |
+
_CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
|
| 118 |
+
remove_qual<TyVal> old = atomic;
|
| 119 |
+
while(!atomic.compare_exchange_weak(old, op(old, val)));
|
| 120 |
+
return old;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
template<typename TyOp>
|
| 124 |
+
struct op_picker;
|
| 125 |
+
|
| 126 |
+
template<typename TyVal>
|
| 127 |
+
struct op_picker<cooperative_groups::plus<TyVal>> {
|
| 128 |
+
template<typename TyAtomic>
|
| 129 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 130 |
+
return atomic.fetch_add(val);
|
| 131 |
+
}
|
| 132 |
+
};
|
| 133 |
+
|
| 134 |
+
template<typename TyVal>
|
| 135 |
+
struct op_picker<cooperative_groups::less<TyVal>> {
|
| 136 |
+
template<typename TyAtomic>
|
| 137 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 138 |
+
return atomic.fetch_min(val);
|
| 139 |
+
}
|
| 140 |
+
};
|
| 141 |
+
|
| 142 |
+
template<typename TyVal>
|
| 143 |
+
struct op_picker<cooperative_groups::greater<TyVal>> {
|
| 144 |
+
template<typename TyAtomic>
|
| 145 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 146 |
+
return atomic.fetch_max(val);
|
| 147 |
+
}
|
| 148 |
+
};
|
| 149 |
+
|
| 150 |
+
template<typename TyVal>
|
| 151 |
+
struct op_picker<cooperative_groups::bit_and<TyVal>> {
|
| 152 |
+
template<typename TyAtomic>
|
| 153 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 154 |
+
return atomic.fetch_and(val);
|
| 155 |
+
}
|
| 156 |
+
};
|
| 157 |
+
|
| 158 |
+
template<typename TyVal>
|
| 159 |
+
struct op_picker<cooperative_groups::bit_xor<TyVal>> {
|
| 160 |
+
template<typename TyAtomic>
|
| 161 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 162 |
+
return atomic.fetch_xor(val);
|
| 163 |
+
}
|
| 164 |
+
};
|
| 165 |
+
|
| 166 |
+
template<typename TyVal>
|
| 167 |
+
struct op_picker<cooperative_groups::bit_or<TyVal>> {
|
| 168 |
+
template<typename TyAtomic>
|
| 169 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
|
| 170 |
+
return atomic.fetch_or(val);
|
| 171 |
+
}
|
| 172 |
+
};
|
| 173 |
+
|
| 174 |
+
template<bool atomic_supported>
|
| 175 |
+
struct atomic_update_dispatch {};
|
| 176 |
+
|
| 177 |
+
template<>
|
| 178 |
+
struct atomic_update_dispatch<false> {
|
| 179 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 180 |
+
_CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
|
| 181 |
+
return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 182 |
+
}
|
| 183 |
+
};
|
| 184 |
+
|
| 185 |
+
template<>
|
| 186 |
+
struct atomic_update_dispatch<true> {
|
| 187 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 188 |
+
_CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
|
| 189 |
+
using dispatch = op_picker<details::remove_qual<TyOp>>;
|
| 190 |
+
|
| 191 |
+
return dispatch::atomic_update(atomic, val);
|
| 192 |
+
}
|
| 193 |
+
};
|
| 194 |
+
|
| 195 |
+
template<typename TyAtomic, typename TyVal, typename TyOp>
|
| 196 |
+
_CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
|
| 197 |
+
using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
|
| 198 |
+
|
| 199 |
+
return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 200 |
+
}
|
| 201 |
+
}
|
| 202 |
+
#endif
|
| 203 |
+
|
| 204 |
+
_CG_END_NAMESPACE
|
| 205 |
+
|
| 206 |
+
#endif
|
| 207 |
+
#endif //_CG_FUNCTIONAL_H
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h
ADDED
|
@@ -0,0 +1,707 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_HELPERS_H_
|
| 50 |
+
# define _COOPERATIVE_GROUPS_HELPERS_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "sync.h"
|
| 54 |
+
|
| 55 |
+
_CG_BEGIN_NAMESPACE
|
| 56 |
+
|
| 57 |
+
namespace details {
|
| 58 |
+
#ifdef _CG_CPP11_FEATURES
|
| 59 |
+
template <typename Ty> struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};
|
| 60 |
+
# ifdef _CG_HAS_FP16_COLLECTIVE
|
| 61 |
+
template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {};
|
| 62 |
+
template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
|
| 63 |
+
# endif
|
| 64 |
+
template <typename Ty>
|
| 65 |
+
using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;
|
| 66 |
+
|
| 67 |
+
// Non-STL utility templates
|
| 68 |
+
template <typename Ty>
|
| 69 |
+
using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;
|
| 70 |
+
|
| 71 |
+
template <typename TyLhs, typename TyRhs>
|
| 72 |
+
using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
|
| 73 |
+
>;
|
| 74 |
+
#endif
|
| 75 |
+
|
| 76 |
+
template <typename TyTrunc>
|
| 77 |
+
_CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
|
| 78 |
+
return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
|
| 79 |
+
((TyTrunc)index.y * nIndex.x) +
|
| 80 |
+
(TyTrunc)index.x;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
namespace cta {
|
| 84 |
+
|
| 85 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 86 |
+
{
|
| 87 |
+
__barrier_sync(0);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 91 |
+
{
|
| 92 |
+
return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 96 |
+
{
|
| 97 |
+
return vec3_to_linear<unsigned int>(threadIdx, blockDim);
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
_CG_STATIC_QUALIFIER dim3 group_index()
|
| 101 |
+
{
|
| 102 |
+
return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
_CG_STATIC_QUALIFIER dim3 thread_index()
|
| 106 |
+
{
|
| 107 |
+
return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 111 |
+
{
|
| 112 |
+
return dim3(blockDim.x, blockDim.y, blockDim.z);
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// Legacy aliases
|
| 116 |
+
_CG_STATIC_QUALIFIER unsigned int size()
|
| 117 |
+
{
|
| 118 |
+
return num_threads();
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
_CG_STATIC_QUALIFIER dim3 block_dim()
|
| 122 |
+
{
|
| 123 |
+
return dim_threads();
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
};
|
| 127 |
+
|
| 128 |
+
class _coalesced_group_data_access {
|
| 129 |
+
public:
|
| 130 |
+
// Retrieve mask of coalesced groups
|
| 131 |
+
template <typename TyGroup>
|
| 132 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
|
| 133 |
+
return group.get_mask();
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
// Retrieve mask of tiles
|
| 137 |
+
template <template <typename, typename> typename TyGroup, typename Sz, typename Parent>
|
| 138 |
+
_CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup<Sz, Parent> &group) {
|
| 139 |
+
return group.build_maks();
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
template <typename TyGroup>
|
| 143 |
+
_CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
|
| 144 |
+
return TyGroup(mask);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
template <typename TyGroup>
|
| 148 |
+
_CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
|
| 149 |
+
group._data.coalesced.metaGroupRank = mgRank;
|
| 150 |
+
group._data.coalesced.metaGroupSize = mgSize;
|
| 151 |
+
}
|
| 152 |
+
};
|
| 153 |
+
|
| 154 |
+
namespace tile {
|
| 155 |
+
template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
|
| 156 |
+
struct _tile_helpers{
|
| 157 |
+
_CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
|
| 158 |
+
_CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
|
| 159 |
+
_CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
|
| 160 |
+
_CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
|
| 161 |
+
};
|
| 162 |
+
|
| 163 |
+
template <unsigned int> struct tile_helpers;
|
| 164 |
+
template <> struct tile_helpers<32> : public _tile_helpers<1, 0xFFFFFFFF, 0x1F, 5> {};
|
| 165 |
+
template <> struct tile_helpers<16> : public _tile_helpers<2, 0x0000FFFF, 0x0F, 4> {};
|
| 166 |
+
template <> struct tile_helpers<8> : public _tile_helpers<4, 0x000000FF, 0x07, 3> {};
|
| 167 |
+
template <> struct tile_helpers<4> : public _tile_helpers<8, 0x0000000F, 0x03, 2> {};
|
| 168 |
+
template <> struct tile_helpers<2> : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
|
| 169 |
+
template <> struct tile_helpers<1> : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
|
| 170 |
+
|
| 171 |
+
#ifdef _CG_CPP11_FEATURES
|
| 172 |
+
namespace shfl {
|
| 173 |
+
/***********************************************************************************
|
| 174 |
+
* Recursively Sliced Shuffle
|
| 175 |
+
* Purpose:
|
| 176 |
+
* Slices an input type a number of times into integral types so that shuffles
|
| 177 |
+
* are well defined
|
| 178 |
+
* Expectations:
|
| 179 |
+
* This object *should not* be used from a reinterpret_cast pointer unless
|
| 180 |
+
* some alignment guarantees can be met. Use a memcpy to guarantee that loads
|
| 181 |
+
* from the integral types stored within are aligned and correct.
|
| 182 |
+
**********************************************************************************/
|
| 183 |
+
template <unsigned int count, bool intSized = (count <= sizeof(int))>
|
| 184 |
+
struct recursive_sliced_shuffle_helper;
|
| 185 |
+
|
| 186 |
+
template <unsigned int count>
|
| 187 |
+
struct recursive_sliced_shuffle_helper<count, true> {
|
| 188 |
+
int val;
|
| 189 |
+
|
| 190 |
+
template <typename TyFn>
|
| 191 |
+
_CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
|
| 192 |
+
val = shfl(val);
|
| 193 |
+
}
|
| 194 |
+
};
|
| 195 |
+
|
| 196 |
+
template <unsigned int count>
|
| 197 |
+
struct recursive_sliced_shuffle_helper<count, false> {
|
| 198 |
+
int val;
|
| 199 |
+
recursive_sliced_shuffle_helper<count - sizeof(int)> next;
|
| 200 |
+
|
| 201 |
+
template <typename TyFn>
|
| 202 |
+
_CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
|
| 203 |
+
val = shfl(val);
|
| 204 |
+
next.invoke_shuffle(shfl);
|
| 205 |
+
}
|
| 206 |
+
};
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
struct _memory_shuffle {
|
| 210 |
+
template <typename TyElem, typename TyShflFn>
|
| 211 |
+
_CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
|
| 212 |
+
static_assert(sizeof(TyElem) > 0, "in memory shuffle is not yet implemented");
|
| 213 |
+
return TyElem{};
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 217 |
+
_CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 218 |
+
auto shfl = [=](int val) -> int {
|
| 219 |
+
return 0;
|
| 220 |
+
};
|
| 221 |
+
|
| 222 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 226 |
+
_CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 227 |
+
auto shfl = [=](int val) -> int {
|
| 228 |
+
return 0;
|
| 229 |
+
};
|
| 230 |
+
|
| 231 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 235 |
+
_CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 236 |
+
auto shfl = [=](int val) -> int {
|
| 237 |
+
return 0;
|
| 238 |
+
};
|
| 239 |
+
|
| 240 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 244 |
+
_CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 245 |
+
auto shfl = [=](int val) -> int {
|
| 246 |
+
return 0;
|
| 247 |
+
};
|
| 248 |
+
|
| 249 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 250 |
+
}
|
| 251 |
+
};
|
| 252 |
+
|
| 253 |
+
/***********************************************************************************
|
| 254 |
+
* Intrinsic Device Function Shuffle
|
| 255 |
+
* Purpose:
|
| 256 |
+
* Uses a shuffle helper that has characteristics best suited for moving
|
| 257 |
+
* elements between threads
|
| 258 |
+
* Expectations:
|
| 259 |
+
* Object given will be forced into an l-value type so that it can be used
|
| 260 |
+
* with a helper structure that reinterprets the data into intrinsic compatible
|
| 261 |
+
* types
|
| 262 |
+
* Notes:
|
| 263 |
+
* !! TyRet is required so that objects are returned by value and not as
|
| 264 |
+
* dangling references depending on the value category of the passed object
|
| 265 |
+
**********************************************************************************/
|
| 266 |
+
struct _intrinsic_compat_shuffle {
|
| 267 |
+
template <unsigned int count>
|
| 268 |
+
using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
|
| 269 |
+
|
| 270 |
+
template <typename TyElem, typename TyShflFn>
|
| 271 |
+
_CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
|
| 272 |
+
static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
|
| 273 |
+
shfl_helper<sizeof(TyElem)> helper;
|
| 274 |
+
memcpy(&helper, &elem, sizeof(TyElem));
|
| 275 |
+
helper.invoke_shuffle(fn);
|
| 276 |
+
memcpy(&elem, &helper, sizeof(TyElem));
|
| 277 |
+
return elem;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 281 |
+
_CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 282 |
+
auto shfl = [=](int val) -> int {
|
| 283 |
+
return __shfl_sync(gMask, val, srcRank, threads);
|
| 284 |
+
};
|
| 285 |
+
|
| 286 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 290 |
+
_CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 291 |
+
auto shfl = [=](int val) -> int {
|
| 292 |
+
return __shfl_down_sync(gMask, val, delta, threads);
|
| 293 |
+
};
|
| 294 |
+
|
| 295 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 299 |
+
_CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 300 |
+
auto shfl = [=](int val) -> int {
|
| 301 |
+
return __shfl_up_sync(gMask, val, delta, threads);
|
| 302 |
+
};
|
| 303 |
+
|
| 304 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
template <typename TyElem, typename TyRet = remove_qual<TyElem>>
|
| 308 |
+
_CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 309 |
+
auto shfl = [=](int val) -> int {
|
| 310 |
+
return __shfl_xor_sync(gMask, val, lMask, threads);
|
| 311 |
+
};
|
| 312 |
+
|
| 313 |
+
return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
|
| 314 |
+
}
|
| 315 |
+
};
|
| 316 |
+
|
| 317 |
+
struct _native_shuffle {
|
| 318 |
+
template <typename TyElem>
|
| 319 |
+
_CG_STATIC_QUALIFIER TyElem shfl(
|
| 320 |
+
TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
|
| 321 |
+
return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
template <typename TyElem>
|
| 325 |
+
_CG_STATIC_QUALIFIER TyElem shfl_down(
|
| 326 |
+
TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 327 |
+
return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
template <typename TyElem>
|
| 331 |
+
_CG_STATIC_QUALIFIER TyElem shfl_up(
|
| 332 |
+
TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
|
| 333 |
+
return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
template <typename TyElem>
|
| 337 |
+
_CG_STATIC_QUALIFIER TyElem shfl_xor(
|
| 338 |
+
TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
|
| 339 |
+
return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
|
| 340 |
+
}
|
| 341 |
+
};
|
| 342 |
+
|
| 343 |
+
// Almost all arithmetic types are supported by native shuffle
|
| 344 |
+
// Vector types are the exception
|
| 345 |
+
template <typename TyElem>
|
| 346 |
+
using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
|
| 347 |
+
bool,
|
| 348 |
+
_CG_STL_NAMESPACE::is_integral<
|
| 349 |
+
remove_qual<TyElem>>::value ||
|
| 350 |
+
details::is_float_or_half<
|
| 351 |
+
remove_qual<TyElem>>::value
|
| 352 |
+
>;
|
| 353 |
+
|
| 354 |
+
constexpr unsigned long long _MemoryShuffleCutoff = 32;
|
| 355 |
+
|
| 356 |
+
template <typename TyElem,
|
| 357 |
+
bool IsNative = use_native_shuffle<TyElem>::value,
|
| 358 |
+
bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
|
| 359 |
+
struct shuffle_dispatch;
|
| 360 |
+
|
| 361 |
+
template <typename TyElem>
|
| 362 |
+
struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};
|
| 363 |
+
|
| 364 |
+
template <typename TyElem>
|
| 365 |
+
struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};
|
| 366 |
+
|
| 367 |
+
template <typename TyElem>
|
| 368 |
+
struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
|
| 369 |
+
|
| 370 |
+
#endif //_CG_CPP11_FEATURES
|
| 371 |
+
};
|
| 372 |
+
|
| 373 |
+
namespace multi_grid {
|
| 374 |
+
struct multi_grid_functions;
|
| 375 |
+
};
|
| 376 |
+
|
| 377 |
+
namespace grid {
|
| 378 |
+
_CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
|
| 379 |
+
unsigned int expected = gridDim.x * gridDim.y * gridDim.z;
|
| 380 |
+
|
| 381 |
+
details::sync_grids(expected, bar);
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
_CG_STATIC_QUALIFIER unsigned long long num_blocks()
|
| 385 |
+
{
|
| 386 |
+
// grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
|
| 387 |
+
// grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
|
| 388 |
+
return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
_CG_STATIC_QUALIFIER unsigned long long num_threads()
|
| 392 |
+
{
|
| 393 |
+
return num_blocks() * cta::num_threads();
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
_CG_STATIC_QUALIFIER unsigned long long block_rank()
|
| 397 |
+
{
|
| 398 |
+
return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
_CG_STATIC_QUALIFIER unsigned long long thread_rank()
|
| 402 |
+
{
|
| 403 |
+
return block_rank() * cta::num_threads() + cta::thread_rank();
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 407 |
+
{
|
| 408 |
+
return dim3(gridDim.x, gridDim.y, gridDim.z);
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 412 |
+
{
|
| 413 |
+
return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
|
| 414 |
+
}
|
| 415 |
+
|
| 416 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 417 |
+
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
|
| 418 |
+
return __clusterGridDimInClusters();
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
|
| 422 |
+
const dim3 dimClusters = dim_clusters();
|
| 423 |
+
return dimClusters.x * dimClusters.y * dimClusters.z;
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
_CG_STATIC_QUALIFIER dim3 cluster_index() {
|
| 427 |
+
return __clusterIdx();
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
|
| 431 |
+
return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
|
| 432 |
+
}
|
| 433 |
+
#endif
|
| 434 |
+
|
| 435 |
+
// Legacy aliases
|
| 436 |
+
_CG_STATIC_QUALIFIER unsigned long long size()
|
| 437 |
+
{
|
| 438 |
+
return num_threads();
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
_CG_STATIC_QUALIFIER dim3 grid_dim()
|
| 442 |
+
{
|
| 443 |
+
return dim_blocks();
|
| 444 |
+
}
|
| 445 |
+
};
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
#if defined(_CG_HAS_MULTI_GRID_GROUP)
|
| 449 |
+
|
| 450 |
+
namespace multi_grid {
|
| 451 |
+
_CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
|
| 452 |
+
{
|
| 453 |
+
return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
_CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
|
| 457 |
+
{
|
| 458 |
+
cudaError_t err = cudaCGSynchronize(handle, 0);
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
_CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
|
| 462 |
+
{
|
| 463 |
+
unsigned int numThreads = 0;
|
| 464 |
+
cudaCGGetSize(&numThreads, NULL, handle);
|
| 465 |
+
return numThreads;
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
|
| 469 |
+
{
|
| 470 |
+
unsigned int threadRank = 0;
|
| 471 |
+
cudaCGGetRank(&threadRank, NULL, handle);
|
| 472 |
+
return threadRank;
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
_CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
|
| 476 |
+
{
|
| 477 |
+
unsigned int gridRank = 0;
|
| 478 |
+
cudaCGGetRank(NULL, &gridRank, handle);
|
| 479 |
+
return gridRank;
|
| 480 |
+
}
|
| 481 |
+
|
| 482 |
+
_CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
|
| 483 |
+
{
|
| 484 |
+
unsigned int numGrids = 0;
|
| 485 |
+
cudaCGGetSize(NULL, &numGrids, handle);
|
| 486 |
+
return numGrids;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
# ifdef _CG_CPP11_FEATURES
|
| 490 |
+
struct multi_grid_functions {
|
| 491 |
+
decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
|
| 492 |
+
decltype(multi_grid::sync) *sync;
|
| 493 |
+
decltype(multi_grid::size) *size;
|
| 494 |
+
decltype(multi_grid::thread_rank) *thread_rank;
|
| 495 |
+
decltype(multi_grid::grid_rank) *grid_rank;
|
| 496 |
+
decltype(multi_grid::num_grids) *num_grids;
|
| 497 |
+
};
|
| 498 |
+
|
| 499 |
+
template <typename = void>
|
| 500 |
+
_CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
|
| 501 |
+
__constant__ static const multi_grid_functions mgf {
|
| 502 |
+
&multi_grid::get_intrinsic_handle,
|
| 503 |
+
&multi_grid::sync,
|
| 504 |
+
&multi_grid::size,
|
| 505 |
+
&multi_grid::thread_rank,
|
| 506 |
+
&multi_grid::grid_rank,
|
| 507 |
+
&multi_grid::num_grids
|
| 508 |
+
};
|
| 509 |
+
|
| 510 |
+
return &mgf;
|
| 511 |
+
}
|
| 512 |
+
# endif
|
| 513 |
+
};
|
| 514 |
+
#endif
|
| 515 |
+
|
| 516 |
+
#if defined(_CG_HAS_CLUSTER_GROUP)
|
| 517 |
+
namespace cluster {
|
| 518 |
+
|
| 519 |
+
_CG_STATIC_QUALIFIER bool isReal()
|
| 520 |
+
{
|
| 521 |
+
return __clusterDimIsSpecified();
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
_CG_STATIC_QUALIFIER void barrier_arrive()
|
| 525 |
+
{
|
| 526 |
+
__cluster_barrier_arrive();
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
_CG_STATIC_QUALIFIER void barrier_wait()
|
| 530 |
+
{
|
| 531 |
+
__cluster_barrier_wait();
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
_CG_STATIC_QUALIFIER void sync()
|
| 535 |
+
{
|
| 536 |
+
barrier_arrive();
|
| 537 |
+
barrier_wait();
|
| 538 |
+
}
|
| 539 |
+
|
| 540 |
+
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
|
| 541 |
+
{
|
| 542 |
+
return __cluster_query_shared_rank(addr);
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
template <typename T>
|
| 546 |
+
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
|
| 547 |
+
{
|
| 548 |
+
return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
_CG_STATIC_QUALIFIER dim3 block_index()
|
| 552 |
+
{
|
| 553 |
+
return __clusterRelativeBlockIdx();
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
+
_CG_STATIC_QUALIFIER unsigned int block_rank()
|
| 557 |
+
{
|
| 558 |
+
return __clusterRelativeBlockRank();
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
_CG_STATIC_QUALIFIER unsigned int thread_rank()
|
| 562 |
+
{
|
| 563 |
+
return block_rank() * cta::num_threads() + cta::thread_rank();
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
_CG_STATIC_QUALIFIER dim3 dim_blocks()
|
| 567 |
+
{
|
| 568 |
+
return __clusterDim();
|
| 569 |
+
}
|
| 570 |
+
|
| 571 |
+
_CG_STATIC_QUALIFIER unsigned int num_blocks()
|
| 572 |
+
{
|
| 573 |
+
return __clusterSizeInBlocks();
|
| 574 |
+
}
|
| 575 |
+
|
| 576 |
+
_CG_STATIC_QUALIFIER dim3 dim_threads()
|
| 577 |
+
{
|
| 578 |
+
const dim3 dimBlocks = dim_blocks();
|
| 579 |
+
const unsigned int x = dimBlocks.x * blockDim.x;
|
| 580 |
+
const unsigned int y = dimBlocks.y * blockDim.y;
|
| 581 |
+
const unsigned int z = dimBlocks.z * blockDim.z;
|
| 582 |
+
return dim3(x, y, z);
|
| 583 |
+
}
|
| 584 |
+
|
| 585 |
+
_CG_STATIC_QUALIFIER unsigned int num_threads()
|
| 586 |
+
{
|
| 587 |
+
return num_blocks() * cta::num_threads();
|
| 588 |
+
}
|
| 589 |
+
|
| 590 |
+
};
|
| 591 |
+
#endif
|
| 592 |
+
|
| 593 |
+
_CG_STATIC_QUALIFIER unsigned int laneid()
|
| 594 |
+
{
|
| 595 |
+
unsigned int laneid;
|
| 596 |
+
asm ("mov.u32 %0, %%laneid;" : "=r"(laneid));
|
| 597 |
+
return laneid;
|
| 598 |
+
}
|
| 599 |
+
|
| 600 |
+
_CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
|
| 601 |
+
{
|
| 602 |
+
unsigned int lanemask32_eq;
|
| 603 |
+
asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
|
| 604 |
+
return (lanemask32_eq);
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
_CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
|
| 608 |
+
{
|
| 609 |
+
unsigned int lanemask32_lt;
|
| 610 |
+
asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
|
| 611 |
+
return (lanemask32_lt);
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
_CG_STATIC_QUALIFIER void abort()
|
| 615 |
+
{
|
| 616 |
+
_CG_ABORT();
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
template <typename Ty>
|
| 620 |
+
_CG_QUALIFIER void assert_if_not_arithmetic() {
|
| 621 |
+
#ifdef _CG_CPP11_FEATURES
|
| 622 |
+
static_assert(
|
| 623 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value ||
|
| 624 |
+
details::is_float_or_half<Ty>::value,
|
| 625 |
+
"Error: Ty is neither integer or float"
|
| 626 |
+
);
|
| 627 |
+
#endif
|
| 628 |
+
}
|
| 629 |
+
|
| 630 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 631 |
+
template <unsigned int numWarps>
|
| 632 |
+
struct copy_channel {
|
| 633 |
+
char* channel_ptr;
|
| 634 |
+
barrier_t* sync_location;
|
| 635 |
+
size_t channel_size;
|
| 636 |
+
|
| 637 |
+
// One warp sending to all other warps, it has to wait for all other warps.
|
| 638 |
+
struct send_many_to_many {
|
| 639 |
+
_CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_all_other_warps;
|
| 640 |
+
_CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
|
| 641 |
+
__syncwarp(0xFFFFFFFF);
|
| 642 |
+
details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
|
| 643 |
+
}
|
| 644 |
+
};
|
| 645 |
+
|
| 646 |
+
// One warp is receiving and all other warps are sending to that warp, they have to wait for that one warp.
|
| 647 |
+
struct send_many_to_one {
|
| 648 |
+
_CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_specific_warp;
|
| 649 |
+
_CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
|
| 650 |
+
// Wait for all warps to finish and let the last warp release all threads.
|
| 651 |
+
if (details::sync_warps_last_releases(sync_location, cta::thread_rank(), numWarps)) {
|
| 652 |
+
details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
|
| 653 |
+
}
|
| 654 |
+
}
|
| 655 |
+
};
|
| 656 |
+
|
| 657 |
+
template <unsigned int ThreadCnt, size_t ValSize, typename SendDetails>
|
| 658 |
+
_CG_QUALIFIER void _send_value_internal(char* val_ptr, unsigned int thread_idx, unsigned int warp_id) {
|
| 659 |
+
size_t thread_offset = thread_idx * sizeof(int);
|
| 660 |
+
|
| 661 |
+
for (size_t i = 0; i < ValSize; i += channel_size) {
|
| 662 |
+
size_t bytes_left = ValSize - i;
|
| 663 |
+
size_t copy_chunk = min(bytes_left, channel_size);
|
| 664 |
+
|
| 665 |
+
details::sync_warps_wait_for_warps<SendDetails::wait_kind>(warp_id, sync_location, cta::thread_rank(), numWarps);
|
| 666 |
+
#pragma unroll 1
|
| 667 |
+
for (size_t j = thread_offset; j < copy_chunk ; j += sizeof(int) * ThreadCnt) {
|
| 668 |
+
size_t my_bytes_left = copy_chunk - j;
|
| 669 |
+
memcpy(channel_ptr + j, val_ptr + i + j, min(my_bytes_left, sizeof(int)));
|
| 670 |
+
}
|
| 671 |
+
SendDetails::post_iter_release(thread_idx, sync_location);
|
| 672 |
+
}
|
| 673 |
+
}
|
| 674 |
+
|
| 675 |
+
|
| 676 |
+
template <typename TyVal, unsigned int ThreadCnt, typename SendDetails>
|
| 677 |
+
_CG_QUALIFIER void send_value(TyVal& val, unsigned int thread_idx, unsigned int warp_id) {
|
| 678 |
+
_send_value_internal<ThreadCnt, sizeof(TyVal), SendDetails>(reinterpret_cast<char*>(&val), thread_idx, warp_id);
|
| 679 |
+
}
|
| 680 |
+
|
| 681 |
+
template <size_t ValSize>
|
| 682 |
+
_CG_QUALIFIER void _receive_value_internal(char* val_ptr, bool warp_master, bool active) {
|
| 683 |
+
for (size_t i = 0; i < ValSize; i += channel_size) {
|
| 684 |
+
size_t bytes_left = ValSize - i;
|
| 685 |
+
details::sync_warps_wait_for_release(sync_location, warp_master, cta::thread_rank(), numWarps);
|
| 686 |
+
if (active) {
|
| 687 |
+
memcpy(val_ptr + i, channel_ptr, min(bytes_left, channel_size));
|
| 688 |
+
}
|
| 689 |
+
}
|
| 690 |
+
}
|
| 691 |
+
|
| 692 |
+
template <typename TyVal>
|
| 693 |
+
_CG_QUALIFIER void receive_value(TyVal& val, bool warp_master, bool active = true) {
|
| 694 |
+
_receive_value_internal<sizeof(TyVal)>(reinterpret_cast<char*>(&val), warp_master, active);
|
| 695 |
+
}
|
| 696 |
+
};
|
| 697 |
+
|
| 698 |
+
_CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
|
| 699 |
+
return x == 1 ? 0 : 1 + log2(x / 2);
|
| 700 |
+
}
|
| 701 |
+
#endif //_CG_CPP11_FEATURES
|
| 702 |
+
|
| 703 |
+
}; // !Namespace internal
|
| 704 |
+
|
| 705 |
+
_CG_END_NAMESPACE
|
| 706 |
+
|
| 707 |
+
#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
#ifndef _CG_INFO_H_
|
| 52 |
+
#define _CG_INFO_H_
|
| 53 |
+
/*
|
| 54 |
+
** Define: _CG_VERSION
|
| 55 |
+
*/
|
| 56 |
+
#define _CG_VERSION 1000
|
| 57 |
+
|
| 58 |
+
/*
|
| 59 |
+
** Define: _CG_ABI_VERSION
|
| 60 |
+
*/
|
| 61 |
+
#ifndef _CG_ABI_VERSION
|
| 62 |
+
# define _CG_ABI_VERSION 1
|
| 63 |
+
#endif
|
| 64 |
+
|
| 65 |
+
/*
|
| 66 |
+
** Define: _CG_ABI_EXPERIMENTAL
|
| 67 |
+
** Desc: If enabled, sets all features enabled (ABI-breaking or experimental)
|
| 68 |
+
*/
|
| 69 |
+
#if defined(_CG_ABI_EXPERIMENTAL)
|
| 70 |
+
#endif
|
| 71 |
+
|
| 72 |
+
#define _CG_CONCAT_INNER(x, y) x ## y
|
| 73 |
+
#define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
|
| 74 |
+
#define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
|
| 75 |
+
|
| 76 |
+
#define _CG_BEGIN_NAMESPACE \
|
| 77 |
+
namespace cooperative_groups { namespace _CG_NAMESPACE {
|
| 78 |
+
#define _CG_END_NAMESPACE \
|
| 79 |
+
}; using namespace _CG_NAMESPACE; };
|
| 80 |
+
|
| 81 |
+
#if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
|
| 82 |
+
# define _CG_CPP11_FEATURES
|
| 83 |
+
#endif
|
| 84 |
+
|
| 85 |
+
#if !defined(_CG_QUALIFIER)
|
| 86 |
+
# define _CG_QUALIFIER __forceinline__ __device__
|
| 87 |
+
#endif
|
| 88 |
+
#if !defined(_CG_STATIC_QUALIFIER)
|
| 89 |
+
# define _CG_STATIC_QUALIFIER static __forceinline__ __device__
|
| 90 |
+
#endif
|
| 91 |
+
#if !defined(_CG_CONSTEXPR_QUALIFIER)
|
| 92 |
+
# if defined(_CG_CPP11_FEATURES)
|
| 93 |
+
# define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
|
| 94 |
+
# else
|
| 95 |
+
# define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
|
| 96 |
+
# endif
|
| 97 |
+
#endif
|
| 98 |
+
#if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
|
| 99 |
+
# if defined(_CG_CPP11_FEATURES)
|
| 100 |
+
# define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
|
| 101 |
+
# else
|
| 102 |
+
# define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
|
| 103 |
+
# endif
|
| 104 |
+
#endif
|
| 105 |
+
|
| 106 |
+
#if defined(_MSC_VER)
|
| 107 |
+
# define _CG_DEPRECATED __declspec(deprecated)
|
| 108 |
+
#else
|
| 109 |
+
# define _CG_DEPRECATED __attribute__((deprecated))
|
| 110 |
+
#endif
|
| 111 |
+
|
| 112 |
+
#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
|
| 113 |
+
# define _CG_HAS_GRID_GROUP
|
| 114 |
+
#endif
|
| 115 |
+
#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
|
| 116 |
+
# define _CG_HAS_MULTI_GRID_GROUP
|
| 117 |
+
#endif
|
| 118 |
+
#if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
|
| 119 |
+
# define _CG_HAS_MATCH_COLLECTIVE
|
| 120 |
+
#endif
|
| 121 |
+
#if ((__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)) && (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE))
|
| 122 |
+
# define _CG_HAS_CLUSTER_GROUP
|
| 123 |
+
#endif
|
| 124 |
+
// Has __half and __half2
|
| 125 |
+
// Only usable if you include the cuda_fp16.h extension, and
|
| 126 |
+
// _before_ including cooperative_groups.h
|
| 127 |
+
#ifdef __CUDA_FP16_TYPES_EXIST__
|
| 128 |
+
# define _CG_HAS_FP16_COLLECTIVE
|
| 129 |
+
#endif
|
| 130 |
+
|
| 131 |
+
#if (__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__) && (defined(__NVCC__) || defined(__CUDACC_RTC__))
|
| 132 |
+
# define _CG_HAS_OP_REDUX
|
| 133 |
+
#endif
|
| 134 |
+
|
| 135 |
+
// Include libcu++ where supported.
|
| 136 |
+
#if defined(_CG_CPP11_FEATURES) && !defined(__QNX__) && !defined(__ibmxl__) && \
|
| 137 |
+
(defined(__NVCC__) || defined(__CUDACC_RTC__)) && \
|
| 138 |
+
(defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__)|| defined(_M_X64) || defined(_M_ARM64)) && \
|
| 139 |
+
(defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__))
|
| 140 |
+
# define _CG_USE_CUDA_STL
|
| 141 |
+
#else
|
| 142 |
+
# define _CG_USE_OWN_TRAITS
|
| 143 |
+
#endif
|
| 144 |
+
|
| 145 |
+
#if defined(_CG_USE_CUDA_STL) && (!defined(__CUDA_ARCH__) || \
|
| 146 |
+
((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
|
| 147 |
+
# define _CG_HAS_STL_ATOMICS
|
| 148 |
+
#endif
|
| 149 |
+
|
| 150 |
+
#ifdef _CG_CPP11_FEATURES
|
| 151 |
+
// Use cuda::std:: for type_traits
|
| 152 |
+
# if defined(_CG_USE_CUDA_STL)
|
| 153 |
+
# define _CG_STL_NAMESPACE cuda::std
|
| 154 |
+
# include <cuda/std/type_traits>
|
| 155 |
+
// Use CG's implementation of type traits
|
| 156 |
+
# else
|
| 157 |
+
# define _CG_STL_NAMESPACE cooperative_groups::details::templates
|
| 158 |
+
# endif
|
| 159 |
+
#endif
|
| 160 |
+
|
| 161 |
+
#ifdef _CG_CPP11_FEATURES
|
| 162 |
+
# define _CG_STATIC_CONST_DECL static constexpr
|
| 163 |
+
# define _CG_CONST_DECL constexpr
|
| 164 |
+
#else
|
| 165 |
+
# define _CG_STATIC_CONST_DECL static const
|
| 166 |
+
# define _CG_CONST_DECL const
|
| 167 |
+
#endif
|
| 168 |
+
|
| 169 |
+
#if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
|
| 170 |
+
# define _CG_ASM_PTR_CONSTRAINT "r"
|
| 171 |
+
#else
|
| 172 |
+
# define _CG_ASM_PTR_CONSTRAINT "l"
|
| 173 |
+
#endif
|
| 174 |
+
|
| 175 |
+
/*
|
| 176 |
+
** Define: CG_DEBUG
|
| 177 |
+
** What: Enables various runtime safety checks
|
| 178 |
+
*/
|
| 179 |
+
#if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
|
| 180 |
+
# define _CG_DEBUG
|
| 181 |
+
#endif
|
| 182 |
+
|
| 183 |
+
#if defined(_CG_DEBUG)
|
| 184 |
+
# include <assert.h>
|
| 185 |
+
# define _CG_ASSERT(x) assert((x));
|
| 186 |
+
# define _CG_ABORT() assert(0);
|
| 187 |
+
#else
|
| 188 |
+
# define _CG_ASSERT(x)
|
| 189 |
+
# define _CG_ABORT() __trap();
|
| 190 |
+
#endif
|
| 191 |
+
|
| 192 |
+
#if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
|
| 193 |
+
_CG_BEGIN_NAMESPACE
|
| 194 |
+
|
| 195 |
+
namespace details {
|
| 196 |
+
namespace templates {
|
| 197 |
+
|
| 198 |
+
/**
|
| 199 |
+
* Integral constants
|
| 200 |
+
**/
|
| 201 |
+
template <typename Ty, Ty Val>
|
| 202 |
+
struct integral_constant {
|
| 203 |
+
static constexpr Ty value = Val;
|
| 204 |
+
typedef Ty type;
|
| 205 |
+
|
| 206 |
+
_CG_QUALIFIER constexpr operator type() const noexcept { return value; }
|
| 207 |
+
_CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
|
| 208 |
+
};
|
| 209 |
+
|
| 210 |
+
typedef integral_constant<bool, true> true_type;
|
| 211 |
+
typedef integral_constant<bool, false> false_type;
|
| 212 |
+
|
| 213 |
+
/**
|
| 214 |
+
* CV Qualifiers
|
| 215 |
+
**/
|
| 216 |
+
template <class Ty> struct is_lvalue_reference : public details::templates::false_type {};
|
| 217 |
+
template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {};
|
| 218 |
+
|
| 219 |
+
template <class Ty> struct remove_reference {typedef Ty type;};
|
| 220 |
+
template <class Ty> struct remove_reference<Ty&> {typedef Ty type;};
|
| 221 |
+
template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
|
| 222 |
+
|
| 223 |
+
template <class Ty>
|
| 224 |
+
using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
|
| 225 |
+
|
| 226 |
+
template <class Ty> struct remove_const {typedef Ty type;};
|
| 227 |
+
template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
|
| 228 |
+
|
| 229 |
+
template <class Ty> struct remove_volatile {typedef Ty type;};
|
| 230 |
+
template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
|
| 231 |
+
|
| 232 |
+
template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
|
| 233 |
+
|
| 234 |
+
template <class Ty>
|
| 235 |
+
using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
|
| 236 |
+
|
| 237 |
+
template <class Ty>
|
| 238 |
+
_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept {
|
| 239 |
+
return static_cast<Ty&&>(t);
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
template <class Ty>
|
| 243 |
+
_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept {
|
| 244 |
+
static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
|
| 245 |
+
return static_cast<Ty&&>(t);
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
/**
|
| 249 |
+
* is_integral
|
| 250 |
+
**/
|
| 251 |
+
template <class Ty> struct _is_integral : public details::templates::false_type {};
|
| 252 |
+
template <> struct _is_integral<bool> : public details::templates::true_type {};
|
| 253 |
+
template <> struct _is_integral<char> : public details::templates::true_type {};
|
| 254 |
+
template <> struct _is_integral<unsigned char> : public details::templates::true_type {};
|
| 255 |
+
template <> struct _is_integral<short> : public details::templates::true_type {};
|
| 256 |
+
template <> struct _is_integral<unsigned short> : public details::templates::true_type {};
|
| 257 |
+
template <> struct _is_integral<int> : public details::templates::true_type {};
|
| 258 |
+
template <> struct _is_integral<unsigned int> : public details::templates::true_type {};
|
| 259 |
+
template <> struct _is_integral<long> : public details::templates::true_type {};
|
| 260 |
+
template <> struct _is_integral<long long> : public details::templates::true_type {};
|
| 261 |
+
template <> struct _is_integral<unsigned long> : public details::templates::true_type {};
|
| 262 |
+
template <> struct _is_integral<unsigned long long> : public details::templates::true_type {};
|
| 263 |
+
//Vector type support?
|
| 264 |
+
|
| 265 |
+
template <typename Ty>
|
| 266 |
+
struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
|
| 267 |
+
|
| 268 |
+
/**
|
| 269 |
+
* is_floating_point
|
| 270 |
+
**/
|
| 271 |
+
template <class Ty> struct _is_floating_point : public details::templates::false_type {};
|
| 272 |
+
template <> struct _is_floating_point<float> : public details::templates::true_type {};
|
| 273 |
+
template <> struct _is_floating_point<double> : public details::templates::true_type {};
|
| 274 |
+
template <> struct _is_floating_point<long double> : public details::templates::true_type {};
|
| 275 |
+
# ifdef __CUDA_FP16_TYPES_EXIST__
|
| 276 |
+
template <> struct _is_floating_point<__half> : public details::templates::true_type {};
|
| 277 |
+
template <> struct _is_floating_point<__half2> : public details::templates::true_type {};
|
| 278 |
+
# endif
|
| 279 |
+
//Vector type support?
|
| 280 |
+
|
| 281 |
+
template <typename Ty>
|
| 282 |
+
struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
|
| 283 |
+
|
| 284 |
+
template <class T>
|
| 285 |
+
struct is_arithmetic : details::templates::integral_constant<
|
| 286 |
+
bool,
|
| 287 |
+
details::templates::is_integral<T>::value ||
|
| 288 |
+
details::templates::is_floating_point<T>::value> {};
|
| 289 |
+
|
| 290 |
+
template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
|
| 291 |
+
struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {};
|
| 292 |
+
|
| 293 |
+
template <typename Ty>
|
| 294 |
+
struct _is_unsigned<Ty,false> : details::templates::false_type {};
|
| 295 |
+
|
| 296 |
+
template <typename Ty>
|
| 297 |
+
struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
|
| 298 |
+
|
| 299 |
+
/**
|
| 300 |
+
* programmatic type traits
|
| 301 |
+
**/
|
| 302 |
+
template<bool B, class Ty = void>
|
| 303 |
+
struct enable_if {};
|
| 304 |
+
|
| 305 |
+
template<class Ty>
|
| 306 |
+
struct enable_if<true, Ty> { typedef Ty type; };
|
| 307 |
+
|
| 308 |
+
template<bool Cond, typename Ty = void>
|
| 309 |
+
using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
|
| 310 |
+
|
| 311 |
+
template<class Ty1, class Ty2>
|
| 312 |
+
struct is_same : details::templates::false_type {};
|
| 313 |
+
|
| 314 |
+
template<class Ty>
|
| 315 |
+
struct is_same<Ty, Ty> : details::templates::true_type {};
|
| 316 |
+
|
| 317 |
+
} // templates
|
| 318 |
+
} // details
|
| 319 |
+
_CG_END_NAMESPACE
|
| 320 |
+
|
| 321 |
+
#endif // _CG_CPP11_FEATURES
|
| 322 |
+
|
| 323 |
+
#endif // _CG_INFO_H_
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef _CG_PARTITIONING_H
|
| 51 |
+
#define _CG_PARTITIONING_H
|
| 52 |
+
|
| 53 |
+
#include "info.h"
|
| 54 |
+
#include "helpers.h"
|
| 55 |
+
|
| 56 |
+
_CG_BEGIN_NAMESPACE
|
| 57 |
+
|
| 58 |
+
namespace details {
|
| 59 |
+
|
| 60 |
+
template <typename TyGroup>
|
| 61 |
+
_CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
|
| 62 |
+
const unsigned int fullMask = ~0u;
|
| 63 |
+
|
| 64 |
+
unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
|
| 65 |
+
unsigned int predMask = pred ? 0 : fullMask;
|
| 66 |
+
unsigned int setMask = __ballot_sync(thisMask, pred);
|
| 67 |
+
|
| 68 |
+
if (setMask == thisMask || setMask == 0) {
|
| 69 |
+
coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
|
| 70 |
+
_coalesced_group_data_access::modify_meta_group(subTile, 0, 1);
|
| 71 |
+
return subTile;
|
| 72 |
+
}
|
| 73 |
+
else {
|
| 74 |
+
unsigned int subMask = thisMask & (setMask ^ predMask);
|
| 75 |
+
coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
|
| 76 |
+
_coalesced_group_data_access::modify_meta_group(subTile, pred, 2);
|
| 77 |
+
return subTile;
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
#ifdef _CG_HAS_MATCH_COLLECTIVE
|
| 82 |
+
template <typename TyGroup, typename TyPredicate>
|
| 83 |
+
_CG_STATIC_QUALIFIER coalesced_group _labeled_partition(const TyGroup &tile, TyPredicate pred) {
|
| 84 |
+
unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
|
| 85 |
+
unsigned int thisBias = __ffs(thisMask) - 1; // Subtract 1 to index properly from [1-32]
|
| 86 |
+
unsigned int subMask = __match_any_sync(thisMask, pred);
|
| 87 |
+
|
| 88 |
+
coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
|
| 89 |
+
|
| 90 |
+
int leaderLaneId = subTile.shfl(details::laneid(), 0);
|
| 91 |
+
|
| 92 |
+
bool isLeader = !subTile.thread_rank();
|
| 93 |
+
unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
|
| 94 |
+
unsigned int tileRank = __fns(leaderMask, leaderLaneId, 0) - thisBias;
|
| 95 |
+
|
| 96 |
+
_coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
|
| 97 |
+
|
| 98 |
+
return subTile;
|
| 99 |
+
}
|
| 100 |
+
#endif
|
| 101 |
+
}; // namespace details
|
| 102 |
+
|
| 103 |
+
_CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
|
| 104 |
+
return details::_binary_partition(tile, pred);
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
template <unsigned int Size, typename ParentT>
|
| 108 |
+
_CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
|
| 109 |
+
#ifdef _CG_CPP11_FEATURES
|
| 110 |
+
static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
|
| 111 |
+
#endif
|
| 112 |
+
return details::_binary_partition(tile, pred);
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
|
| 117 |
+
template <typename TyPredicate>
|
| 118 |
+
_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
|
| 119 |
+
static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
|
| 120 |
+
return details::_labeled_partition(tile, pred);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
template <typename TyPredicate, unsigned int Size, typename ParentT>
|
| 124 |
+
_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
|
| 125 |
+
static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
|
| 126 |
+
static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
|
| 127 |
+
return details::_labeled_partition(tile, pred);
|
| 128 |
+
}
|
| 129 |
+
#endif
|
| 130 |
+
|
| 131 |
+
_CG_END_NAMESPACE
|
| 132 |
+
|
| 133 |
+
#endif // _CG_PARTITIONING_H
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_REDUCE_H_
|
| 50 |
+
#define _CG_REDUCE_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "coalesced_reduce.h"
|
| 55 |
+
#include "functional.h"
|
| 56 |
+
#include "cooperative_groups.h"
|
| 57 |
+
|
| 58 |
+
_CG_BEGIN_NAMESPACE
|
| 59 |
+
|
| 60 |
+
namespace details {
|
| 61 |
+
|
| 62 |
+
template <class Ty>
|
| 63 |
+
using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
|
| 64 |
+
bool,
|
| 65 |
+
_CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
|
| 66 |
+
|
| 67 |
+
template <class Ty>
|
| 68 |
+
using redux_is_add_supported = _redux_is_add_supported<Ty>;
|
| 69 |
+
|
| 70 |
+
// A specialization for 64 bit logical operations is possible
|
| 71 |
+
// but for now only accelerate 32 bit bitwise ops
|
| 72 |
+
template <class Ty>
|
| 73 |
+
using redux_is_logical_supported = redux_is_add_supported<Ty>;
|
| 74 |
+
|
| 75 |
+
// Base operator support case
|
| 76 |
+
template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 77 |
+
#ifdef _CG_HAS_OP_REDUX
|
| 78 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 79 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 80 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
|
| 81 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 82 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 83 |
+
template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
|
| 84 |
+
#endif
|
| 85 |
+
|
| 86 |
+
template <class Ty, template <class> class TyOp>
|
| 87 |
+
using redux_op_supported = _redux_op_supported<
|
| 88 |
+
typename details::remove_qual<TyOp<Ty>>,
|
| 89 |
+
Ty>;
|
| 90 |
+
|
| 91 |
+
// Groups smaller than 16 actually have worse performance characteristics when used with redux
|
| 92 |
+
// tiles of size 16 and 32 perform the same or better and have better code generation profiles
|
| 93 |
+
template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
|
| 94 |
+
|
| 95 |
+
template <unsigned int Sz, typename TyPar>
|
| 96 |
+
struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
|
| 97 |
+
bool,
|
| 98 |
+
(Sz >= 16)> {};
|
| 99 |
+
template <unsigned int Sz, typename TyPar>
|
| 100 |
+
struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
|
| 101 |
+
bool,
|
| 102 |
+
(Sz >= 16)> {};
|
| 103 |
+
template <>
|
| 104 |
+
struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 105 |
+
|
| 106 |
+
template <typename TyGroup>
|
| 107 |
+
using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
|
| 108 |
+
|
| 109 |
+
template <template <class> class TyOp>
|
| 110 |
+
_CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
|
| 111 |
+
template <template <class> class TyOp>
|
| 112 |
+
_CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
|
| 113 |
+
|
| 114 |
+
#ifdef _CG_HAS_OP_REDUX
|
| 115 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
|
| 116 |
+
return __reduce_add_sync(mask, val);
|
| 117 |
+
}
|
| 118 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
|
| 119 |
+
return __reduce_min_sync(mask, val);
|
| 120 |
+
}
|
| 121 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
|
| 122 |
+
return __reduce_max_sync(mask, val);
|
| 123 |
+
}
|
| 124 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
|
| 125 |
+
return __reduce_and_sync(mask, val);
|
| 126 |
+
}
|
| 127 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
|
| 128 |
+
return __reduce_xor_sync(mask, val);
|
| 129 |
+
}
|
| 130 |
+
template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
|
| 131 |
+
return __reduce_or_sync(mask, val);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
|
| 135 |
+
return __reduce_add_sync(mask, val);
|
| 136 |
+
}
|
| 137 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
|
| 138 |
+
return __reduce_min_sync(mask, val);
|
| 139 |
+
}
|
| 140 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
|
| 141 |
+
return __reduce_max_sync(mask, val);
|
| 142 |
+
}
|
| 143 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
|
| 144 |
+
return __reduce_and_sync(mask, val);
|
| 145 |
+
}
|
| 146 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
|
| 147 |
+
return __reduce_xor_sync(mask, val);
|
| 148 |
+
}
|
| 149 |
+
template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
|
| 150 |
+
return __reduce_or_sync(mask, val);
|
| 151 |
+
}
|
| 152 |
+
#endif
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
|
| 156 |
+
struct _accelerated_op;
|
| 157 |
+
|
| 158 |
+
// Signed type redux intrinsic dispatch
|
| 159 |
+
template <typename TyVal>
|
| 160 |
+
struct _accelerated_op<TyVal, false> {
|
| 161 |
+
template <template <class> class TyOp>
|
| 162 |
+
_CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
|
| 163 |
+
return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
|
| 164 |
+
}
|
| 165 |
+
};
|
| 166 |
+
|
| 167 |
+
// Unsigned type redux intrinsic dispatch
|
| 168 |
+
template <typename TyVal>
|
| 169 |
+
struct _accelerated_op<TyVal, true> {
|
| 170 |
+
template <template <class> class TyOp>
|
| 171 |
+
_CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
|
| 172 |
+
return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
|
| 173 |
+
}
|
| 174 |
+
};
|
| 175 |
+
|
| 176 |
+
template <typename TyVal>
|
| 177 |
+
using accelerated_op = _accelerated_op<TyVal>;
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
template <typename TyVal, typename TyFnInput, typename TyGroup>
|
| 181 |
+
class _redux_dispatch {
|
| 182 |
+
template <class Ty, template <class> class TyOp>
|
| 183 |
+
using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
|
| 184 |
+
redux_op_supported<Ty, TyOp>::value &&
|
| 185 |
+
redux_group_optimized<TyGroup>::value>;
|
| 186 |
+
|
| 187 |
+
template <class Ty, template <class> class TyOp>
|
| 188 |
+
using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
|
| 189 |
+
|
| 190 |
+
template <class Ty, template <class> class TyOp>
|
| 191 |
+
using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
|
| 192 |
+
|
| 193 |
+
public:
|
| 194 |
+
// Dispatch to redux if the combination of op and args are supported
|
| 195 |
+
template<
|
| 196 |
+
template <class> class TyOp,
|
| 197 |
+
redux_is_usable<TyFnInput, TyOp> = nullptr>
|
| 198 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 199 |
+
// Retrieve the mask for the group and dispatch to redux
|
| 200 |
+
return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
template<
|
| 204 |
+
template <class> class TyOp,
|
| 205 |
+
redux_is_usable<TyFnInput, TyOp> = nullptr>
|
| 206 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
|
| 207 |
+
// Retrieve the mask for the group and dispatch to redux
|
| 208 |
+
return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
// Fallback shuffle sync reduction
|
| 212 |
+
template <
|
| 213 |
+
template <class> class TyOp,
|
| 214 |
+
redux_is_not_usable<TyFnInput, TyOp> = nullptr>
|
| 215 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 216 |
+
//Dispatch to fallback shuffle sync accelerated reduction
|
| 217 |
+
return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
};
|
| 221 |
+
|
| 222 |
+
// Group support for reduce.
|
| 223 |
+
template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 224 |
+
|
| 225 |
+
template <unsigned int Sz, typename TyPar>
|
| 226 |
+
struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 227 |
+
template <unsigned int Sz, typename TyPar>
|
| 228 |
+
struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 229 |
+
template <>
|
| 230 |
+
struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 231 |
+
|
| 232 |
+
template <typename TyGroup>
|
| 233 |
+
using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
|
| 234 |
+
|
| 235 |
+
template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
|
| 236 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
|
| 237 |
+
static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
|
| 238 |
+
|
| 239 |
+
using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
|
| 240 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
|
| 244 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
|
| 245 |
+
static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
|
| 246 |
+
|
| 247 |
+
using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
|
| 248 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
template <typename TyVal, typename TyOp, typename TyGroup>
|
| 253 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
|
| 254 |
+
return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
template <unsigned int GroupId>
|
| 258 |
+
struct tile_reduce_dispatch;
|
| 259 |
+
|
| 260 |
+
template <>
|
| 261 |
+
struct tile_reduce_dispatch<details::coalesced_group_id> {
|
| 262 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 263 |
+
_CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 264 |
+
return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 265 |
+
}
|
| 266 |
+
};
|
| 267 |
+
|
| 268 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 269 |
+
template <>
|
| 270 |
+
struct tile_reduce_dispatch<details::multi_tile_group_id> {
|
| 271 |
+
template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
|
| 272 |
+
_CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 273 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 274 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 275 |
+
const unsigned int num_warps = Size / 32;
|
| 276 |
+
|
| 277 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 278 |
+
*warp_scratch_location =
|
| 279 |
+
details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 280 |
+
};
|
| 281 |
+
auto inter_warp_lambda =
|
| 282 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 283 |
+
*thread_scratch_location =
|
| 284 |
+
details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 285 |
+
};
|
| 286 |
+
return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 287 |
+
}
|
| 288 |
+
};
|
| 289 |
+
|
| 290 |
+
enum class AsyncReduceType { store, update };
|
| 291 |
+
|
| 292 |
+
template <AsyncReduceType TyAsyncReduce>
|
| 293 |
+
struct async_reduce_result_handler;
|
| 294 |
+
|
| 295 |
+
template<>
|
| 296 |
+
struct async_reduce_result_handler<AsyncReduceType::store> {
|
| 297 |
+
template<typename TyDst, typename TyVal, typename TyOp>
|
| 298 |
+
_CG_STATIC_QUALIFIER void handleResult(TyDst *dst, TyVal& result, TyOp&& op) {
|
| 299 |
+
*dst = result;
|
| 300 |
+
}
|
| 301 |
+
};
|
| 302 |
+
|
| 303 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 304 |
+
template<>
|
| 305 |
+
struct async_reduce_result_handler<AsyncReduceType::update> {
|
| 306 |
+
template<typename TyDst, typename TyVal, typename TyOp>
|
| 307 |
+
_CG_STATIC_QUALIFIER void handleResult(TyDst& dst, TyVal& result, TyOp&& op) {
|
| 308 |
+
atomic_update(dst, result, _CG_STL_NAMESPACE::forward<TyOp>(op));
|
| 309 |
+
}
|
| 310 |
+
};
|
| 311 |
+
#endif
|
| 312 |
+
|
| 313 |
+
template <unsigned int GroupId, AsyncReduceType TyAsyncReduce>
|
| 314 |
+
struct tile_async_reduce_dispatch;
|
| 315 |
+
|
| 316 |
+
template <AsyncReduceType TyAsyncReduce>
|
| 317 |
+
struct tile_async_reduce_dispatch<details::coalesced_group_id, TyAsyncReduce> {
|
| 318 |
+
template <unsigned int TySize, typename ParentT, typename TyDst, typename TyVal, typename TyFn>
|
| 319 |
+
_CG_STATIC_QUALIFIER void reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyVal&& val, TyFn&& op) {
|
| 320 |
+
// Do regular, in group reduction
|
| 321 |
+
auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 322 |
+
|
| 323 |
+
// One thread stores/updates the destination
|
| 324 |
+
if (group.thread_rank() == 0) {
|
| 325 |
+
async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 326 |
+
}
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
template <typename TyDst, typename TyVal, typename TyFn>
|
| 330 |
+
_CG_STATIC_QUALIFIER void reduce(const coalesced_group& group, TyDst& dst, TyVal&& val, TyFn&& op) {
|
| 331 |
+
// Do in group reduction to the last thread
|
| 332 |
+
auto result = details::coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 333 |
+
|
| 334 |
+
// One thread stores/updates the destination
|
| 335 |
+
if (group.thread_rank() == group.size() - 1) {
|
| 336 |
+
async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 337 |
+
}
|
| 338 |
+
}
|
| 339 |
+
};
|
| 340 |
+
|
| 341 |
+
template <AsyncReduceType TyAsyncReduce>
|
| 342 |
+
struct tile_async_reduce_dispatch<details::multi_tile_group_id, TyAsyncReduce> {
|
| 343 |
+
template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn>
|
| 344 |
+
_CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op) {
|
| 345 |
+
using TyVal = remove_qual<TyInputVal>;
|
| 346 |
+
const unsigned int num_warps = TySize / 32;
|
| 347 |
+
details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
|
| 348 |
+
auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
|
| 349 |
+
|
| 350 |
+
// Do in warp reduce
|
| 351 |
+
auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
|
| 352 |
+
*warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
|
| 353 |
+
|
| 354 |
+
// Tile of size num_warps from the last warp to arrive does final reduction step
|
| 355 |
+
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
|
| 356 |
+
auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
|
| 357 |
+
if (subwarp.meta_group_rank() == 0) {
|
| 358 |
+
auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
|
| 359 |
+
auto thread_val = *thread_scratch_location;
|
| 360 |
+
// Release other warps, we read their contribution already.
|
| 361 |
+
subwarp.sync();
|
| 362 |
+
details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
|
| 363 |
+
TyVal result = details::reduce(subwarp, thread_val, op);
|
| 364 |
+
// One thread stores the result or updates the atomic
|
| 365 |
+
if (subwarp.thread_rank() == 0) {
|
| 366 |
+
async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 367 |
+
}
|
| 368 |
+
}
|
| 369 |
+
warp.sync();
|
| 370 |
+
}
|
| 371 |
+
}
|
| 372 |
+
};
|
| 373 |
+
#endif
|
| 374 |
+
|
| 375 |
+
template <typename TyGroup, typename TyInputVal, typename TyRetVal>
|
| 376 |
+
_CG_QUALIFIER void check_reduce_params() {
|
| 377 |
+
static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
|
| 378 |
+
static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
|
| 379 |
+
};
|
| 380 |
+
|
| 381 |
+
template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
|
| 382 |
+
_CG_QUALIFIER void check_async_reduce_params() {
|
| 383 |
+
check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
|
| 384 |
+
static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
|
| 385 |
+
}
|
| 386 |
+
} // details
|
| 387 |
+
|
| 388 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 389 |
+
_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 390 |
+
details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
|
| 391 |
+
|
| 392 |
+
using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
|
| 393 |
+
return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 397 |
+
namespace experimental {
|
| 398 |
+
|
| 399 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 400 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 401 |
+
void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 402 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 403 |
+
|
| 404 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
|
| 405 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
|
| 409 |
+
void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
|
| 410 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 411 |
+
|
| 412 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
|
| 413 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 414 |
+
}
|
| 415 |
+
#endif
|
| 416 |
+
|
| 417 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
|
| 418 |
+
void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
|
| 419 |
+
details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 420 |
+
|
| 421 |
+
using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::store>;
|
| 422 |
+
dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
}
|
| 426 |
+
#endif
|
| 427 |
+
|
| 428 |
+
_CG_END_NAMESPACE
|
| 429 |
+
|
| 430 |
+
#endif // _CG_REDUCE_H_
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _CG_SCAN_H_
|
| 50 |
+
#define _CG_SCAN_H_
|
| 51 |
+
|
| 52 |
+
#include "info.h"
|
| 53 |
+
#include "helpers.h"
|
| 54 |
+
#include "functional.h"
|
| 55 |
+
#include "coalesced_scan.h"
|
| 56 |
+
|
| 57 |
+
_CG_BEGIN_NAMESPACE
|
| 58 |
+
|
| 59 |
+
namespace details {
|
| 60 |
+
|
| 61 |
+
// Group support for scan.
|
| 62 |
+
template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
|
| 63 |
+
|
| 64 |
+
template <unsigned int Sz, typename TyPar>
|
| 65 |
+
struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 66 |
+
template <unsigned int Sz, typename TyPar>
|
| 67 |
+
struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
|
| 68 |
+
template <>
|
| 69 |
+
struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
|
| 70 |
+
|
| 71 |
+
template <typename TyGroup>
|
| 72 |
+
using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
|
| 73 |
+
|
| 74 |
+
template <bool IsIntegralPlus>
|
| 75 |
+
struct integral_optimized_scan;
|
| 76 |
+
|
| 77 |
+
enum class ScanType { exclusive, inclusive };
|
| 78 |
+
|
| 79 |
+
template <unsigned int GroupId, ScanType TyScan>
|
| 80 |
+
struct scan_dispatch;
|
| 81 |
+
|
| 82 |
+
template <ScanType TyScan>
|
| 83 |
+
struct scan_dispatch<details::coalesced_group_id, TyScan> {
|
| 84 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 85 |
+
_CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 86 |
+
auto scan_result = coalesced_inclusive_scan(group, val, op);
|
| 87 |
+
if (TyScan == ScanType::exclusive) {
|
| 88 |
+
scan_result = convert_inclusive_to_exclusive(group,
|
| 89 |
+
scan_result,
|
| 90 |
+
_CG_STL_NAMESPACE::forward<TyVal>(val),
|
| 91 |
+
_CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 92 |
+
}
|
| 93 |
+
return scan_result;
|
| 94 |
+
}
|
| 95 |
+
};
|
| 96 |
+
|
| 97 |
+
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
|
| 98 |
+
template <ScanType TyScan>
|
| 99 |
+
struct scan_dispatch<details::multi_tile_group_id, TyScan> {
|
| 100 |
+
template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
|
| 101 |
+
_CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 102 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 103 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 104 |
+
const unsigned int num_warps = Size / 32;
|
| 105 |
+
// In warp scan result, calculated in warp_lambda
|
| 106 |
+
TyRet warp_scan;
|
| 107 |
+
|
| 108 |
+
// In warp scan, put sum in the warp_scratch_location
|
| 109 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 110 |
+
warp_scan =
|
| 111 |
+
details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 112 |
+
if (warp.thread_rank() + 1 == warp.size()) {
|
| 113 |
+
*warp_scratch_location = warp_scan;
|
| 114 |
+
}
|
| 115 |
+
if (TyScan == ScanType::exclusive) {
|
| 116 |
+
warp_scan = warp.shfl_up(warp_scan, 1);
|
| 117 |
+
}
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
// Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
|
| 121 |
+
// to its in-warp scan result
|
| 122 |
+
auto inter_warp_lambda =
|
| 123 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 124 |
+
auto thread_val = *thread_scratch_location;
|
| 125 |
+
auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
|
| 126 |
+
*thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
|
| 127 |
+
};
|
| 128 |
+
|
| 129 |
+
TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 130 |
+
if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
|
| 131 |
+
return previous_warps_sum;
|
| 132 |
+
}
|
| 133 |
+
if (warpType::meta_group_rank() == 0) {
|
| 134 |
+
return warp_scan;
|
| 135 |
+
}
|
| 136 |
+
else {
|
| 137 |
+
return op(warp_scan, previous_warps_sum);
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
};
|
| 141 |
+
|
| 142 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 143 |
+
template <unsigned int GroupId, ScanType TyScan>
|
| 144 |
+
struct scan_update_dispatch;
|
| 145 |
+
|
| 146 |
+
template <ScanType TyScan>
|
| 147 |
+
struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
|
| 148 |
+
template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
|
| 149 |
+
_CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 150 |
+
details::remove_qual<TyVal> old;
|
| 151 |
+
|
| 152 |
+
// Do regular in group scan
|
| 153 |
+
auto scan_result = details::coalesced_inclusive_scan(group, val, op);
|
| 154 |
+
|
| 155 |
+
// Last thread updates the atomic and distributes its old value to other threads
|
| 156 |
+
if (group.thread_rank() == group.size() - 1) {
|
| 157 |
+
old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 158 |
+
}
|
| 159 |
+
old = group.shfl(old, group.size() - 1);
|
| 160 |
+
if (TyScan == ScanType::exclusive) {
|
| 161 |
+
scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 162 |
+
}
|
| 163 |
+
scan_result = op(old, scan_result);
|
| 164 |
+
return scan_result;
|
| 165 |
+
}
|
| 166 |
+
};
|
| 167 |
+
|
| 168 |
+
template <ScanType TyScan>
|
| 169 |
+
struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
|
| 170 |
+
template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
|
| 171 |
+
_CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 172 |
+
using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
|
| 173 |
+
using TyRet = details::remove_qual<TyVal>;
|
| 174 |
+
const unsigned int num_warps = Size / 32;
|
| 175 |
+
// In warp scan result, calculated in warp_lambda
|
| 176 |
+
TyRet warp_scan;
|
| 177 |
+
|
| 178 |
+
// In warp scan, put sum in the warp_scratch_location
|
| 179 |
+
auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
|
| 180 |
+
warp_scan =
|
| 181 |
+
details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
|
| 182 |
+
if (warp.thread_rank() + 1 == warp.size()) {
|
| 183 |
+
*warp_scratch_location = warp_scan;
|
| 184 |
+
}
|
| 185 |
+
if (TyScan == ScanType::exclusive) {
|
| 186 |
+
warp_scan = warp.shfl_up(warp_scan, 1);
|
| 187 |
+
}
|
| 188 |
+
};
|
| 189 |
+
|
| 190 |
+
// Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
|
| 191 |
+
// to its in-warp scan result
|
| 192 |
+
auto inter_warp_lambda =
|
| 193 |
+
[&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
|
| 194 |
+
auto thread_val = *thread_scratch_location;
|
| 195 |
+
auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
|
| 196 |
+
TyRet offset;
|
| 197 |
+
// Single thread does the atomic update with sum of all contributions and reads the old value.
|
| 198 |
+
if (subwarp.thread_rank() == subwarp.size() - 1) {
|
| 199 |
+
offset = details::atomic_update(dst, scan_result, op);
|
| 200 |
+
}
|
| 201 |
+
offset = subwarp.shfl(offset, subwarp.size() - 1);
|
| 202 |
+
scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
|
| 203 |
+
// Add offset read from the atomic to the scanned warp sum.
|
| 204 |
+
// Skipping first thread, since it got defautly constructed value from the conversion,
|
| 205 |
+
// it should just return the offset received from the thread that did the atomic update.
|
| 206 |
+
if (subwarp.thread_rank() != 0) {
|
| 207 |
+
offset = op(scan_result, offset);
|
| 208 |
+
}
|
| 209 |
+
*thread_scratch_location = offset;
|
| 210 |
+
};
|
| 211 |
+
|
| 212 |
+
TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
|
| 213 |
+
if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
|
| 214 |
+
return previous_warps_sum;
|
| 215 |
+
}
|
| 216 |
+
return op(warp_scan, previous_warps_sum);
|
| 217 |
+
}
|
| 218 |
+
};
|
| 219 |
+
#endif
|
| 220 |
+
#endif
|
| 221 |
+
|
| 222 |
+
template <typename TyGroup, typename TyInputVal, typename TyRetVal>
|
| 223 |
+
_CG_QUALIFIER void check_scan_params() {
|
| 224 |
+
static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
|
| 225 |
+
static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
#if defined(_CG_HAS_STL_ATOMICS)
|
| 229 |
+
template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
|
| 230 |
+
_CG_QUALIFIER void check_scan_update_params() {
|
| 231 |
+
check_scan_params<TyGroup, TyInputVal, TyRetVal>();
|
| 232 |
+
static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
|
| 233 |
+
}
|
| 234 |
+
#endif
|
| 235 |
+
|
| 236 |
+
} // details
|
| 237 |
+
|
| 238 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 239 |
+
_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 240 |
+
details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
|
| 241 |
+
|
| 242 |
+
using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 243 |
+
return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
template <typename TyGroup, typename TyVal>
|
| 247 |
+
_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
|
| 248 |
+
return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
template <typename TyGroup, typename TyVal, typename TyFn>
|
| 252 |
+
_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 253 |
+
details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
|
| 254 |
+
|
| 255 |
+
using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 256 |
+
return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
template <typename TyGroup, typename TyVal>
|
| 260 |
+
_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
|
| 261 |
+
return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
#if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
|
| 265 |
+
|
| 266 |
+
namespace experimental {
|
| 267 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 268 |
+
_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 269 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 270 |
+
|
| 271 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 272 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 276 |
+
_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
|
| 277 |
+
return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 281 |
+
_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 282 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 283 |
+
|
| 284 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 285 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 289 |
+
_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
|
| 290 |
+
return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 294 |
+
_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 295 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 296 |
+
|
| 297 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
|
| 298 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 302 |
+
_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
|
| 303 |
+
return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
|
| 307 |
+
_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
|
| 308 |
+
details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
|
| 309 |
+
|
| 310 |
+
using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
|
| 311 |
+
return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
|
| 315 |
+
_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
|
| 316 |
+
return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
|
| 317 |
+
}
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
#endif
|
| 321 |
+
|
| 322 |
+
_CG_END_NAMESPACE
|
| 323 |
+
|
| 324 |
+
#endif // _CG_SCAN_H_
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 50 |
+
#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/async.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
|
| 2 |
+
*
|
| 3 |
+
* NOTICE TO LICENSEE:
|
| 4 |
+
*
|
| 5 |
+
* The source code and/or documentation ("Licensed Deliverables") are
|
| 6 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 7 |
+
* international Copyright laws.
|
| 8 |
+
*
|
| 9 |
+
* The Licensed Deliverables contained herein are PROPRIETARY and
|
| 10 |
+
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
|
| 11 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 12 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 13 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 14 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 15 |
+
* of the Licensed Deliverables to any third party without the express
|
| 16 |
+
* written consent of NVIDIA is prohibited.
|
| 17 |
+
*
|
| 18 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 19 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 20 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
|
| 21 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 22 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 23 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 25 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 26 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 27 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 28 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 29 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 30 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 31 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 32 |
+
*
|
| 33 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 34 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 35 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 36 |
+
* computer software documentation" as such terms are used in 48
|
| 37 |
+
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
|
| 38 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 39 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 40 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 41 |
+
* only those rights set forth herein.
|
| 42 |
+
*
|
| 43 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 44 |
+
* software must include, in the user documentation and internal
|
| 45 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 46 |
+
* Users Notice.
|
| 47 |
+
*/
|
| 48 |
+
|
| 49 |
+
#ifndef _COOPERATIVE_GROUPS_REDUCE_H
|
| 50 |
+
#define _COOPERATIVE_GROUPS_REDUCE_H
|
| 51 |
+
|
| 52 |
+
#include "../cooperative_groups.h"
|
| 53 |
+
#include "details/info.h"
|
| 54 |
+
|
| 55 |
+
#ifdef _CG_CPP11_FEATURES
|
| 56 |
+
# include "details/reduce.h"
|
| 57 |
+
#else
|
| 58 |
+
# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
|
| 59 |
+
-std=c++11 compiler option.
|
| 60 |
+
#endif
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
#endif //_COOPERATIVE_GROUPS_REDUCE_H
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAEGLTYPEDEFS_H
|
| 51 |
+
#define CUDAEGLTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
#include <cudaEGL.h>
|
| 54 |
+
|
| 55 |
+
#ifdef __cplusplus
|
| 56 |
+
extern "C" {
|
| 57 |
+
#endif // __cplusplus
|
| 58 |
+
|
| 59 |
+
/*
|
| 60 |
+
* Macros for the latest version for each driver function in cudaEGL.h
|
| 61 |
+
*/
|
| 62 |
+
#define PFN_cuGraphicsEGLRegisterImage PFN_cuGraphicsEGLRegisterImage_v7000
|
| 63 |
+
#define PFN_cuEGLStreamConsumerConnect PFN_cuEGLStreamConsumerConnect_v7000
|
| 64 |
+
#define PFN_cuEGLStreamConsumerConnectWithFlags PFN_cuEGLStreamConsumerConnectWithFlags_v8000
|
| 65 |
+
#define PFN_cuEGLStreamConsumerDisconnect PFN_cuEGLStreamConsumerDisconnect_v7000
|
| 66 |
+
#define PFN_cuEGLStreamConsumerAcquireFrame PFN_cuEGLStreamConsumerAcquireFrame_v7000
|
| 67 |
+
#define PFN_cuEGLStreamConsumerReleaseFrame PFN_cuEGLStreamConsumerReleaseFrame_v7000
|
| 68 |
+
#define PFN_cuEGLStreamProducerConnect PFN_cuEGLStreamProducerConnect_v7000
|
| 69 |
+
#define PFN_cuEGLStreamProducerDisconnect PFN_cuEGLStreamProducerDisconnect_v7000
|
| 70 |
+
#define PFN_cuEGLStreamProducerPresentFrame PFN_cuEGLStreamProducerPresentFrame_v7000
|
| 71 |
+
#define PFN_cuEGLStreamProducerReturnFrame PFN_cuEGLStreamProducerReturnFrame_v7000
|
| 72 |
+
#define PFN_cuGraphicsResourceGetMappedEglFrame PFN_cuGraphicsResourceGetMappedEglFrame_v7000
|
| 73 |
+
#define PFN_cuEventCreateFromEGLSync PFN_cuEventCreateFromEGLSync_v9000
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
/**
|
| 77 |
+
* Type definitions for functions defined in cudaEGL.h
|
| 78 |
+
*/
|
| 79 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsEGLRegisterImage_v7000)(CUgraphicsResource CUDAAPI *pCudaResource, EGLImageKHR image, unsigned int flags);
|
| 80 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream);
|
| 81 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnectWithFlags_v8000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, unsigned int flags);
|
| 82 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
|
| 83 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerAcquireFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource CUDAAPI *pCudaResource, CUstream CUDAAPI *pStream, unsigned int timeout);
|
| 84 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerReleaseFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource pCudaResource, CUstream CUDAAPI *pStream);
|
| 85 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, EGLint width, EGLint height);
|
| 86 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
|
| 87 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerPresentFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 eglframe, CUstream CUDAAPI *pStream);
|
| 88 |
+
typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerReturnFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 CUDAAPI *eglframe, CUstream CUDAAPI *pStream);
|
| 89 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedEglFrame_v7000)(CUeglFrame_v1 CUDAAPI *eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
|
| 90 |
+
typedef CUresult (CUDAAPI *PFN_cuEventCreateFromEGLSync_v9000)(CUevent CUDAAPI *phEvent, EGLSyncKHR eglSync, unsigned int flags);
|
| 91 |
+
|
| 92 |
+
#ifdef __cplusplus
|
| 93 |
+
}
|
| 94 |
+
#endif // __cplusplus
|
| 95 |
+
|
| 96 |
+
#endif // file guard
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAGLTYPEDEFS_H
|
| 51 |
+
#define CUDAGLTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
// Dependent includes for cudagl.h
|
| 54 |
+
#include <GL/gl.h>
|
| 55 |
+
|
| 56 |
+
#include <cudaGL.h>
|
| 57 |
+
|
| 58 |
+
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
|
| 59 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
|
| 60 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
|
| 61 |
+
#else
|
| 62 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
|
| 63 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
|
| 64 |
+
#endif
|
| 65 |
+
|
| 66 |
+
#ifdef __cplusplus
|
| 67 |
+
extern "C" {
|
| 68 |
+
#endif // __cplusplus
|
| 69 |
+
|
| 70 |
+
/*
|
| 71 |
+
* Macros for the latest version for each driver function in cudaGL.h
|
| 72 |
+
*/
|
| 73 |
+
#define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
|
| 74 |
+
#define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
|
| 75 |
+
#define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
|
| 76 |
+
#define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
|
| 77 |
+
#define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
|
| 78 |
+
#define PFN_cuGLInit PFN_cuGLInit_v2000
|
| 79 |
+
#define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
|
| 80 |
+
#define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
|
| 81 |
+
#define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
|
| 82 |
+
#define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
|
| 83 |
+
#define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
|
| 84 |
+
#define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
|
| 85 |
+
#define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
/**
|
| 89 |
+
* Type definitions for functions defined in cudaGL.h
|
| 90 |
+
*/
|
| 91 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
|
| 92 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
|
| 93 |
+
#ifdef _WIN32
|
| 94 |
+
typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
|
| 95 |
+
#endif
|
| 96 |
+
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 97 |
+
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
|
| 98 |
+
typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
|
| 99 |
+
typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
|
| 100 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
|
| 101 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
|
| 102 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
|
| 103 |
+
typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
|
| 104 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 105 |
+
typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
|
| 106 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
|
| 107 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
|
| 108 |
+
|
| 109 |
+
/*
|
| 110 |
+
* Type definitions for older versioned functions in cuda.h
|
| 111 |
+
*/
|
| 112 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 113 |
+
typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
|
| 114 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
|
| 115 |
+
typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
|
| 116 |
+
typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
|
| 117 |
+
#endif
|
| 118 |
+
|
| 119 |
+
#ifdef __cplusplus
|
| 120 |
+
}
|
| 121 |
+
#endif // __cplusplus
|
| 122 |
+
|
| 123 |
+
#endif // file guard
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h
ADDED
|
@@ -0,0 +1,959 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDATYPEDEFS_H
|
| 51 |
+
#define CUDATYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
#include <cuda.h>
|
| 54 |
+
|
| 55 |
+
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
|
| 56 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
|
| 57 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
|
| 58 |
+
#else
|
| 59 |
+
#define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
|
| 60 |
+
#define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
|
| 61 |
+
#endif
|
| 62 |
+
|
| 63 |
+
#ifdef __cplusplus
|
| 64 |
+
extern "C" {
|
| 65 |
+
#endif // __cplusplus
|
| 66 |
+
|
| 67 |
+
/*
|
| 68 |
+
* Macros for the latest version for each driver function in cuda.h
|
| 69 |
+
*/
|
| 70 |
+
#define PFN_cuGetErrorString PFN_cuGetErrorString_v6000
|
| 71 |
+
#define PFN_cuGetErrorName PFN_cuGetErrorName_v6000
|
| 72 |
+
#define PFN_cuInit PFN_cuInit_v2000
|
| 73 |
+
#define PFN_cuDriverGetVersion PFN_cuDriverGetVersion_v2020
|
| 74 |
+
#define PFN_cuDeviceGet PFN_cuDeviceGet_v2000
|
| 75 |
+
#define PFN_cuDeviceGetCount PFN_cuDeviceGetCount_v2000
|
| 76 |
+
#define PFN_cuDeviceGetName PFN_cuDeviceGetName_v2000
|
| 77 |
+
#define PFN_cuDeviceGetUuid PFN_cuDeviceGetUuid_v11040
|
| 78 |
+
#define PFN_cuDeviceGetLuid PFN_cuDeviceGetLuid_v10000
|
| 79 |
+
#define PFN_cuDeviceTotalMem PFN_cuDeviceTotalMem_v3020
|
| 80 |
+
#define PFN_cuDeviceGetTexture1DLinearMaxWidth PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
|
| 81 |
+
#define PFN_cuDeviceGetAttribute PFN_cuDeviceGetAttribute_v2000
|
| 82 |
+
#define PFN_cuDeviceGetNvSciSyncAttributes PFN_cuDeviceGetNvSciSyncAttributes_v10020
|
| 83 |
+
#define PFN_cuDeviceSetMemPool PFN_cuDeviceSetMemPool_v11020
|
| 84 |
+
#define PFN_cuDeviceGetMemPool PFN_cuDeviceGetMemPool_v11020
|
| 85 |
+
#define PFN_cuDeviceGetDefaultMemPool PFN_cuDeviceGetDefaultMemPool_v11020
|
| 86 |
+
#define PFN_cuDeviceGetProperties PFN_cuDeviceGetProperties_v2000
|
| 87 |
+
#define PFN_cuDeviceComputeCapability PFN_cuDeviceComputeCapability_v2000
|
| 88 |
+
#define PFN_cuDevicePrimaryCtxRetain PFN_cuDevicePrimaryCtxRetain_v7000
|
| 89 |
+
#define PFN_cuDevicePrimaryCtxRelease PFN_cuDevicePrimaryCtxRelease_v11000
|
| 90 |
+
#define PFN_cuDevicePrimaryCtxSetFlags PFN_cuDevicePrimaryCtxSetFlags_v11000
|
| 91 |
+
#define PFN_cuDevicePrimaryCtxGetState PFN_cuDevicePrimaryCtxGetState_v7000
|
| 92 |
+
#define PFN_cuDevicePrimaryCtxReset PFN_cuDevicePrimaryCtxReset_v11000
|
| 93 |
+
#define PFN_cuDeviceGetExecAffinitySupport PFN_cuDeviceGetExecAffinitySupport_v11040
|
| 94 |
+
#define PFN_cuCtxCreate PFN_cuCtxCreate_v11040
|
| 95 |
+
#define PFN_cuCtxDestroy PFN_cuCtxDestroy_v4000
|
| 96 |
+
#define PFN_cuCtxPushCurrent PFN_cuCtxPushCurrent_v4000
|
| 97 |
+
#define PFN_cuCtxPopCurrent PFN_cuCtxPopCurrent_v4000
|
| 98 |
+
#define PFN_cuCtxSetCurrent PFN_cuCtxSetCurrent_v4000
|
| 99 |
+
#define PFN_cuCtxGetCurrent PFN_cuCtxGetCurrent_v4000
|
| 100 |
+
#define PFN_cuCtxGetDevice PFN_cuCtxGetDevice_v2000
|
| 101 |
+
#define PFN_cuCtxGetFlags PFN_cuCtxGetFlags_v7000
|
| 102 |
+
#define PFN_cuCtxSynchronize PFN_cuCtxSynchronize_v2000
|
| 103 |
+
#define PFN_cuCtxSetLimit PFN_cuCtxSetLimit_v3010
|
| 104 |
+
#define PFN_cuCtxGetLimit PFN_cuCtxGetLimit_v3010
|
| 105 |
+
#define PFN_cuCtxGetCacheConfig PFN_cuCtxGetCacheConfig_v3020
|
| 106 |
+
#define PFN_cuCtxSetCacheConfig PFN_cuCtxSetCacheConfig_v3020
|
| 107 |
+
#define PFN_cuCtxGetSharedMemConfig PFN_cuCtxGetSharedMemConfig_v4020
|
| 108 |
+
#define PFN_cuCtxSetSharedMemConfig PFN_cuCtxSetSharedMemConfig_v4020
|
| 109 |
+
#define PFN_cuCtxGetApiVersion PFN_cuCtxGetApiVersion_v3020
|
| 110 |
+
#define PFN_cuCtxGetStreamPriorityRange PFN_cuCtxGetStreamPriorityRange_v5050
|
| 111 |
+
#define PFN_cuCtxResetPersistingL2Cache PFN_cuCtxResetPersistingL2Cache_v11000
|
| 112 |
+
#define PFN_cuCtxAttach PFN_cuCtxAttach_v2000
|
| 113 |
+
#define PFN_cuCtxDetach PFN_cuCtxDetach_v2000
|
| 114 |
+
#define PFN_cuCtxGetExecAffinity PFN_cuCtxGetExecAffinity_v11040
|
| 115 |
+
#define PFN_cuModuleLoad PFN_cuModuleLoad_v2000
|
| 116 |
+
#define PFN_cuModuleLoadData PFN_cuModuleLoadData_v2000
|
| 117 |
+
#define PFN_cuModuleLoadDataEx PFN_cuModuleLoadDataEx_v2010
|
| 118 |
+
#define PFN_cuModuleLoadFatBinary PFN_cuModuleLoadFatBinary_v2000
|
| 119 |
+
#define PFN_cuModuleUnload PFN_cuModuleUnload_v2000
|
| 120 |
+
#define PFN_cuModuleGetFunction PFN_cuModuleGetFunction_v2000
|
| 121 |
+
#define PFN_cuModuleGetGlobal PFN_cuModuleGetGlobal_v3020
|
| 122 |
+
#define PFN_cuModuleGetTexRef PFN_cuModuleGetTexRef_v2000
|
| 123 |
+
#define PFN_cuModuleGetSurfRef PFN_cuModuleGetSurfRef_v3000
|
| 124 |
+
#define PFN_cuLinkCreate PFN_cuLinkCreate_v6050
|
| 125 |
+
#define PFN_cuLinkAddData PFN_cuLinkAddData_v6050
|
| 126 |
+
#define PFN_cuLinkAddFile PFN_cuLinkAddFile_v6050
|
| 127 |
+
#define PFN_cuLinkComplete PFN_cuLinkComplete_v5050
|
| 128 |
+
#define PFN_cuLinkDestroy PFN_cuLinkDestroy_v5050
|
| 129 |
+
#define PFN_cuMemGetInfo PFN_cuMemGetInfo_v3020
|
| 130 |
+
#define PFN_cuMemAlloc PFN_cuMemAlloc_v3020
|
| 131 |
+
#define PFN_cuMemAllocPitch PFN_cuMemAllocPitch_v3020
|
| 132 |
+
#define PFN_cuMemFree PFN_cuMemFree_v3020
|
| 133 |
+
#define PFN_cuMemGetAddressRange PFN_cuMemGetAddressRange_v3020
|
| 134 |
+
#define PFN_cuMemAllocHost PFN_cuMemAllocHost_v3020
|
| 135 |
+
#define PFN_cuMemFreeHost PFN_cuMemFreeHost_v2000
|
| 136 |
+
#define PFN_cuMemHostAlloc PFN_cuMemHostAlloc_v2020
|
| 137 |
+
#define PFN_cuMemHostGetDevicePointer PFN_cuMemHostGetDevicePointer_v3020
|
| 138 |
+
#define PFN_cuMemHostGetFlags PFN_cuMemHostGetFlags_v2030
|
| 139 |
+
#define PFN_cuMemAllocManaged PFN_cuMemAllocManaged_v6000
|
| 140 |
+
#define PFN_cuDeviceGetByPCIBusId PFN_cuDeviceGetByPCIBusId_v4010
|
| 141 |
+
#define PFN_cuDeviceGetPCIBusId PFN_cuDeviceGetPCIBusId_v4010
|
| 142 |
+
#define PFN_cuIpcGetEventHandle PFN_cuIpcGetEventHandle_v4010
|
| 143 |
+
#define PFN_cuIpcOpenEventHandle PFN_cuIpcOpenEventHandle_v4010
|
| 144 |
+
#define PFN_cuIpcGetMemHandle PFN_cuIpcGetMemHandle_v4010
|
| 145 |
+
#define PFN_cuIpcOpenMemHandle PFN_cuIpcOpenMemHandle_v11000
|
| 146 |
+
#define PFN_cuIpcCloseMemHandle PFN_cuIpcCloseMemHandle_v4010
|
| 147 |
+
#define PFN_cuMemHostRegister PFN_cuMemHostRegister_v6050
|
| 148 |
+
#define PFN_cuMemHostUnregister PFN_cuMemHostUnregister_v4000
|
| 149 |
+
#define PFN_cuMemcpy __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
|
| 150 |
+
#define PFN_cuMemcpyPeer __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
|
| 151 |
+
#define PFN_cuMemcpyHtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
|
| 152 |
+
#define PFN_cuMemcpyDtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
|
| 153 |
+
#define PFN_cuMemcpyDtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
|
| 154 |
+
#define PFN_cuMemcpyDtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
|
| 155 |
+
#define PFN_cuMemcpyAtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
|
| 156 |
+
#define PFN_cuMemcpyHtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
|
| 157 |
+
#define PFN_cuMemcpyAtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
|
| 158 |
+
#define PFN_cuMemcpyAtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
|
| 159 |
+
#define PFN_cuMemcpy2D __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
|
| 160 |
+
#define PFN_cuMemcpy2DUnaligned __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
|
| 161 |
+
#define PFN_cuMemcpy3D __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
|
| 162 |
+
#define PFN_cuMemcpy3DPeer __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
|
| 163 |
+
#define PFN_cuMemcpyAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
|
| 164 |
+
#define PFN_cuMemcpyPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
|
| 165 |
+
#define PFN_cuMemcpyHtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
|
| 166 |
+
#define PFN_cuMemcpyDtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
|
| 167 |
+
#define PFN_cuMemcpyDtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
|
| 168 |
+
#define PFN_cuMemcpyHtoAAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
|
| 169 |
+
#define PFN_cuMemcpyAtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
|
| 170 |
+
#define PFN_cuMemcpy2DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
|
| 171 |
+
#define PFN_cuMemcpy3DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
|
| 172 |
+
#define PFN_cuMemcpy3DPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
|
| 173 |
+
#define PFN_cuMemsetD8 __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
|
| 174 |
+
#define PFN_cuMemsetD16 __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
|
| 175 |
+
#define PFN_cuMemsetD32 __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
|
| 176 |
+
#define PFN_cuMemsetD2D8 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
|
| 177 |
+
#define PFN_cuMemsetD2D16 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
|
| 178 |
+
#define PFN_cuMemsetD2D32 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
|
| 179 |
+
#define PFN_cuMemsetD8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
|
| 180 |
+
#define PFN_cuMemsetD16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
|
| 181 |
+
#define PFN_cuMemsetD32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
|
| 182 |
+
#define PFN_cuMemsetD2D8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
|
| 183 |
+
#define PFN_cuMemsetD2D16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
|
| 184 |
+
#define PFN_cuMemsetD2D32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
|
| 185 |
+
#define PFN_cuArrayCreate PFN_cuArrayCreate_v3020
|
| 186 |
+
#define PFN_cuArrayGetDescriptor PFN_cuArrayGetDescriptor_v3020
|
| 187 |
+
#define PFN_cuArrayGetSparseProperties PFN_cuArrayGetSparseProperties_v11010
|
| 188 |
+
#define PFN_cuMipmappedArrayGetSparseProperties PFN_cuMipmappedArrayGetSparseProperties_v11010
|
| 189 |
+
#define PFN_cuArrayGetMemoryRequirements PFN_cuArrayGetMemoryRequirements_v11060
|
| 190 |
+
#define PFN_cuMipmappedArrayGetMemoryRequirements PFN_cuMipmappedArrayGetMemoryRequirements_v11060
|
| 191 |
+
#define PFN_cuArrayGetPlane PFN_cuArrayGetPlane_v11020
|
| 192 |
+
#define PFN_cuArrayDestroy PFN_cuArrayDestroy_v2000
|
| 193 |
+
#define PFN_cuArray3DCreate PFN_cuArray3DCreate_v3020
|
| 194 |
+
#define PFN_cuArray3DGetDescriptor PFN_cuArray3DGetDescriptor_v3020
|
| 195 |
+
#define PFN_cuMipmappedArrayCreate PFN_cuMipmappedArrayCreate_v5000
|
| 196 |
+
#define PFN_cuMipmappedArrayGetLevel PFN_cuMipmappedArrayGetLevel_v5000
|
| 197 |
+
#define PFN_cuMipmappedArrayDestroy PFN_cuMipmappedArrayDestroy_v5000
|
| 198 |
+
#define PFN_cuMemAddressReserve PFN_cuMemAddressReserve_v10020
|
| 199 |
+
#define PFN_cuMemAddressFree PFN_cuMemAddressFree_v10020
|
| 200 |
+
#define PFN_cuMemCreate PFN_cuMemCreate_v10020
|
| 201 |
+
#define PFN_cuMemRelease PFN_cuMemRelease_v10020
|
| 202 |
+
#define PFN_cuMemMap PFN_cuMemMap_v10020
|
| 203 |
+
#define PFN_cuMemMapArrayAsync __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
|
| 204 |
+
#define PFN_cuMemUnmap PFN_cuMemUnmap_v10020
|
| 205 |
+
#define PFN_cuMemSetAccess PFN_cuMemSetAccess_v10020
|
| 206 |
+
#define PFN_cuMemGetAccess PFN_cuMemGetAccess_v10020
|
| 207 |
+
#define PFN_cuMemExportToShareableHandle PFN_cuMemExportToShareableHandle_v10020
|
| 208 |
+
#define PFN_cuMemImportFromShareableHandle PFN_cuMemImportFromShareableHandle_v10020
|
| 209 |
+
#define PFN_cuMemGetAllocationGranularity PFN_cuMemGetAllocationGranularity_v10020
|
| 210 |
+
#define PFN_cuMemGetAllocationPropertiesFromHandle PFN_cuMemGetAllocationPropertiesFromHandle_v10020
|
| 211 |
+
#define PFN_cuMemRetainAllocationHandle PFN_cuMemRetainAllocationHandle_v11000
|
| 212 |
+
#define PFN_cuMemFreeAsync __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
|
| 213 |
+
#define PFN_cuMemAllocAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
|
| 214 |
+
#define PFN_cuMemPoolTrimTo PFN_cuMemPoolTrimTo_v11020
|
| 215 |
+
#define PFN_cuMemPoolSetAttribute PFN_cuMemPoolSetAttribute_v11020
|
| 216 |
+
#define PFN_cuMemPoolGetAttribute PFN_cuMemPoolGetAttribute_v11020
|
| 217 |
+
#define PFN_cuMemPoolSetAccess PFN_cuMemPoolSetAccess_v11020
|
| 218 |
+
#define PFN_cuMemPoolGetAccess PFN_cuMemPoolGetAccess_v11020
|
| 219 |
+
#define PFN_cuMemPoolCreate PFN_cuMemPoolCreate_v11020
|
| 220 |
+
#define PFN_cuMemPoolDestroy PFN_cuMemPoolDestroy_v11020
|
| 221 |
+
#define PFN_cuMemAllocFromPoolAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
|
| 222 |
+
#define PFN_cuMemPoolExportToShareableHandle PFN_cuMemPoolExportToShareableHandle_v11020
|
| 223 |
+
#define PFN_cuMemPoolImportFromShareableHandle PFN_cuMemPoolImportFromShareableHandle_v11020
|
| 224 |
+
#define PFN_cuMemPoolExportPointer PFN_cuMemPoolExportPointer_v11020
|
| 225 |
+
#define PFN_cuMemPoolImportPointer PFN_cuMemPoolImportPointer_v11020
|
| 226 |
+
#define PFN_cuPointerGetAttribute PFN_cuPointerGetAttribute_v4000
|
| 227 |
+
#define PFN_cuMemPrefetchAsync __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
|
| 228 |
+
#define PFN_cuMemAdvise PFN_cuMemAdvise_v8000
|
| 229 |
+
#define PFN_cuMemRangeGetAttribute PFN_cuMemRangeGetAttribute_v8000
|
| 230 |
+
#define PFN_cuMemRangeGetAttributes PFN_cuMemRangeGetAttributes_v8000
|
| 231 |
+
#define PFN_cuPointerSetAttribute PFN_cuPointerSetAttribute_v6000
|
| 232 |
+
#define PFN_cuPointerGetAttributes PFN_cuPointerGetAttributes_v7000
|
| 233 |
+
#define PFN_cuStreamCreate PFN_cuStreamCreate_v2000
|
| 234 |
+
#define PFN_cuStreamCreateWithPriority PFN_cuStreamCreateWithPriority_v5050
|
| 235 |
+
#define PFN_cuStreamGetPriority __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
|
| 236 |
+
#define PFN_cuStreamGetFlags __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
|
| 237 |
+
#define PFN_cuStreamGetCtx __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
|
| 238 |
+
#define PFN_cuStreamWaitEvent __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
|
| 239 |
+
#define PFN_cuStreamAddCallback __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
|
| 240 |
+
#define PFN_cuStreamBeginCapture __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
|
| 241 |
+
#define PFN_cuThreadExchangeStreamCaptureMode PFN_cuThreadExchangeStreamCaptureMode_v10010
|
| 242 |
+
#define PFN_cuStreamEndCapture __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
|
| 243 |
+
#define PFN_cuStreamIsCapturing __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
|
| 244 |
+
#define PFN_cuStreamGetCaptureInfo __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
|
| 245 |
+
#define PFN_cuStreamGetCaptureInfo_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
|
| 246 |
+
#define PFN_cuStreamUpdateCaptureDependencies __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
|
| 247 |
+
#define PFN_cuStreamAttachMemAsync __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
|
| 248 |
+
#define PFN_cuStreamQuery __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
|
| 249 |
+
#define PFN_cuStreamSynchronize __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
|
| 250 |
+
#define PFN_cuStreamDestroy PFN_cuStreamDestroy_v4000
|
| 251 |
+
#define PFN_cuStreamCopyAttributes __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
|
| 252 |
+
#define PFN_cuStreamGetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
|
| 253 |
+
#define PFN_cuStreamSetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
|
| 254 |
+
#define PFN_cuEventCreate PFN_cuEventCreate_v2000
|
| 255 |
+
#define PFN_cuEventRecord __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
|
| 256 |
+
#define PFN_cuEventRecordWithFlags __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
|
| 257 |
+
#define PFN_cuEventQuery PFN_cuEventQuery_v2000
|
| 258 |
+
#define PFN_cuEventSynchronize PFN_cuEventSynchronize_v2000
|
| 259 |
+
#define PFN_cuEventDestroy PFN_cuEventDestroy_v4000
|
| 260 |
+
#define PFN_cuEventElapsedTime PFN_cuEventElapsedTime_v2000
|
| 261 |
+
#define PFN_cuImportExternalMemory PFN_cuImportExternalMemory_v10000
|
| 262 |
+
#define PFN_cuExternalMemoryGetMappedBuffer PFN_cuExternalMemoryGetMappedBuffer_v10000
|
| 263 |
+
#define PFN_cuExternalMemoryGetMappedMipmappedArray PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
|
| 264 |
+
#define PFN_cuDestroyExternalMemory PFN_cuDestroyExternalMemory_v10000
|
| 265 |
+
#define PFN_cuImportExternalSemaphore PFN_cuImportExternalSemaphore_v10000
|
| 266 |
+
#define PFN_cuSignalExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
|
| 267 |
+
#define PFN_cuWaitExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
|
| 268 |
+
#define PFN_cuDestroyExternalSemaphore PFN_cuDestroyExternalSemaphore_v10000
|
| 269 |
+
#define PFN_cuStreamWaitValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
|
| 270 |
+
#define PFN_cuStreamWaitValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
|
| 271 |
+
#define PFN_cuStreamWriteValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
|
| 272 |
+
#define PFN_cuStreamWriteValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
|
| 273 |
+
#define PFN_cuStreamBatchMemOp __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
|
| 274 |
+
#define PFN_cuStreamWaitValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070)
|
| 275 |
+
#define PFN_cuStreamWaitValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070)
|
| 276 |
+
#define PFN_cuStreamWriteValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070)
|
| 277 |
+
#define PFN_cuStreamWriteValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070)
|
| 278 |
+
#define PFN_cuStreamBatchMemOp_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070)
|
| 279 |
+
#define PFN_cuFuncGetAttribute PFN_cuFuncGetAttribute_v2020
|
| 280 |
+
#define PFN_cuFuncSetAttribute PFN_cuFuncSetAttribute_v9000
|
| 281 |
+
#define PFN_cuFuncSetCacheConfig PFN_cuFuncSetCacheConfig_v3000
|
| 282 |
+
#define PFN_cuFuncSetSharedMemConfig PFN_cuFuncSetSharedMemConfig_v4020
|
| 283 |
+
#define PFN_cuLaunchKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
|
| 284 |
+
#define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060)
|
| 285 |
+
#define PFN_cuLaunchCooperativeKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
|
| 286 |
+
#define PFN_cuLaunchCooperativeKernelMultiDevice PFN_cuLaunchCooperativeKernelMultiDevice_v9000
|
| 287 |
+
#define PFN_cuLaunchHostFunc __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
|
| 288 |
+
#define PFN_cuFuncSetBlockShape PFN_cuFuncSetBlockShape_v2000
|
| 289 |
+
#define PFN_cuFuncSetSharedSize PFN_cuFuncSetSharedSize_v2000
|
| 290 |
+
#define PFN_cuParamSetSize PFN_cuParamSetSize_v2000
|
| 291 |
+
#define PFN_cuParamSeti PFN_cuParamSeti_v2000
|
| 292 |
+
#define PFN_cuParamSetf PFN_cuParamSetf_v2000
|
| 293 |
+
#define PFN_cuParamSetv PFN_cuParamSetv_v2000
|
| 294 |
+
#define PFN_cuLaunch PFN_cuLaunch_v2000
|
| 295 |
+
#define PFN_cuLaunchGrid PFN_cuLaunchGrid_v2000
|
| 296 |
+
#define PFN_cuLaunchGridAsync PFN_cuLaunchGridAsync_v2000
|
| 297 |
+
#define PFN_cuParamSetTexRef PFN_cuParamSetTexRef_v2000
|
| 298 |
+
#define PFN_cuGraphCreate PFN_cuGraphCreate_v10000
|
| 299 |
+
#define PFN_cuGraphAddKernelNode PFN_cuGraphAddKernelNode_v10000
|
| 300 |
+
#define PFN_cuGraphKernelNodeGetParams PFN_cuGraphKernelNodeGetParams_v10000
|
| 301 |
+
#define PFN_cuGraphKernelNodeSetParams PFN_cuGraphKernelNodeSetParams_v10000
|
| 302 |
+
#define PFN_cuGraphAddMemcpyNode PFN_cuGraphAddMemcpyNode_v10000
|
| 303 |
+
#define PFN_cuGraphMemcpyNodeGetParams PFN_cuGraphMemcpyNodeGetParams_v10000
|
| 304 |
+
#define PFN_cuGraphMemcpyNodeSetParams PFN_cuGraphMemcpyNodeSetParams_v10000
|
| 305 |
+
#define PFN_cuGraphAddMemsetNode PFN_cuGraphAddMemsetNode_v10000
|
| 306 |
+
#define PFN_cuGraphMemsetNodeGetParams PFN_cuGraphMemsetNodeGetParams_v10000
|
| 307 |
+
#define PFN_cuGraphMemsetNodeSetParams PFN_cuGraphMemsetNodeSetParams_v10000
|
| 308 |
+
#define PFN_cuGraphAddHostNode PFN_cuGraphAddHostNode_v10000
|
| 309 |
+
#define PFN_cuGraphHostNodeGetParams PFN_cuGraphHostNodeGetParams_v10000
|
| 310 |
+
#define PFN_cuGraphHostNodeSetParams PFN_cuGraphHostNodeSetParams_v10000
|
| 311 |
+
#define PFN_cuGraphAddChildGraphNode PFN_cuGraphAddChildGraphNode_v10000
|
| 312 |
+
#define PFN_cuGraphChildGraphNodeGetGraph PFN_cuGraphChildGraphNodeGetGraph_v10000
|
| 313 |
+
#define PFN_cuGraphAddEmptyNode PFN_cuGraphAddEmptyNode_v10000
|
| 314 |
+
#define PFN_cuGraphAddEventRecordNode PFN_cuGraphAddEventRecordNode_v11010
|
| 315 |
+
#define PFN_cuGraphEventRecordNodeGetEvent PFN_cuGraphEventRecordNodeGetEvent_v11010
|
| 316 |
+
#define PFN_cuGraphEventRecordNodeSetEvent PFN_cuGraphEventRecordNodeSetEvent_v11010
|
| 317 |
+
#define PFN_cuGraphAddEventWaitNode PFN_cuGraphAddEventWaitNode_v11010
|
| 318 |
+
#define PFN_cuGraphEventWaitNodeGetEvent PFN_cuGraphEventWaitNodeGetEvent_v11010
|
| 319 |
+
#define PFN_cuGraphEventWaitNodeSetEvent PFN_cuGraphEventWaitNodeSetEvent_v11010
|
| 320 |
+
#define PFN_cuGraphAddExternalSemaphoresSignalNode PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
|
| 321 |
+
#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
|
| 322 |
+
#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
|
| 323 |
+
#define PFN_cuGraphAddExternalSemaphoresWaitNode PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
|
| 324 |
+
#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
|
| 325 |
+
#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
|
| 326 |
+
#define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070
|
| 327 |
+
#define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070
|
| 328 |
+
/* Alias to the latest versioned entry point. Fixed: a stray space before
 * "_v11070" split the replacement into two tokens, so the macro expanded to
 * an invalid expression instead of the single identifier
 * PFN_cuGraphBatchMemOpNodeSetParams_v11070 (cf. the adjacent
 * PFN_cuGraphAddBatchMemOpNode / PFN_cuGraphExecBatchMemOpNodeSetParams
 * defines, which use the correct single-token form). */
#define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070
|
| 329 |
+
#define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070
|
| 330 |
+
#define PFN_cuGraphClone PFN_cuGraphClone_v10000
|
| 331 |
+
#define PFN_cuGraphNodeFindInClone PFN_cuGraphNodeFindInClone_v10000
|
| 332 |
+
#define PFN_cuGraphNodeGetType PFN_cuGraphNodeGetType_v10000
|
| 333 |
+
#define PFN_cuGraphGetNodes PFN_cuGraphGetNodes_v10000
|
| 334 |
+
#define PFN_cuGraphGetRootNodes PFN_cuGraphGetRootNodes_v10000
|
| 335 |
+
#define PFN_cuGraphGetEdges PFN_cuGraphGetEdges_v10000
|
| 336 |
+
#define PFN_cuGraphNodeGetDependencies PFN_cuGraphNodeGetDependencies_v10000
|
| 337 |
+
#define PFN_cuGraphNodeGetDependentNodes PFN_cuGraphNodeGetDependentNodes_v10000
|
| 338 |
+
#define PFN_cuGraphAddDependencies PFN_cuGraphAddDependencies_v10000
|
| 339 |
+
#define PFN_cuGraphRemoveDependencies PFN_cuGraphRemoveDependencies_v10000
|
| 340 |
+
#define PFN_cuGraphDestroyNode PFN_cuGraphDestroyNode_v10000
|
| 341 |
+
#define PFN_cuGraphInstantiate PFN_cuGraphInstantiate_v11000
|
| 342 |
+
#define PFN_cuGraphInstantiateWithFlags PFN_cuGraphInstantiateWithFlags_v11040
|
| 343 |
+
#define PFN_cuGraphExecKernelNodeSetParams PFN_cuGraphExecKernelNodeSetParams_v10010
|
| 344 |
+
#define PFN_cuGraphExecMemcpyNodeSetParams PFN_cuGraphExecMemcpyNodeSetParams_v10020
|
| 345 |
+
#define PFN_cuGraphExecMemsetNodeSetParams PFN_cuGraphExecMemsetNodeSetParams_v10020
|
| 346 |
+
#define PFN_cuGraphExecHostNodeSetParams PFN_cuGraphExecHostNodeSetParams_v10020
|
| 347 |
+
#define PFN_cuGraphExecChildGraphNodeSetParams PFN_cuGraphExecChildGraphNodeSetParams_v11010
|
| 348 |
+
#define PFN_cuGraphExecEventRecordNodeSetEvent PFN_cuGraphExecEventRecordNodeSetEvent_v11010
|
| 349 |
+
#define PFN_cuGraphExecEventWaitNodeSetEvent PFN_cuGraphExecEventWaitNodeSetEvent_v11010
|
| 350 |
+
#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
|
| 351 |
+
#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
|
| 352 |
+
#define PFN_cuGraphUpload __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
|
| 353 |
+
#define PFN_cuGraphLaunch __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
|
| 354 |
+
#define PFN_cuGraphExecDestroy PFN_cuGraphExecDestroy_v10000
|
| 355 |
+
#define PFN_cuGraphDestroy PFN_cuGraphDestroy_v10000
|
| 356 |
+
#define PFN_cuGraphExecUpdate PFN_cuGraphExecUpdate_v10020
|
| 357 |
+
#define PFN_cuGraphKernelNodeCopyAttributes PFN_cuGraphKernelNodeCopyAttributes_v11000
|
| 358 |
+
#define PFN_cuGraphKernelNodeGetAttribute PFN_cuGraphKernelNodeGetAttribute_v11000
|
| 359 |
+
#define PFN_cuGraphKernelNodeSetAttribute PFN_cuGraphKernelNodeSetAttribute_v11000
|
| 360 |
+
#define PFN_cuGraphDebugDotPrint PFN_cuGraphDebugDotPrint_v11030
|
| 361 |
+
#define PFN_cuGraphAddMemAllocNode PFN_cuGraphAddMemAllocNode_v11040
|
| 362 |
+
#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
|
| 363 |
+
#define PFN_cuGraphAddMemFreeNode PFN_cuGraphAddMemFreeNode_v11040
|
| 364 |
+
#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
|
| 365 |
+
#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
|
| 366 |
+
#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
|
| 367 |
+
#define PFN_cuDeviceGraphMemTrim PFN_cuDeviceGraphMemTrim_v11040
|
| 368 |
+
#define PFN_cuDeviceGetGraphMemAttribute PFN_cuDeviceGetGraphMemAttribute_v11040
|
| 369 |
+
#define PFN_cuDeviceSetGraphMemAttribute PFN_cuDeviceSetGraphMemAttribute_v11040
|
| 370 |
+
#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
|
| 371 |
+
#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
|
| 372 |
+
#define PFN_cuOccupancyMaxPotentialBlockSize PFN_cuOccupancyMaxPotentialBlockSize_v6050
|
| 373 |
+
#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
|
| 374 |
+
#define PFN_cuOccupancyAvailableDynamicSMemPerBlock PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
|
| 375 |
+
#define PFN_cuOccupancyMaxPotentialClusterSize PFN_cuOccupancyMaxPotentialClusterSize_v11070
|
| 376 |
+
#define PFN_cuOccupancyMaxActiveClusters PFN_cuOccupancyMaxActiveClusters_v11070
|
| 377 |
+
#define PFN_cuTexRefSetArray PFN_cuTexRefSetArray_v2000
|
| 378 |
+
#define PFN_cuTexRefSetMipmappedArray PFN_cuTexRefSetMipmappedArray_v5000
|
| 379 |
+
#define PFN_cuTexRefSetAddress PFN_cuTexRefSetAddress_v3020
|
| 380 |
+
#define PFN_cuTexRefSetAddress2D PFN_cuTexRefSetAddress2D_v4010
|
| 381 |
+
#define PFN_cuTexRefSetFormat PFN_cuTexRefSetFormat_v2000
|
| 382 |
+
#define PFN_cuTexRefSetAddressMode PFN_cuTexRefSetAddressMode_v2000
|
| 383 |
+
#define PFN_cuTexRefSetFilterMode PFN_cuTexRefSetFilterMode_v2000
|
| 384 |
+
#define PFN_cuTexRefSetMipmapFilterMode PFN_cuTexRefSetMipmapFilterMode_v5000
|
| 385 |
+
#define PFN_cuTexRefSetMipmapLevelBias PFN_cuTexRefSetMipmapLevelBias_v5000
|
| 386 |
+
#define PFN_cuTexRefSetMipmapLevelClamp PFN_cuTexRefSetMipmapLevelClamp_v5000
|
| 387 |
+
#define PFN_cuTexRefSetMaxAnisotropy PFN_cuTexRefSetMaxAnisotropy_v5000
|
| 388 |
+
#define PFN_cuTexRefSetBorderColor PFN_cuTexRefSetBorderColor_v8000
|
| 389 |
+
#define PFN_cuTexRefSetFlags PFN_cuTexRefSetFlags_v2000
|
| 390 |
+
#define PFN_cuTexRefGetAddress PFN_cuTexRefGetAddress_v3020
|
| 391 |
+
#define PFN_cuTexRefGetArray PFN_cuTexRefGetArray_v2000
|
| 392 |
+
#define PFN_cuTexRefGetMipmappedArray PFN_cuTexRefGetMipmappedArray_v5000
|
| 393 |
+
#define PFN_cuTexRefGetAddressMode PFN_cuTexRefGetAddressMode_v2000
|
| 394 |
+
#define PFN_cuTexRefGetFilterMode PFN_cuTexRefGetFilterMode_v2000
|
| 395 |
+
#define PFN_cuTexRefGetFormat PFN_cuTexRefGetFormat_v2000
|
| 396 |
+
#define PFN_cuTexRefGetMipmapFilterMode PFN_cuTexRefGetMipmapFilterMode_v5000
|
| 397 |
+
#define PFN_cuTexRefGetMipmapLevelBias PFN_cuTexRefGetMipmapLevelBias_v5000
|
| 398 |
+
#define PFN_cuTexRefGetMipmapLevelClamp PFN_cuTexRefGetMipmapLevelClamp_v5000
|
| 399 |
+
#define PFN_cuTexRefGetMaxAnisotropy PFN_cuTexRefGetMaxAnisotropy_v5000
|
| 400 |
+
#define PFN_cuTexRefGetBorderColor PFN_cuTexRefGetBorderColor_v8000
|
| 401 |
+
#define PFN_cuTexRefGetFlags PFN_cuTexRefGetFlags_v2000
|
| 402 |
+
#define PFN_cuTexRefCreate PFN_cuTexRefCreate_v2000
|
| 403 |
+
#define PFN_cuTexRefDestroy PFN_cuTexRefDestroy_v2000
|
| 404 |
+
#define PFN_cuSurfRefSetArray PFN_cuSurfRefSetArray_v3000
|
| 405 |
+
#define PFN_cuSurfRefGetArray PFN_cuSurfRefGetArray_v3000
|
| 406 |
+
#define PFN_cuTexObjectCreate PFN_cuTexObjectCreate_v5000
|
| 407 |
+
#define PFN_cuTexObjectDestroy PFN_cuTexObjectDestroy_v5000
|
| 408 |
+
#define PFN_cuTexObjectGetResourceDesc PFN_cuTexObjectGetResourceDesc_v5000
|
| 409 |
+
#define PFN_cuTexObjectGetTextureDesc PFN_cuTexObjectGetTextureDesc_v5000
|
| 410 |
+
#define PFN_cuTexObjectGetResourceViewDesc PFN_cuTexObjectGetResourceViewDesc_v5000
|
| 411 |
+
#define PFN_cuSurfObjectCreate PFN_cuSurfObjectCreate_v5000
|
| 412 |
+
#define PFN_cuSurfObjectDestroy PFN_cuSurfObjectDestroy_v5000
|
| 413 |
+
#define PFN_cuSurfObjectGetResourceDesc PFN_cuSurfObjectGetResourceDesc_v5000
|
| 414 |
+
#define PFN_cuDeviceCanAccessPeer PFN_cuDeviceCanAccessPeer_v4000
|
| 415 |
+
#define PFN_cuCtxEnablePeerAccess PFN_cuCtxEnablePeerAccess_v4000
|
| 416 |
+
#define PFN_cuCtxDisablePeerAccess PFN_cuCtxDisablePeerAccess_v4000
|
| 417 |
+
#define PFN_cuDeviceGetP2PAttribute PFN_cuDeviceGetP2PAttribute_v8000
|
| 418 |
+
#define PFN_cuGraphicsUnregisterResource PFN_cuGraphicsUnregisterResource_v3000
|
| 419 |
+
#define PFN_cuGraphicsSubResourceGetMappedArray PFN_cuGraphicsSubResourceGetMappedArray_v3000
|
| 420 |
+
#define PFN_cuGraphicsResourceGetMappedMipmappedArray PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
|
| 421 |
+
#define PFN_cuGraphicsResourceGetMappedPointer PFN_cuGraphicsResourceGetMappedPointer_v3020
|
| 422 |
+
#define PFN_cuGraphicsResourceSetMapFlags PFN_cuGraphicsResourceSetMapFlags_v6050
|
| 423 |
+
#define PFN_cuGraphicsMapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000)
|
| 424 |
+
#define PFN_cuGraphicsUnmapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
|
| 425 |
+
#define PFN_cuGetExportTable PFN_cuGetExportTable_v3000
|
| 426 |
+
#define PFN_cuFuncGetModule PFN_cuFuncGetModule_v11000
|
| 427 |
+
#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
|
| 428 |
+
#define PFN_cuGetProcAddress PFN_cuGetProcAddress_v11030
|
| 429 |
+
#define PFN_cuUserObjectCreate PFN_cuUserObjectCreate_v11030
|
| 430 |
+
#define PFN_cuUserObjectRetain PFN_cuUserObjectRetain_v11030
|
| 431 |
+
#define PFN_cuUserObjectRelease PFN_cuUserObjectRelease_v11030
|
| 432 |
+
#define PFN_cuGraphRetainUserObject PFN_cuGraphRetainUserObject_v11030
|
| 433 |
+
#define PFN_cuGraphReleaseUserObject PFN_cuGraphReleaseUserObject_v11030
|
| 434 |
+
#define PFN_cuModuleGetLoadingMode PFN_cuModuleGetLoadingMode_v11070
|
| 435 |
+
#define PFN_cuMemGetHandleForAddressRange PFN_cuMemGetHandleForAddressRange_v11070
|
| 436 |
+
|
| 437 |
+
/*
|
| 438 |
+
* Type definitions for functions defined in cuda.h
|
| 439 |
+
*/
|
| 440 |
+
typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr);
|
| 441 |
+
typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
|
| 442 |
+
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
|
| 443 |
+
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
|
| 444 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
|
| 445 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
|
| 446 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
|
| 447 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
|
| 448 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev);
|
| 449 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
|
| 450 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
|
| 451 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
|
| 452 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
|
| 453 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
|
| 454 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
|
| 455 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
|
| 456 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
|
| 457 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
|
| 458 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
|
| 459 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
|
| 460 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
|
| 461 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
|
| 462 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
|
| 463 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
|
| 464 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
|
| 465 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
|
| 466 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
|
| 467 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
|
| 468 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
|
| 469 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
|
| 470 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
|
| 471 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
|
| 472 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
|
| 473 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
|
| 474 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
|
| 475 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
|
| 476 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
|
| 477 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
|
| 478 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
|
| 479 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
|
| 480 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
|
| 481 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
|
| 482 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
|
| 483 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
|
| 484 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
|
| 485 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
|
| 486 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
|
| 487 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname);
|
| 488 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
|
| 489 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
|
| 490 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
|
| 491 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
|
| 492 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
|
| 493 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
|
| 494 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
|
| 495 |
+
typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
|
| 496 |
+
typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
|
| 497 |
+
typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
|
| 498 |
+
typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
|
| 499 |
+
typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
|
| 500 |
+
typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
|
| 501 |
+
typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total);
|
| 502 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
|
| 503 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
|
| 504 |
+
typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
|
| 505 |
+
typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
|
| 506 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
|
| 507 |
+
typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
|
| 508 |
+
typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
|
| 509 |
+
typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
|
| 510 |
+
typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
|
| 511 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
|
| 512 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
|
| 513 |
+
typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
|
| 514 |
+
typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
|
| 515 |
+
typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
|
| 516 |
+
typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
|
| 517 |
+
typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
|
| 518 |
+
typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
|
| 519 |
+
typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
|
| 520 |
+
typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
|
| 521 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
|
| 522 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
|
| 523 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
|
| 524 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
|
| 525 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
|
| 526 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
|
| 527 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
|
| 528 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
|
| 529 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
|
| 530 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
|
| 531 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
|
| 532 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
|
| 533 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
|
| 534 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
|
| 535 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
|
| 536 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
|
| 537 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
|
| 538 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
|
| 539 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
|
| 540 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
|
| 541 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
|
| 542 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
|
| 543 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
|
| 544 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
|
| 545 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
|
| 546 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
|
| 547 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
|
| 548 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
|
| 549 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
|
| 550 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
|
| 551 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
|
| 552 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
|
| 553 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
|
| 554 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
|
| 555 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
|
| 556 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
|
| 557 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray);
|
| 558 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
|
| 559 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
|
| 560 |
+
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
|
| 561 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
|
| 562 |
+
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
|
| 563 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
|
| 564 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
|
| 565 |
+
typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
|
| 566 |
+
typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
|
| 567 |
+
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
|
| 568 |
+
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
|
| 569 |
+
typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
|
| 570 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags);
|
| 571 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
|
| 572 |
+
typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
|
| 573 |
+
typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
|
| 574 |
+
typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
|
| 575 |
+
typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
|
| 576 |
+
typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
|
| 577 |
+
typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
|
| 578 |
+
typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
|
| 579 |
+
typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
|
| 580 |
+
typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
|
| 581 |
+
typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
|
| 582 |
+
typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
|
| 583 |
+
typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
|
| 584 |
+
typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream);
|
| 585 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
|
| 586 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
|
| 587 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
|
| 588 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
|
| 589 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
|
| 590 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
|
| 591 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
|
| 592 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
|
| 593 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
|
| 594 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
|
| 595 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
|
| 596 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
|
| 597 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
|
| 598 |
+
typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
|
| 599 |
+
typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
|
| 600 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
|
| 601 |
+
typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
|
| 602 |
+
typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
|
| 603 |
+
typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
|
| 604 |
+
typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
|
| 605 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags);
|
| 606 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
|
| 607 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
|
| 608 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
|
| 609 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
|
| 610 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
|
| 611 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
|
| 612 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
|
| 613 |
+
typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
|
| 614 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
|
| 615 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
|
| 616 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
|
| 617 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
|
| 618 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
|
| 619 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
|
| 620 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
|
| 621 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
|
| 622 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
|
| 623 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
|
| 624 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
|
| 625 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
|
| 626 |
+
typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
|
| 627 |
+
typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
|
| 628 |
+
typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
|
| 629 |
+
typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
|
| 630 |
+
typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
|
| 631 |
+
typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
|
| 632 |
+
typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
|
| 633 |
+
typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
|
| 634 |
+
typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
|
| 635 |
+
typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
|
| 636 |
+
typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
|
| 637 |
+
typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
|
| 638 |
+
typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
|
| 639 |
+
typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
|
| 640 |
+
typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
|
| 641 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
|
| 642 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
|
| 643 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
|
| 644 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
|
| 645 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
|
| 646 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
|
| 647 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
|
| 648 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
|
| 649 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
|
| 650 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
|
| 651 |
+
typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
|
| 652 |
+
typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
|
| 653 |
+
typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
|
| 654 |
+
typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
|
| 655 |
+
typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
|
| 656 |
+
typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
|
| 657 |
+
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
|
| 658 |
+
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags);
|
| 659 |
+
typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
|
| 660 |
+
typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z);
|
| 661 |
+
typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
|
| 662 |
+
typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
|
| 663 |
+
typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value);
|
| 664 |
+
typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
|
| 665 |
+
typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
|
| 666 |
+
/*
 * Driver-API entry-point typedefs: legacy kernel launch (cuLaunch*) and
 * CUDA graph node creation / parameter access (cuGraphAdd*, *NodeGetParams,
 * *NodeSetParams). The _vNNNN suffix is the driver version in which this
 * signature was introduced (e.g. _v10000 == CUDA 10.0).
 * NOTE(review): removed diff-viewer residue lines ("|", "| NNN |", "+") that
 * had been interleaved with the declarations; the typedefs are unchanged.
 */
typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f);
typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
/*
 * Driver-API entry-point typedefs: graph topology queries, instantiation,
 * executable-graph (CUgraphExec) node updates, node enable/disable, and the
 * graph memory-pool attribute accessors. _ptsz variants are the
 * per-thread-default-stream forms.
 * NOTE(review): removed diff-viewer residue lines ("|", "| NNN |", "+") that
 * had been interleaved with the declarations; the typedefs are unchanged.
 */
typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
/*
 * Driver-API entry-point typedefs: occupancy calculators. The cluster-size
 * variants (_v11070) correspond to the CUDA 11.7 thread-block-cluster APIs.
 * NOTE(review): removed diff-viewer residue lines ("|", "| NNN |", "+") that
 * had been interleaved with the declarations; the typedefs are unchanged.
 */
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
|
| 750 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
|
| 751 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes);
|
| 752 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
|
| 753 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
|
| 754 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
|
| 755 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
|
| 756 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
|
| 757 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
|
| 758 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
|
| 759 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
|
| 760 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor);
|
| 761 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
|
| 762 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
|
| 763 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
|
| 764 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
|
| 765 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
|
| 766 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
|
| 767 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
|
| 768 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
|
| 769 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
|
| 770 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
|
| 771 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
|
| 772 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
|
| 773 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
|
| 774 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
|
| 775 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
|
| 776 |
+
typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
|
| 777 |
+
typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
|
| 778 |
+
typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc);
|
| 779 |
+
typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
|
| 780 |
+
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
|
| 781 |
+
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
|
| 782 |
+
typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
|
| 783 |
+
typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc);
|
| 784 |
+
typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
|
| 785 |
+
typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
|
| 786 |
+
/*
 * Driver-API entry-point typedefs: peer-to-peer access, graphics interop
 * (cuGraphics*), the export table, and entry-point lookup (cuGetProcAddress).
 * _ptsz variants are the per-thread-default-stream forms.
 * NOTE(review): removed diff-viewer residue lines ("|", "| NNN |", "+") that
 * had been interleaved with the declarations; the typedefs are unchanged.
 */
typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev);
typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId);
typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
/*
 * Driver-API entry-point typedefs: memcpy/memset families in their
 * legacy-default-stream forms (no _ptds suffix) — host/device/array copies,
 * 2D/3D copies, peer copies, and linear/pitched memsets.
 * NOTE(review): removed diff-viewer residue lines ("|", "| NNN |", "+") that
 * had been interleaved with the declarations; the typedefs are unchanged.
 */
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
/*
 * Driver-API entry-point typedefs: stream/event queries and waits, kernel
 * launches, stream memory operations, stream capture, stream-ordered memory
 * allocation, user objects, and misc. CUDA 11.x additions — all in their
 * legacy-default-stream forms (no _ptds/_ptsz suffix).
 * NOTE(review): removed diff-viewer residue lines ("|", "| NNN |", "+") that
 * had been interleaved with the declarations; the typedefs are unchanged.
 */
typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority);
typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode);
typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
/*
 * Type definitions for older versioned functions in cuda.h
 */
#if defined(__CUDA_API_VERSION_INTERNAL)
/*
 * Superseded signatures kept for internal/versioned lookup. The _v2000
 * entries use the pre-CUDA-3.2 32-bit types (CUdeviceptr_v1, unsigned int
 * sizes). This conditional block continues beyond this region of the file.
 * NOTE(review): removed diff-viewer residue lines ("|", "| NNN |", "+") that
 * had been interleaved with the declarations; the typedefs are unchanged.
 */
typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
|
| 907 |
+
typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
|
| 908 |
+
typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
|
| 909 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
|
| 910 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
|
| 911 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
|
| 912 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
|
| 913 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
|
| 914 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
|
| 915 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
|
| 916 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
|
| 917 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
|
| 918 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
|
| 919 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
|
| 920 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
|
| 921 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
|
| 922 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
|
| 923 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
|
| 924 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
|
| 925 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
|
| 926 |
+
typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
|
| 927 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
|
| 928 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
|
| 929 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
|
| 930 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
|
| 931 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
|
| 932 |
+
typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
|
| 933 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
|
| 934 |
+
typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
|
| 935 |
+
typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
|
| 936 |
+
typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
|
| 937 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
|
| 938 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
|
| 939 |
+
typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
|
| 940 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
|
| 941 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
|
| 942 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
|
| 943 |
+
typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
|
| 944 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
|
| 945 |
+
typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
|
| 946 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
|
| 947 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
|
| 948 |
+
typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
|
| 949 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
|
| 950 |
+
typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
|
| 951 |
+
typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
|
| 952 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
|
| 953 |
+
#endif
|
| 954 |
+
|
| 955 |
+
#ifdef __cplusplus
|
| 956 |
+
}
|
| 957 |
+
#endif // __cplusplus
|
| 958 |
+
|
| 959 |
+
#endif // file guard
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef CUDAVDPAUTYPEDEFS_H
|
| 51 |
+
#define CUDAVDPAUTYPEDEFS_H
|
| 52 |
+
|
| 53 |
+
// Dependent includes for cudavdpau.h
|
| 54 |
+
#include <vdpau/vdpau.h>
|
| 55 |
+
|
| 56 |
+
#include <cudaVDPAU.h>
|
| 57 |
+
|
| 58 |
+
#ifdef __cplusplus
|
| 59 |
+
extern "C" {
|
| 60 |
+
#endif // __cplusplus
|
| 61 |
+
|
| 62 |
+
/*
|
| 63 |
+
* Macros for the latest version for each driver function in cudaVDPAU.h
|
| 64 |
+
*/
|
| 65 |
+
#define PFN_cuVDPAUGetDevice PFN_cuVDPAUGetDevice_v3010
|
| 66 |
+
#define PFN_cuVDPAUCtxCreate PFN_cuVDPAUCtxCreate_v3020
|
| 67 |
+
#define PFN_cuGraphicsVDPAURegisterVideoSurface PFN_cuGraphicsVDPAURegisterVideoSurface_v3010
|
| 68 |
+
#define PFN_cuGraphicsVDPAURegisterOutputSurface PFN_cuGraphicsVDPAURegisterOutputSurface_v3010
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
/**
|
| 72 |
+
* Type definitions for functions defined in cudaVDPAU.h
|
| 73 |
+
*/
|
| 74 |
+
typedef CUresult (CUDAAPI *PFN_cuVDPAUGetDevice_v3010)(CUdevice_v1 *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 75 |
+
typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3020)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 76 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterVideoSurface_v3010)(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
|
| 77 |
+
typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterOutputSurface_v3010)(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
|
| 78 |
+
|
| 79 |
+
/*
|
| 80 |
+
* Type definitions for older versioned functions in cudaVDPAU.h
|
| 81 |
+
*/
|
| 82 |
+
#if defined(__CUDA_API_VERSION_INTERNAL)
|
| 83 |
+
typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3010)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 84 |
+
#endif
|
| 85 |
+
|
| 86 |
+
#ifdef __cplusplus
|
| 87 |
+
}
|
| 88 |
+
#endif // __cplusplus
|
| 89 |
+
|
| 90 |
+
#endif // file guard
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp
ADDED
|
@@ -0,0 +1,2614 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_FP16_HPP__)
|
| 51 |
+
#define __CUDA_FP16_HPP__
|
| 52 |
+
|
| 53 |
+
#if !defined(__CUDA_FP16_H__)
|
| 54 |
+
#error "Do not include this file directly. Instead, include cuda_fp16.h."
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
#if !defined(_MSC_VER) && __cplusplus >= 201103L
|
| 58 |
+
# define __CPP_VERSION_AT_LEAST_11_FP16
|
| 59 |
+
#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
|
| 60 |
+
# define __CPP_VERSION_AT_LEAST_11_FP16
|
| 61 |
+
#endif
|
| 62 |
+
|
| 63 |
+
/* C++11 header for std::move.
|
| 64 |
+
* In RTC mode, std::move is provided implicitly; don't include the header
|
| 65 |
+
*/
|
| 66 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
|
| 67 |
+
#include <utility>
|
| 68 |
+
#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
|
| 69 |
+
|
| 70 |
+
/* C++ header for std::memcpy (used for type punning in host-side implementations).
|
| 71 |
+
* When compiling as a CUDA source file memcpy is provided implicitly.
|
| 72 |
+
* !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
|
| 73 |
+
*/
|
| 74 |
+
#if defined(__cplusplus) && !defined(__CUDACC__)
|
| 75 |
+
#include <cstring>
|
| 76 |
+
#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
/* Set up function decorations */
|
| 80 |
+
#if defined(__CUDACC__)
|
| 81 |
+
#define __CUDA_FP16_DECL__ static __device__ __inline__
|
| 82 |
+
#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
|
| 83 |
+
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
|
| 84 |
+
#define __CUDA_HOSTDEVICE__ __host__ __device__
|
| 85 |
+
#else /* !defined(__CUDACC__) */
|
| 86 |
+
#if defined(__GNUC__)
|
| 87 |
+
#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
|
| 88 |
+
#else
|
| 89 |
+
#define __CUDA_HOSTDEVICE_FP16_DECL__ static
|
| 90 |
+
#endif /* defined(__GNUC__) */
|
| 91 |
+
#define __CUDA_HOSTDEVICE__
|
| 92 |
+
#endif /* defined(__CUDACC__) */
|
| 93 |
+
|
| 94 |
+
/* Set up structure-alignment attribute */
|
| 95 |
+
#if defined(__CUDACC__)
|
| 96 |
+
#define __CUDA_ALIGN__(align) __align__(align)
|
| 97 |
+
#else
|
| 98 |
+
/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
|
| 99 |
+
#if __cplusplus >= 201103L
|
| 100 |
+
#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
|
| 101 |
+
#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
|
| 102 |
+
#if defined(__GNUC__)
|
| 103 |
+
#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
|
| 104 |
+
#elif defined(_MSC_VER)
|
| 105 |
+
#define __CUDA_ALIGN__(n) __declspec(align(n))
|
| 106 |
+
#else
|
| 107 |
+
#define __CUDA_ALIGN__(n)
|
| 108 |
+
#endif /* defined(__GNUC__) */
|
| 109 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
|
| 110 |
+
#endif /* defined(__CUDACC__) */
|
| 111 |
+
|
| 112 |
+
/* Macros to allow half & half2 to be used by inline assembly */
|
| 113 |
+
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
|
| 114 |
+
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
|
| 115 |
+
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
|
| 116 |
+
#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
|
| 117 |
+
|
| 118 |
+
/* Macros for half & half2 binary arithmetic */
|
| 119 |
+
#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
|
| 120 |
+
__half val; \
|
| 121 |
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
|
| 122 |
+
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
|
| 123 |
+
return val; \
|
| 124 |
+
} /* while(0) */
|
| 125 |
+
#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
|
| 126 |
+
__half2 val; \
|
| 127 |
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
|
| 128 |
+
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
|
| 129 |
+
return val; \
|
| 130 |
+
} /* while(0) */
|
| 131 |
+
#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
|
| 132 |
+
__half val; \
|
| 133 |
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
|
| 134 |
+
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
|
| 135 |
+
return val; \
|
| 136 |
+
} /* while(0) */
|
| 137 |
+
#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
|
| 138 |
+
__half2 val; \
|
| 139 |
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
|
| 140 |
+
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
|
| 141 |
+
return val; \
|
| 142 |
+
} /* while(0) */
|
| 143 |
+
|
| 144 |
+
/**
|
| 145 |
+
* Types which allow static initialization of "half" and "half2" until
|
| 146 |
+
* these become an actual builtin. Note this initialization is as a
|
| 147 |
+
* bitfield representation of "half", and not a conversion from short->half.
|
| 148 |
+
* Such a representation will be deprecated in a future version of CUDA.
|
| 149 |
+
* (Note these are visible to non-nvcc compilers, including C-only compilation)
|
| 150 |
+
*/
|
| 151 |
+
/*
 * Raw storage type for "half": one 16-bit field holding the binary16 bit
 * pattern. Initializing through this type reinterprets bits; it is NOT a
 * numeric short->half conversion. Visible to C-only compilation as well.
 */
typedef struct __CUDA_ALIGN__(2) {
    unsigned short x;
} __half_raw;
|
| 154 |
+
|
| 155 |
+
/*
 * Raw storage type for "half2": two 16-bit binary16 bit patterns packed
 * into 32 bits (x = low element, y = high element).
 */
typedef struct __CUDA_ALIGN__(4) {
    unsigned short x;
    unsigned short y;
} __half2_raw;
|
| 159 |
+
|
| 160 |
+
/* All other definitions in this file are only visible to C++ compilers */
|
| 161 |
+
#if defined(__cplusplus)
|
| 162 |
+
|
| 163 |
+
/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
|
| 164 |
+
#if defined(__GNUC__)
|
| 165 |
+
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
|
| 166 |
+
#pragma GCC diagnostic push
|
| 167 |
+
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
|
| 168 |
+
#pragma GCC diagnostic ignored "-Weffc++"
|
| 169 |
+
#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
|
| 170 |
+
#endif /* defined(__GNUC__) */
|
| 171 |
+
|
| 172 |
+
/* class' : multiple assignment operators specified
|
| 173 |
+
The class has multiple assignment operators of a single type. This warning is informational */
|
| 174 |
+
#if defined(_MSC_VER) && _MSC_VER >= 1500
|
| 175 |
+
#pragma warning( push )
|
| 176 |
+
#pragma warning( disable:4522 )
|
| 177 |
+
#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
|
| 178 |
+
|
| 179 |
+
/*
 * 16-bit floating point type (binary16). Holds only the raw bit pattern;
 * every numeric conversion is routed through the __float2half-family
 * intrinsics defined later in this file.
 */
struct __CUDA_ALIGN__(2) __half {
protected:
    /* raw binary16 bit pattern */
    unsigned short __x;

public:
#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
    __half() = default;
#else
    __CUDA_HOSTDEVICE__ __half() { }
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */

    /* Bitwise conversions to/from the raw storage type */
    __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr)
    {
        __x = hr.x;
        return *this;
    }
    __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile
    {
        __x = hr.x;
        return *this;
    }
    __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile
    {
        __x = hr.x;
        return *this;
    }
    __CUDA_HOSTDEVICE__ operator __half_raw() const
    {
        __half_raw ret;
        ret.x = __x;
        return ret;
    }
    __CUDA_HOSTDEVICE__ operator __half_raw() const volatile
    {
        __half_raw ret;
        ret.x = __x;
        return ret;
    }

#if !defined(__CUDA_NO_HALF_CONVERSIONS__)

    /* Construct from float/double */
    __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
    __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }

    __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
    __CUDA_HOSTDEVICE__ __half &operator=(const float f)
    {
        __x = __float2half(f).__x;
        return *this;
    }

    /* No "cast to double" operator, so the preferred up-cast stays unambiguous */
    __CUDA_HOSTDEVICE__ __half &operator=(const double f)
    {
        __x = __double2half(f).__x;
        return *this;
    }

    /* Integer conversions are member functions only under nvcc so far */
#if defined(__CUDACC__)
    /* Implicit construction from the integer types natively supported in
       hardware. Assignment inside the body (not an init-list) sidesteps
       special host/device compilation rules. */
    __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; }
    __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; }
    __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; }
    __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; }
    __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; }
    __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; }

    /* Implicit casts to supported builtin integer types (round-toward-zero
       intrinsics), mirroring what is permitted for float */
    __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); }
    __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = __short2half_rn(val).__x; return *this; }

    __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); }
    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }

    __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); }
    __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }

    __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); }
    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }

    __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); }
    __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }

    __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); }
    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }

    /* Truthiness tests the magnitude bits only, so both +0 and -0 are false */
    __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
#endif /* defined(__CUDACC__) */
#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
};
|
| 245 |
+
|
| 246 |
+
/* Global-space operator functions are only available to nvcc compilation */
|
| 247 |
+
#if defined(__CUDACC__)
|
| 248 |
+
|
| 249 |
+
/* Arithmetic FP16 operations only supported on arch >= 5.3 */
|
| 250 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
| 251 |
+
#if !defined(__CUDA_NO_HALF_OPERATORS__)
|
| 252 |
+
/* Some basic arithmetic operations expected of a builtin */
|
| 253 |
+
__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
|
| 254 |
+
__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
|
| 255 |
+
__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
|
| 256 |
+
__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
|
| 257 |
+
|
| 258 |
+
__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; }
|
| 259 |
+
__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
|
| 260 |
+
__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
|
| 261 |
+
__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
|
| 262 |
+
|
| 263 |
+
/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
|
| 264 |
+
__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; }
|
| 265 |
+
__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
|
| 266 |
+
__device__ __forceinline__ __half operator++(__half &h, const int ignored)
|
| 267 |
+
{
|
| 268 |
+
// ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
|
| 269 |
+
static_cast<void>(ignored);
|
| 270 |
+
|
| 271 |
+
const __half ret = h;
|
| 272 |
+
__half_raw one;
|
| 273 |
+
one.x = 0x3C00U;
|
| 274 |
+
h += one;
|
| 275 |
+
return ret;
|
| 276 |
+
}
|
| 277 |
+
__device__ __forceinline__ __half operator--(__half &h, const int ignored)
|
| 278 |
+
{
|
| 279 |
+
// ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
|
| 280 |
+
static_cast<void>(ignored);
|
| 281 |
+
|
| 282 |
+
const __half ret = h;
|
| 283 |
+
__half_raw one;
|
| 284 |
+
one.x = 0x3C00U;
|
| 285 |
+
h -= one;
|
| 286 |
+
return ret;
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
/* Unary plus and inverse operators */
|
| 290 |
+
__device__ __forceinline__ __half operator+(const __half &h) { return h; }
|
| 291 |
+
__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
|
| 292 |
+
|
| 293 |
+
/* Some basic comparison operations to make it look like a builtin */
|
| 294 |
+
__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
|
| 295 |
+
__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); }
|
| 296 |
+
__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
|
| 297 |
+
__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
|
| 298 |
+
__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
|
| 299 |
+
__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
|
| 300 |
+
#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
|
| 301 |
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
|
| 302 |
+
#endif /* defined(__CUDACC__) */
|
| 303 |
+
|
| 304 |
+
/* __half2 is visible to non-nvcc host compilers */
|
| 305 |
+
/*
 * Pair of half values packed into 32 bits (x = low element, y = high
 * element). Copy/assign/move transfer the pair as one 32-bit word.
 */
struct __CUDA_ALIGN__(4) __half2 {
    __half x;
    __half y;

    // All construct/copy/assign/move
public:
#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
    __half2() = default;
    __CUDA_HOSTDEVICE__ __half2(const __half2 &&src)
    {
        __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src));
    }
    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src)
    {
        __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src));
        return *this;
    }
#else
    __CUDA_HOSTDEVICE__ __half2() { }
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
    __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
    __CUDA_HOSTDEVICE__ __half2(const __half2 &src)
    {
        __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src);
    }
    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src)
    {
        __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src);
        return *this;
    }

    /* Bitwise conversions to/from the raw storage type */
    __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r )
    {
        __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r);
    }
    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r)
    {
        __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r);
        return *this;
    }
    __CUDA_HOSTDEVICE__ operator __half2_raw() const
    {
        /* zero-init first, then overwrite as one 32-bit store */
        __half2_raw ret;
        ret.x = 0U;
        ret.y = 0U;
        __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this);
        return ret;
    }
};
|
| 327 |
+
|
| 328 |
+
/* Global-space operator functions are only available to nvcc compilation */
|
| 329 |
+
#if defined(__CUDACC__)
|
| 330 |
+
|
| 331 |
+
/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
|
| 332 |
+
#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
|
| 333 |
+
|
| 334 |
+
/* Elementwise arithmetic on both halves of a __half2 */
__device__ __forceinline__ __half2 operator+(const __half2 &a, const __half2 &b) { return __hadd2(a, b); }
__device__ __forceinline__ __half2 operator-(const __half2 &a, const __half2 &b) { return __hsub2(a, b); }
__device__ __forceinline__ __half2 operator*(const __half2 &a, const __half2 &b) { return __hmul2(a, b); }
__device__ __forceinline__ __half2 operator/(const __half2 &a, const __half2 &b) { return __h2div(a, b); }

__device__ __forceinline__ __half2& operator+=(__half2 &a, const __half2 &b) { a = __hadd2(a, b); return a; }
__device__ __forceinline__ __half2& operator-=(__half2 &a, const __half2 &b) { a = __hsub2(a, b); return a; }
__device__ __forceinline__ __half2& operator*=(__half2 &a, const __half2 &b) { a = __hmul2(a, b); return a; }
__device__ __forceinline__ __half2& operator/=(__half2 &a, const __half2 &b) { a = __h2div(a, b); return a; }

/* Increment/decrement add/subtract (1.0, 1.0), built per lane from the raw
   bit pattern 0x3C00U to avoid float->half conversions */
__device__ __forceinline__ __half2 &operator++(__half2 &h)
{
    __half2_raw raw_ones;
    raw_ones.x = 0x3C00U;
    raw_ones.y = 0x3C00U;
    h = __hadd2(h, raw_ones);
    return h;
}
__device__ __forceinline__ __half2 &operator--(__half2 &h)
{
    __half2_raw raw_ones;
    raw_ones.x = 0x3C00U;
    raw_ones.y = 0x3C00U;
    h = __hsub2(h, raw_ones);
    return h;
}
__device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
{
    // "ignored" exists only to distinguish postfix from prefix.
    static_cast<void>(ignored);
    const __half2 prev = h;
    __half2_raw raw_ones;
    raw_ones.x = 0x3C00U;
    raw_ones.y = 0x3C00U;
    h = __hadd2(h, raw_ones);
    return prev;
}
__device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored)
{
    // "ignored" exists only to distinguish postfix from prefix.
    static_cast<void>(ignored);
    const __half2 prev = h;
    __half2_raw raw_ones;
    raw_ones.x = 0x3C00U;
    raw_ones.y = 0x3C00U;
    h = __hsub2(h, raw_ones);
    return prev;
}

/* Unary plus and elementwise negation */
__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }

/* Comparisons delegate to the __hb*2 intrinsics defined elsewhere in this
   file (NOTE(review): presumably "both lanes must satisfy" semantics —
   confirm against the intrinsic definitions) */
__device__ __forceinline__ bool operator==(const __half2 &a, const __half2 &b) { return __hbeq2(a, b); }
__device__ __forceinline__ bool operator!=(const __half2 &a, const __half2 &b) { return __hbneu2(a, b); }
__device__ __forceinline__ bool operator>(const __half2 &a, const __half2 &b) { return __hbgt2(a, b); }
__device__ __forceinline__ bool operator<(const __half2 &a, const __half2 &b) { return __hblt2(a, b); }
__device__ __forceinline__ bool operator>=(const __half2 &a, const __half2 &b) { return __hbge2(a, b); }
__device__ __forceinline__ bool operator<=(const __half2 &a, const __half2 &b) { return __hble2(a, b); }
|
| 380 |
+
|
| 381 |
+
#endif /* (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__) */
|
| 382 |
+
#endif /* defined(__CUDACC__) */
|
| 383 |
+
|
| 384 |
+
/* Restore warning for multiple assignment operators */
|
| 385 |
+
#if defined(_MSC_VER) && _MSC_VER >= 1500
|
| 386 |
+
#pragma warning( pop )
|
| 387 |
+
#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
|
| 388 |
+
|
| 389 |
+
/* Restore -Weffc++ warnings from here on */
|
| 390 |
+
#if defined(__GNUC__)
|
| 391 |
+
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
|
| 392 |
+
#pragma GCC diagnostic pop
|
| 393 |
+
#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
|
| 394 |
+
#endif /* defined(__GNUC__) */
|
| 395 |
+
|
| 396 |
+
#undef __CUDA_HOSTDEVICE__
|
| 397 |
+
#undef __CUDA_ALIGN__
|
| 398 |
+
|
| 399 |
+
#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
|
| 400 |
+
/*
 * Host-side float -> half conversion core. Returns the binary16 pattern
 * truncated toward zero; "sign" receives the sign bit already in half
 * position (0x8000 or 0), and "remainder" receives the discarded mantissa
 * bits left-aligned in 32 bits, so callers can implement the different
 * rounding modes on top of this one routine.
 */
static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
{
    unsigned int bits;
#if defined(__CUDACC__)
    (void)memcpy(&bits, &f, sizeof(f));
#else
    (void)std::memcpy(&bits, &f, sizeof(f));
#endif
    unsigned int abs_bits = (bits & 0x7fffffffU);
    sign = ((bits >> 16U) & 0x8000U);
    unsigned int out;
    if (abs_bits >= 0x7f800000U) {
        /* NaN or +/-Inf: NaN canonicalizes to 0x7fff */
        remainder = 0U;
        out = ((abs_bits == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
    } else if (abs_bits > 0x477fefffU) {
        /* finite but too large for half: clamp to the largest finite half;
           the sentinel remainder lets round-to-nearest callers bump this
           to infinity */
        remainder = 0x80000000U;
        out = (sign | 0x7bffU);
    } else if (abs_bits >= 0x38800000U) {
        /* normal half range: rebase the exponent and drop 13 mantissa bits */
        remainder = abs_bits << 19U;
        abs_bits -= 0x38000000U;
        out = (sign | (abs_bits >> 13U));
    } else if (abs_bits < 0x33000001U) {
        /* magnitude truncates to +/-0 */
        remainder = abs_bits;
        out = sign;
    } else {
        /* subnormal half */
        const unsigned int exponent = abs_bits >> 23U;
        const unsigned int shift = 0x7eU - exponent;
        unsigned int mantissa = (abs_bits & 0x7fffffU);
        mantissa |= 0x800000U; /* make the implicit leading bit explicit */
        remainder = mantissa << (32U - shift);
        out = (sign | (mantissa >> shift));
        out &= 0x0000FFFFU;
    }
    return static_cast<unsigned short>(out);
}
|
| 437 |
+
#endif /* #if !defined(__CUDACC_RTC__) */
|
| 438 |
+
|
| 439 |
+
/*
 * Convert double -> half, round-to-nearest-even.
 * The host path pre-rounds the double to 11 mantissa bits (still in double
 * precision) before the double->float->half chain, so no double rounding
 * can occur.
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
{
#if defined(__CUDA_ARCH__)
    __half val;
    asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
    return val;
#else
    __half result;
    unsigned long long int src_bits;
#if defined(__CUDACC__)
    (void)memcpy(&src_bits, &a, sizeof(a));
#else
    (void)std::memcpy(&src_bits, &a, sizeof(a));
#endif
    const unsigned long long int abs_bits = (src_bits & 0x7fffffffffffffffULL);
    if ((abs_bits >= 0x40f0000000000000ULL) || (abs_bits <= 0x3e60000000000000ULL))
    {
        /*
        // |a| >= 2^16, NaN, or |a| <= 2^(-25):
        // double rounding cannot occur, convert directly
        */
        result = __float2half(static_cast<float>(a));
    }
    else
    {
        /*
        // Here 2^(-25) < |a| < 2^16.
        // Compute (a + shifter) - shifter in double precision, where the
        // shifter's exponent is exponent(a) + (53 - 11) = exponent(a) + 42,
        // so the addition rounds a to 11 bits of precision with
        // round-to-nearest-even. A guard bit in the shifter's mantissa
        // protects against negative inputs.
        */
        unsigned long long shifter_bits;
        if (abs_bits >= 0x3f10000000000000ULL)
        {
            /* |a| >= 2^(-14): add 42 to a's exponent bits */
            shifter_bits = (src_bits & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
        }
        else
        {
            /*
            // 2^(-25) < |a| < 2^(-14): result may be a denormal half,
            // so use the fixed exponent 42 - 14 + bias
            */
            shifter_bits = 0x41B0000000000000ULL;
        }
        /* leading mantissa bit guards against negative inputs */
        shifter_bits |= 0x0008000000000000ULL;
        double shifter;
#if defined(__CUDACC__)
        (void)memcpy(&shifter, &shifter_bits, sizeof(shifter_bits));
#else
        (void)std::memcpy(&shifter, &shifter_bits, sizeof(shifter_bits));
#endif
        double rounded = a + shifter;

        /*
        // Round-trip through an integer with a harmless mask so the
        // compiler cannot fold (a + shifter) - shifter back into a.
        */
        unsigned long long int rounded_bits;
#if defined(__CUDACC__)
        (void)memcpy(&rounded_bits, &rounded, sizeof(rounded));
#else
        (void)std::memcpy(&rounded_bits, &rounded, sizeof(rounded));
#endif
        /* the value is positive here, so clearing the sign bit is a no-op */
        rounded_bits &= 0x7fffffffffffffffULL;

#if defined(__CUDACC__)
        (void)memcpy(&rounded, &rounded_bits, sizeof(rounded));
#else
        (void)std::memcpy(&rounded, &rounded_bits, sizeof(rounded));
#endif
        result = __float2half(static_cast<float>(rounded - shifter));
    }

    return result;
#endif
}
|
| 536 |
+
|
| 537 |
+
/*
 * Convert float -> half, round-to-nearest-even.
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
{
    __half val;
#if defined(__CUDA_ARCH__)
    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
#else
    __half_raw r;
    unsigned int sign_bit = 0U;
    unsigned int discarded = 0U;
    r.x = __internal_float2half(a, sign_bit, discarded);
    /* nearest-even: round up when the discarded bits exceed half an ulp,
       or on an exact tie when the truncated result is odd */
    const bool round_up = (discarded > 0x80000000U)
                       || ((discarded == 0x80000000U) && ((r.x & 0x1U) != 0U));
    if (round_up) {
        r.x++;
    }
    val = r;
#endif
    return val;
}
|
| 554 |
+
/*
 * Convert float -> half, round-to-nearest-even (same behavior as
 * __float2half).
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
{
    __half val;
#if defined(__CUDA_ARCH__)
    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
#else
    __half_raw r;
    unsigned int sign_bit = 0U;
    unsigned int discarded = 0U;
    r.x = __internal_float2half(a, sign_bit, discarded);
    /* nearest-even: round up when the discarded bits exceed half an ulp,
       or on an exact tie when the truncated result is odd */
    const bool round_up = (discarded > 0x80000000U)
                       || ((discarded == 0x80000000U) && ((r.x & 0x1U) != 0U));
    if (round_up) {
        r.x++;
    }
    val = r;
#endif
    return val;
}
|
| 571 |
+
/*
 * Convert float -> half, round toward zero (truncate: the discarded
 * mantissa bits are simply dropped).
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
{
    __half val;
#if defined(__CUDA_ARCH__)
    asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
#else
    __half_raw r;
    unsigned int sign_bit = 0U;
    unsigned int discarded = 0U;
    r.x = __internal_float2half(a, sign_bit, discarded);
    val = r;
#endif
    return val;
}
|
| 585 |
+
/*
 * Convert float -> half, round toward negative infinity: a negative value
 * with any discarded bits gets its magnitude bumped by one ulp.
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
{
    __half val;
#if defined(__CUDA_ARCH__)
    asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
#else
    __half_raw r;
    unsigned int sign_bit = 0U;
    unsigned int discarded = 0U;
    r.x = __internal_float2half(a, sign_bit, discarded);
    if ((discarded != 0U) && (sign_bit != 0U)) {
        r.x++;
    }
    val = r;
#endif
    return val;
}
|
| 602 |
+
/*
 * Convert float -> half, round toward positive infinity: a positive value
 * with any discarded bits gets its magnitude bumped by one ulp.
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
{
    __half val;
#if defined(__CUDA_ARCH__)
    asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
#else
    __half_raw r;
    unsigned int sign_bit = 0U;
    unsigned int discarded = 0U;
    r.x = __internal_float2half(a, sign_bit, discarded);
    if ((discarded != 0U) && (sign_bit == 0U)) {
        r.x++;
    }
    val = r;
#endif
    return val;
}
|
| 619 |
+
/*
 * Convert one float to half (round-to-nearest-even) and replicate it into
 * both elements of a half2.
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
{
    __half2 val;
#if defined(__CUDA_ARCH__)
    asm("{.reg .f16 low;\n"
        " cvt.rn.f16.f32 low, %1;\n"
        " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
#else
    const __half h = __float2half_rn(a);
    val = __half2(h, h);
#endif
    return val;
}
|
| 631 |
+
/*
 * Convert two floats to halves (round-to-nearest-even) and pack them into
 * a half2: a -> low element, b -> high element.
 */
__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
{
    __half2 val;
#if defined(__CUDA_ARCH__)
#if (__CUDA_ARCH__ >= 800)
    /* single-instruction path; note the swapped %2, %1 keeps a in the low half */
    asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
        : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
#else
    asm("{.reg .f16 low,high;\n"
        " cvt.rn.f16.f32 low, %1;\n"
        " cvt.rn.f16.f32 high, %2;\n"
        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
#endif
#else
    val = __half2(__float2half_rn(a), __float2half_rn(b));
#endif
    return val;
}
|
| 649 |
+
|
| 650 |
+
#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
|
| 651 |
+
/*
 * Host-side half -> float conversion core: expand the binary16 fields of
 * "h" into a binary32 pattern. Handles Inf/NaN, normals, subnormals and
 * zeros; the conversion is exact since every half value is representable
 * as a float.
 */
static inline float __internal_half2float(const unsigned short h)
{
    unsigned int sgn = ((static_cast<unsigned int>(h) >> 15U) & 1U);
    unsigned int exp_field = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
    unsigned int mant = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
    float f;
    if (exp_field == 0x1fU) {
        /* Inf or NaN; a NaN's sign is discarded and its payload canonicalized */
        sgn = ((mant != 0U) ? (sgn >> 1U) : sgn);
        mant = ((mant != 0U) ? 0x7fffffU : 0U);
        exp_field = 0xffU;
    } else if (exp_field == 0U) {
        /* subnormal or zero */
        if (mant != 0U) {
            /* renormalize: shift the mantissa up until its leading bit
               reaches the implicit-one position, adjusting the exponent */
            unsigned int lead;
            exp_field = 0x71U;
            do {
                lead = (mant & 0x400000U);
                mant <<= 1U;
                --exp_field;
            } while (lead == 0U);
            mant &= 0x7fffffU; /* drop the now-implicit leading 1 */
        }
    } else {
        /* normal number: rebias the exponent (127 - 15 = 0x70) */
        exp_field += 0x70U;
    }
    const unsigned int pattern = ((sgn << 31U) | (exp_field << 23U) | mant);
#if defined(__CUDACC__)
    (void)memcpy(&f, &pattern, sizeof(pattern));
#else
    (void)std::memcpy(&f, &pattern, sizeof(pattern));
#endif
    return f;
}
|
| 684 |
+
#endif /* !defined(__CUDACC_RTC__) */
|
| 685 |
+
|
| 686 |
+
/*
 * Convert half -> float (exact).
 */
__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
{
#if defined(__CUDA_ARCH__)
    float f;
    asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(f) : "h"(__HALF_TO_CUS(a)));
    return f;
#else
    return __internal_half2float(static_cast<__half_raw>(a).x);
#endif
}
|
| 696 |
+
/* Convert the low (x) element of a half2 to float. */
__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
{
    float f;
#if defined(__CUDA_ARCH__)
    /* unpack the pair, widen only the low element */
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high},%1;\n"
        " cvt.f32.f16 %0, low;}\n" : "=f"(f) : "r"(__HALF2_TO_CUI(a)));
#else
    f = __internal_half2float(static_cast<__half2_raw>(a).x);
#endif
    return f;
}
|
| 708 |
+
/* Convert the high (y) element of a half2 to float. */
__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
{
    float f;
#if defined(__CUDA_ARCH__)
    /* unpack the pair, widen only the high element */
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high},%1;\n"
        " cvt.f32.f16 %0, high;}\n" : "=f"(f) : "r"(__HALF2_TO_CUI(a)));
#else
    f = __internal_half2float(static_cast<__half2_raw>(a).y);
#endif
    return f;
}
|
| 720 |
+
/* Convert half to signed short, rounding toward zero; saturates, NaN -> 0. */
__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
{
    short int result;
#if defined __CUDA_ARCH__
    asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(result) : "h"(__HALF_TO_CUS(h)));
#else
    const float fval = __half2float(h);
    const short int kMax = (short int)0x7fffU;
    const short int kMin = (short int)0x8000U;
    /* shift out the sign bit: anything above the +Inf pattern is a NaN */
    const unsigned short payload = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
    if (payload > (unsigned short)0xF800U) {
        /* NaN maps to zero, matching device cvt behavior */
        result = 0;
    } else if (fval > static_cast<float>(kMax)) {
        /* clamp above */
        result = kMax;
    } else if (fval < static_cast<float>(kMin)) {
        /* clamp below */
        result = kMin;
    } else {
        /* in range: C++ float-to-int truncation matches cvt.rzi */
        result = static_cast<short int>(fval);
    }
#endif
    return result;
}
|
| 747 |
+
/* Convert half to unsigned short, rounding toward zero; saturates, NaN -> 0. */
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
{
    unsigned short int result;
#if defined __CUDA_ARCH__
    asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(result) : "h"(__HALF_TO_CUS(h)));
#else
    const float fval = __half2float(h);
    const unsigned short int kMax = 0xffffU;
    const unsigned short int kMin = 0U;
    /* shift out the sign bit: anything above the +Inf pattern is a NaN */
    const unsigned short payload = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
    if (payload > (unsigned short)0xF800U) {
        /* NaN maps to zero, matching device cvt behavior */
        result = 0U;
    } else if (fval > static_cast<float>(kMax)) {
        /* clamp above */
        result = kMax;
    } else if (fval < static_cast<float>(kMin)) {
        /* clamp below (covers all negative inputs) */
        result = kMin;
    } else {
        /* in range: C++ float-to-int truncation matches cvt.rzi */
        result = static_cast<unsigned short int>(fval);
    }
#endif
    return result;
}
|
| 774 |
+
/* Convert half to signed int, rounding toward zero; saturates, NaN -> 0. */
__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
{
    int result;
#if defined __CUDA_ARCH__
    asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(result) : "h"(__HALF_TO_CUS(h)));
#else
    const float fval = __half2float(h);
    const int kMax = (int)0x7fffffffU;
    const int kMin = (int)0x80000000U;
    /* shift out the sign bit: anything above the +Inf pattern is a NaN */
    const unsigned short payload = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
    if (payload > (unsigned short)0xF800U) {
        /* NaN maps to zero, matching device cvt behavior */
        result = 0;
    } else if (fval > static_cast<float>(kMax)) {
        /* clamp above */
        result = kMax;
    } else if (fval < static_cast<float>(kMin)) {
        /* clamp below */
        result = kMin;
    } else {
        /* in range: C++ float-to-int truncation matches cvt.rzi */
        result = static_cast<int>(fval);
    }
#endif
    return result;
}
|
| 801 |
+
/* Convert half to unsigned int, rounding toward zero; saturates, NaN -> 0. */
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
{
    unsigned int result;
#if defined __CUDA_ARCH__
    asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(result) : "h"(__HALF_TO_CUS(h)));
#else
    const float fval = __half2float(h);
    const unsigned int kMax = 0xffffffffU;
    const unsigned int kMin = 0U;
    /* shift out the sign bit: anything above the +Inf pattern is a NaN */
    const unsigned short payload = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
    if (payload > (unsigned short)0xF800U) {
        /* NaN maps to zero, matching device cvt behavior */
        result = 0U;
    } else if (fval > static_cast<float>(kMax)) {
        /* clamp above */
        result = kMax;
    } else if (fval < static_cast<float>(kMin)) {
        /* clamp below (covers all negative inputs) */
        result = kMin;
    } else {
        /* in range: C++ float-to-int truncation matches cvt.rzi */
        result = static_cast<unsigned int>(fval);
    }
#endif
    return result;
}
|
| 828 |
+
/* Convert half to signed 64-bit, rounding toward zero; saturates, NaN -> LLONG_MIN. */
__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
{
    long long int result;
#if defined __CUDA_ARCH__
    asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(result) : "h"(__HALF_TO_CUS(h)));
#else
    const float fval = __half2float(h);
    const long long int kMax = (long long int)0x7fffffffffffffffULL;
    const long long int kMin = (long long int)0x8000000000000000ULL;
    /* shift out the sign bit: anything above the +Inf pattern is a NaN */
    const unsigned short payload = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
    if (payload > (unsigned short)0xF800U) {
        /* NaN maps to the minimum value, matching device cvt behavior */
        result = kMin;
    } else if (fval > static_cast<float>(kMax)) {
        /* clamp above */
        result = kMax;
    } else if (fval < static_cast<float>(kMin)) {
        /* clamp below */
        result = kMin;
    } else {
        /* in range: C++ float-to-int truncation matches cvt.rzi */
        result = static_cast<long long int>(fval);
    }
#endif
    return result;
}
|
| 855 |
+
/* Convert half to unsigned 64-bit, rounding toward zero; saturates,
   NaN -> 0x8000000000000000 (matching device cvt behavior). */
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
{
    unsigned long long int result;
#if defined __CUDA_ARCH__
    asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(result) : "h"(__HALF_TO_CUS(h)));
#else
    const float fval = __half2float(h);
    const unsigned long long int kMax = 0xffffffffffffffffULL;
    const unsigned long long int kMin = 0ULL;
    /* shift out the sign bit: anything above the +Inf pattern is a NaN */
    const unsigned short payload = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
    if (payload > (unsigned short)0xF800U) {
        /* NaN: mirror the device result for u64 conversion */
        result = 0x8000000000000000ULL;
    } else if (fval > static_cast<float>(kMax)) {
        /* clamp above */
        result = kMax;
    } else if (fval < static_cast<float>(kMin)) {
        /* clamp below (covers all negative inputs) */
        result = kMin;
    } else {
        /* in range: C++ float-to-int truncation matches cvt.rzi */
        result = static_cast<unsigned long long int>(fval);
    }
#endif
    return result;
}
|
| 882 |
+
|
| 883 |
+
/* Intrinsic functions only available to nvcc compilers */
|
| 884 |
+
#if defined(__CUDACC__)
|
| 885 |
+
|
| 886 |
+
/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y)
{
    __half2 pair;
    pair.x = x;
    pair.y = y;
    return pair;
}
|
| 891 |
+
#undef __VECTOR_FUNCTIONS_DECL__
|
| 892 |
+
|
| 893 |
+
|
| 894 |
+
/* Definitions of intrinsics */
|
| 895 |
+
/* Convert a float2 to half2, round-to-nearest-even on each element. */
__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
{
    return __floats2half2_rn(a.x, a.y);
}
|
| 900 |
+
/* Widen both elements of a half2 into a float2 (exact). */
__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
{
    float lo;
    float hi;
#if defined(__CUDA_ARCH__)
    /* two unpack+convert sequences, one per element */
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high},%1;\n"
        " cvt.f32.f16 %0, low;}\n" : "=f"(lo) : "r"(__HALF2_TO_CUI(a)));
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high},%1;\n"
        " cvt.f32.f16 %0, high;}\n" : "=f"(hi) : "r"(__HALF2_TO_CUI(a)));
#else
    lo = __internal_half2float(((__half2_raw)a).x);
    hi = __internal_half2float(((__half2_raw)a).y);
#endif
    return make_float2(lo, hi);
}
|
| 918 |
+
/* half -> int, round to nearest even (device only). */
__CUDA_FP16_DECL__ int __half2int_rn(const __half h)
{
    int ret;
    asm("cvt.rni.s32.f16 %0, %1;" : "=r"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> int, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ int __half2int_rd(const __half h)
{
    int ret;
    asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> int, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ int __half2int_ru(const __half h)
{
    int ret;
    asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* int -> half, round to nearest even (host and device). */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
{
    __half res;
#if defined(__CUDA_ARCH__)
    asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
#else
    /* Going through float cannot double-round: any int wide enough to
       round in the float step is already far outside half range, so the
       final half result is infinity either way. */
    res = __float2half_rn(static_cast<float>(i));
#endif
    return res;
}
/* int -> half, round toward zero (device only). */
__CUDA_FP16_DECL__ __half __int2half_rz(const int i)
{
    __half res;
    asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
    return res;
}
/* int -> half, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ __half __int2half_rd(const int i)
{
    __half res;
    asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
    return res;
}
/* int -> half, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ __half __int2half_ru(const int i)
{
    __half res;
    asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
    return res;
}
|
| 969 |
+
|
| 970 |
+
/* half -> short, round to nearest even (device only). */
__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
{
    short int ret;
    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> short, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
{
    short int ret;
    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> short, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
{
    short int ret;
    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* short -> half, round to nearest even (host and device).
   Every short fits exactly in float, so the host's single half rounding
   step gives the correctly rounded result. */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
{
    __half res;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
#else
    res = __float2half_rn(static_cast<float>(i));
#endif
    return res;
}
/* short -> half, round toward zero (device only). */
__CUDA_FP16_DECL__ __half __short2half_rz(const short int i)
{
    __half res;
    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
    return res;
}
/* short -> half, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ __half __short2half_rd(const short int i)
{
    __half res;
    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
    return res;
}
/* short -> half, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ __half __short2half_ru(const short int i)
{
    __half res;
    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
    return res;
}
|
| 1017 |
+
|
| 1018 |
+
/* half -> unsigned int, round to nearest even (device only). */
__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
{
    unsigned int ret;
    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> unsigned int, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
{
    unsigned int ret;
    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> unsigned int, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
{
    unsigned int ret;
    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* unsigned int -> half, round to nearest even (host and device). */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
{
    __half res;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
#else
    /* Going through float cannot double-round: any value wide enough to
       round in the float step is already far outside half range, so the
       final half result is infinity either way. */
    res = __float2half_rn(static_cast<float>(i));
#endif
    return res;
}
/* unsigned int -> half, round toward zero (device only). */
__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
{
    __half res;
    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
    return res;
}
/* unsigned int -> half, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
{
    __half res;
    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
    return res;
}
/* unsigned int -> half, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
{
    __half res;
    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(res)) : "r"(i));
    return res;
}
|
| 1069 |
+
|
| 1070 |
+
/* half -> unsigned short, round to nearest even (device only). */
__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
{
    unsigned short int ret;
    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> unsigned short, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
{
    unsigned short int ret;
    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> unsigned short, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
{
    unsigned short int ret;
    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* unsigned short -> half, round to nearest even (host and device).
   Every unsigned short fits exactly in float, so the host's single half
   rounding step gives the correctly rounded result. */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
{
    __half res;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
#else
    res = __float2half_rn(static_cast<float>(i));
#endif
    return res;
}
/* unsigned short -> half, round toward zero (device only). */
__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
{
    __half res;
    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
    return res;
}
/* unsigned short -> half, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
{
    __half res;
    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
    return res;
}
/* unsigned short -> half, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
{
    __half res;
    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(i));
    return res;
}
|
| 1117 |
+
|
| 1118 |
+
/* half -> unsigned 64-bit, round to nearest even (device only). */
__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
{
    unsigned long long int ret;
    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> unsigned 64-bit, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
{
    unsigned long long int ret;
    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> unsigned 64-bit, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
{
    unsigned long long int ret;
    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* unsigned 64-bit -> half, round to nearest even (host and device). */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
{
    __half res;
#if defined(__CUDA_ARCH__)
    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
#else
    /* Going through float cannot double-round: any value wide enough to
       round in the float step is already far outside half range, so the
       final half result is infinity either way. */
    res = __float2half_rn(static_cast<float>(i));
#endif
    return res;
}
/* unsigned 64-bit -> half, round toward zero (device only). */
__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
{
    __half res;
    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
    return res;
}
/* unsigned 64-bit -> half, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
{
    __half res;
    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
    return res;
}
/* unsigned 64-bit -> half, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
{
    __half res;
    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
    return res;
}
|
| 1169 |
+
|
| 1170 |
+
/* half -> signed 64-bit, round to nearest even (device only). */
__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
{
    long long int ret;
    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> signed 64-bit, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
{
    long long int ret;
    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* half -> signed 64-bit, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
{
    long long int ret;
    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(ret) : "h"(__HALF_TO_CUS(h)));
    return ret;
}
/* signed 64-bit -> half, round to nearest even (host and device). */
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
{
    __half res;
#if defined(__CUDA_ARCH__)
    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
#else
    /* Going through float cannot double-round: any value wide enough to
       round in the float step is already far outside half range, so the
       final half result is infinity either way. */
    res = __float2half_rn(static_cast<float>(i));
#endif
    return res;
}
/* signed 64-bit -> half, round toward zero (device only). */
__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i)
{
    __half res;
    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
    return res;
}
/* signed 64-bit -> half, round toward negative infinity (device only). */
__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i)
{
    __half res;
    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
    return res;
}
/* signed 64-bit -> half, round toward positive infinity (device only). */
__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i)
{
    __half res;
    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(res)) : "l"(i));
    return res;
}
|
| 1221 |
+
|
| 1222 |
+
/* Round a half to an integral half, truncating toward zero. */
__CUDA_FP16_DECL__ __half htrunc(const __half h)
{
    __half res;
    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(__HALF_TO_CUS(h)));
    return res;
}
/* Round a half to an integral half, toward positive infinity (ceil). */
__CUDA_FP16_DECL__ __half hceil(const __half h)
{
    __half res;
    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(__HALF_TO_CUS(h)));
    return res;
}
/* Round a half to an integral half, toward negative infinity (floor). */
__CUDA_FP16_DECL__ __half hfloor(const __half h)
{
    __half res;
    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(__HALF_TO_CUS(h)));
    return res;
}
/* Round a half to an integral half, to nearest even. */
__CUDA_FP16_DECL__ __half hrint(const __half h)
{
    __half res;
    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(res)) : "h"(__HALF_TO_CUS(h)));
    return res;
}
|
| 1246 |
+
|
| 1247 |
+
/* Elementwise truncate-toward-zero of both halves of a half2. */
__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
{
    __half2 res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " cvt.rzi.f16.f16 low, low;\n"
        " cvt.rzi.f16.f16 high, high;\n"
        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(h)));
    return res;
}
/* Elementwise ceiling of both halves of a half2. */
__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
{
    __half2 res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " cvt.rpi.f16.f16 low, low;\n"
        " cvt.rpi.f16.f16 high, high;\n"
        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(h)));
    return res;
}
/* Elementwise floor of both halves of a half2. */
__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
{
    __half2 res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " cvt.rmi.f16.f16 low, low;\n"
        " cvt.rmi.f16.f16 high, high;\n"
        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(h)));
    return res;
}
/* Elementwise round-to-nearest-even of both halves of a half2. */
__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
{
    __half2 res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " cvt.rni.f16.f16 low, low;\n"
        " cvt.rni.f16.f16 high, high;\n"
        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(h)));
    return res;
}
|
| 1287 |
+
/* Pack the low elements of a and b into a new half2: {a.lo, b.lo}. */
__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b)
{
    __half2 res;
    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
        " mov.b32 {alow,ahigh}, %1;\n"
        " mov.b32 {blow,bhigh}, %2;\n"
        " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
    return res;
}
/* Pack the high elements of a and b into a new half2: {a.hi, b.hi}. */
__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b)
{
    __half2 res;
    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
        " mov.b32 {alow,ahigh}, %1;\n"
        " mov.b32 {blow,bhigh}, %2;\n"
        " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
    return res;
}
/* Extract the low element of a half2. */
__CUDA_FP16_DECL__ __half __low2half(const __half2 a)
{
    __half res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(res)) : "r"(__HALF2_TO_CUI(a)));
    return res;
}
|
| 1313 |
+
/* Infinity test: -1 for negative infinity, +1 for positive infinity,
   0 for every finite value and NaN. */
__CUDA_FP16_DECL__ int __hisinf(const __half a)
{
    /* compare the raw encoding against the two infinity bit patterns */
    const unsigned short bits = __HALF_TO_CUS(a);
    return (bits == 0xFC00U) ? -1 : ((bits == 0x7C00U) ? 1 : 0);
}
|
| 1325 |
+
/* Broadcast the low element into both lanes: {a.lo, a.lo}. */
__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a)
{
    __half2 res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a)));
    return res;
}
/* Broadcast the high element into both lanes: {a.hi, a.hi}. */
__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a)
{
    __half2 res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a)));
    return res;
}
/* Extract the high element of a half2. */
__CUDA_FP16_DECL__ __half __high2half(const __half2 a)
{
    __half res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(res)) : "r"(__HALF2_TO_CUI(a)));
    return res;
}
/* Combine two scalars into a half2: {a, b}. */
__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b)
{
    __half2 res;
    asm("{ mov.b32 %0, {%1,%2};}\n"
        : "=r"(__HALF2_TO_UI(res)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b)));
    return res;
}
/* Broadcast one scalar into both lanes: {a, a}. */
__CUDA_FP16_DECL__ __half2 __half2half2(const __half a)
{
    __half2 res;
    asm("{ mov.b32 %0, {%1,%1};}\n"
        : "=r"(__HALF2_TO_UI(res)) : "h"(__HALF_TO_CUS(a)));
    return res;
}
/* Swap the two lanes of a half2: {a.hi, a.lo}. */
__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a)
{
    __half2 res;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high}, %1;\n"
        " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(res)) : "r"(__HALF2_TO_CUI(a)));
    return res;
}
|
| 1371 |
+
/* Reinterpret (not convert) the bits of a half as a signed short. */
__CUDA_FP16_DECL__ short int __half_as_short(const __half h)
{
    return static_cast<short int>(__HALF_TO_CUS(h));
}
/* Reinterpret the bits of a half as an unsigned short. */
__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
{
    return __HALF_TO_CUS(h);
}
/* Reinterpret the bits of a signed short as a half. */
__CUDA_FP16_DECL__ __half __short_as_half(const short int i)
{
    __half res;
    __HALF_TO_US(res) = static_cast<unsigned short int>(i);
    return res;
}
/* Reinterpret the bits of an unsigned short as a half. */
__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
{
    __half res;
    __HALF_TO_US(res) = i;
    return res;
}
|
| 1391 |
+
|
| 1392 |
+
/******************************************************************************
|
| 1393 |
+
* __half arithmetic *
|
| 1394 |
+
******************************************************************************/
|
| 1395 |
+
/* Half-precision maximum. Native f16 max on SM80+ (and host);
   earlier architectures emulate via f32 max. */
__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF_MACRO(max)
#else
    /* pre-SM80 fallback: widen, apply f32 max, round back to half */
    const float fa = __half2float(a);
    const float fb = __half2float(b);
    float fm;
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fm) : "f"(fa), "f"(fb));
    return __float2half(fm);
#endif
}
/* Half-precision minimum. Native f16 min on SM80+ (and host);
   earlier architectures emulate via f32 min. */
__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF_MACRO(min)
#else
    /* pre-SM80 fallback: widen, apply f32 min, round back to half */
    const float fa = __half2float(a);
    const float fb = __half2float(b);
    float fm;
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fm) : "f"(fa), "f"(fb));
    return __float2half(fm);
#endif
}
|
| 1423 |
+
|
| 1424 |
+
/******************************************************************************
|
| 1425 |
+
* __half2 arithmetic *
|
| 1426 |
+
******************************************************************************/
|
| 1427 |
+
/* Elementwise half2 maximum. Native f16x2 max on SM80+ (and host);
   earlier architectures emulate lane-by-lane via f32 max. */
__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF2_MACRO(max)
#else
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    float2 fm;
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fm.x) : "f"(fa.x), "f"(fb.x));
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fm.y) : "f"(fa.y), "f"(fb.y));
    return __float22half2_rn(fm);
#endif
}
/* Elementwise half2 minimum. Native f16x2 min on SM80+ (and host);
   earlier architectures emulate lane-by-lane via f32 min. */
__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF2_MACRO(min)
#else
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    float2 fm;
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fm.x) : "f"(fa.x), "f"(fb.x));
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fm.y) : "f"(fa.y), "f"(fb.y));
    return __float22half2_rn(fm);
#endif
}
|
| 1459 |
+
|
| 1460 |
+
|
| 1461 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
|
| 1462 |
+
/******************************************************************************
|
| 1463 |
+
* __half, __half2 warp shuffle *
|
| 1464 |
+
******************************************************************************/
|
| 1465 |
+
/*
 * Shared body for the legacy (non-sync) __half2 warp shuffles below.
 * NOTE(review): not wrapped in do/while because the body ends in `return`;
 * it intentionally reads `var`, `delta` and `c` from the caller's scope.
 * No comments inside the macro: `//` would swallow the line-continuation
 * backslashes.
 */
#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
   __half2 r; \
   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
   return r; \
} /* while(0) */

/*
 * Same as __SHUFFLE_HALF2_MACRO but for the *_sync variants: takes one
 * extra PTX operand (%4) and additionally reads `mask` from the caller.
 */
#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
   __half2 r; \
   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
   return r; \
} /* while(0) */
|
| 1478 |
+
|
| 1479 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700

/*
 * Legacy (implicit full-warp) __half2 shuffles — removed on SM70+;
 * code targeting Volta and later must use the *_sync variants instead.
 * Each body computes `c`, the packed clamp/segment operand the PTX shfl
 * instruction expects: ((warpSize - width) << 8) | maxLane. The local
 * names `var`, `delta` and `c` are read by __SHUFFLE_HALF2_MACRO.
 */

/* Read the half2 from the lane selected by `delta` (shfl.idx). */
__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
{
    unsigned int warp_size;
    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
    __SHUFFLE_HALF2_MACRO(shfl.idx.b32)
}
/* Read the half2 from the lane `delta` below the caller (shfl.up).
   Note: up-shuffle clamps at lane 0, so the low bound field is 0. */
__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
{
    unsigned int warp_size;
    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
    __SHUFFLE_HALF2_MACRO(shfl.up.b32)
}
/* Read the half2 from the lane `delta` above the caller (shfl.down). */
__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
{
    unsigned int warp_size;
    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
    __SHUFFLE_HALF2_MACRO(shfl.down.b32)
}
/* Read the half2 from the lane laneid XOR `delta` (shfl.bfly). */
__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
{
    unsigned int warp_size;
    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
    __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)
}

#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
|
| 1511 |
+
|
| 1512 |
+
__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width)
|
| 1513 |
+
{
|
| 1514 |
+
unsigned int warp_size;
|
| 1515 |
+
asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
|
| 1516 |
+
const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
|
| 1517 |
+
__SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32)
|
| 1518 |
+
}
|
| 1519 |
+
__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
|
| 1520 |
+
{
|
| 1521 |
+
unsigned int warp_size;
|
| 1522 |
+
asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
|
| 1523 |
+
const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
|
| 1524 |
+
__SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32)
|
| 1525 |
+
}
|
| 1526 |
+
__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
|
| 1527 |
+
{
|
| 1528 |
+
unsigned int warp_size;
|
| 1529 |
+
asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
|
| 1530 |
+
const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
|
| 1531 |
+
__SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32)
|
| 1532 |
+
}
|
| 1533 |
+
__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width)
|
| 1534 |
+
{
|
| 1535 |
+
unsigned int warp_size;
|
| 1536 |
+
asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
|
| 1537 |
+
const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
|
| 1538 |
+
__SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32)
|
| 1539 |
+
}
|
| 1540 |
+
|
| 1541 |
+
#undef __SHUFFLE_HALF2_MACRO
|
| 1542 |
+
#undef __SHUFFLE_SYNC_HALF2_MACRO
|
| 1543 |
+
|
| 1544 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
|
| 1545 |
+
|
| 1546 |
+
__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
|
| 1547 |
+
{
|
| 1548 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1549 |
+
const __half2 temp2 = __shfl(temp1, delta, width);
|
| 1550 |
+
return __low2half(temp2);
|
| 1551 |
+
}
|
| 1552 |
+
__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
|
| 1553 |
+
{
|
| 1554 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1555 |
+
const __half2 temp2 = __shfl_up(temp1, delta, width);
|
| 1556 |
+
return __low2half(temp2);
|
| 1557 |
+
}
|
| 1558 |
+
__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
|
| 1559 |
+
{
|
| 1560 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1561 |
+
const __half2 temp2 = __shfl_down(temp1, delta, width);
|
| 1562 |
+
return __low2half(temp2);
|
| 1563 |
+
}
|
| 1564 |
+
__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
|
| 1565 |
+
{
|
| 1566 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1567 |
+
const __half2 temp2 = __shfl_xor(temp1, delta, width);
|
| 1568 |
+
return __low2half(temp2);
|
| 1569 |
+
}
|
| 1570 |
+
|
| 1571 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
|
| 1572 |
+
|
| 1573 |
+
__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width)
|
| 1574 |
+
{
|
| 1575 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1576 |
+
const __half2 temp2 = __shfl_sync(mask, temp1, delta, width);
|
| 1577 |
+
return __low2half(temp2);
|
| 1578 |
+
}
|
| 1579 |
+
__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
|
| 1580 |
+
{
|
| 1581 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1582 |
+
const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width);
|
| 1583 |
+
return __low2half(temp2);
|
| 1584 |
+
}
|
| 1585 |
+
__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
|
| 1586 |
+
{
|
| 1587 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1588 |
+
const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width);
|
| 1589 |
+
return __low2half(temp2);
|
| 1590 |
+
}
|
| 1591 |
+
__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width)
|
| 1592 |
+
{
|
| 1593 |
+
const __half2 temp1 = __halves2half2(var, var);
|
| 1594 |
+
const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width);
|
| 1595 |
+
return __low2half(temp2);
|
| 1596 |
+
}
|
| 1597 |
+
|
| 1598 |
+
#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/
|
| 1599 |
+
/******************************************************************************
|
| 1600 |
+
* __half and __half2 __ldg,__ldcg,__ldca,__ldcs *
|
| 1601 |
+
******************************************************************************/
|
| 1602 |
+
|
| 1603 |
+
#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))
|
| 1604 |
+
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
|
| 1605 |
+
#define __LDG_PTR "l"
|
| 1606 |
+
#else
|
| 1607 |
+
#define __LDG_PTR "r"
|
| 1608 |
+
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
|
| 1609 |
+
__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr)
|
| 1610 |
+
{
|
| 1611 |
+
__half2 ret;
|
| 1612 |
+
asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
|
| 1613 |
+
return ret;
|
| 1614 |
+
}
|
| 1615 |
+
__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr)
|
| 1616 |
+
{
|
| 1617 |
+
__half ret;
|
| 1618 |
+
asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
|
| 1619 |
+
return ret;
|
| 1620 |
+
}
|
| 1621 |
+
__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr)
|
| 1622 |
+
{
|
| 1623 |
+
__half2 ret;
|
| 1624 |
+
asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
|
| 1625 |
+
return ret;
|
| 1626 |
+
}
|
| 1627 |
+
__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
|
| 1628 |
+
{
|
| 1629 |
+
__half ret;
|
| 1630 |
+
asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
|
| 1631 |
+
return ret;
|
| 1632 |
+
}
|
| 1633 |
+
__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr)
|
| 1634 |
+
{
|
| 1635 |
+
__half2 ret;
|
| 1636 |
+
asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
|
| 1637 |
+
return ret;
|
| 1638 |
+
}
|
| 1639 |
+
__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr)
|
| 1640 |
+
{
|
| 1641 |
+
__half ret;
|
| 1642 |
+
asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
|
| 1643 |
+
return ret;
|
| 1644 |
+
}
|
| 1645 |
+
__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr)
|
| 1646 |
+
{
|
| 1647 |
+
__half2 ret;
|
| 1648 |
+
asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
|
| 1649 |
+
return ret;
|
| 1650 |
+
}
|
| 1651 |
+
__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr)
|
| 1652 |
+
{
|
| 1653 |
+
__half ret;
|
| 1654 |
+
asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
|
| 1655 |
+
return ret;
|
| 1656 |
+
}
|
| 1657 |
+
__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr)
|
| 1658 |
+
{
|
| 1659 |
+
__half2 ret;
|
| 1660 |
+
asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
|
| 1661 |
+
return ret;
|
| 1662 |
+
}
|
| 1663 |
+
__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr)
|
| 1664 |
+
{
|
| 1665 |
+
__half ret;
|
| 1666 |
+
asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
|
| 1667 |
+
return ret;
|
| 1668 |
+
}
|
| 1669 |
+
__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr)
|
| 1670 |
+
{
|
| 1671 |
+
__half2 ret;
|
| 1672 |
+
asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
|
| 1673 |
+
return ret;
|
| 1674 |
+
}
|
| 1675 |
+
__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr)
|
| 1676 |
+
{
|
| 1677 |
+
__half ret;
|
| 1678 |
+
asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
|
| 1679 |
+
return ret;
|
| 1680 |
+
}
|
| 1681 |
+
__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value)
|
| 1682 |
+
{
|
| 1683 |
+
asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
|
| 1684 |
+
}
|
| 1685 |
+
__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value)
|
| 1686 |
+
{
|
| 1687 |
+
asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
|
| 1688 |
+
}
|
| 1689 |
+
__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value)
|
| 1690 |
+
{
|
| 1691 |
+
asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
|
| 1692 |
+
}
|
| 1693 |
+
__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value)
|
| 1694 |
+
{
|
| 1695 |
+
asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
|
| 1696 |
+
}
|
| 1697 |
+
__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value)
|
| 1698 |
+
{
|
| 1699 |
+
asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
|
| 1700 |
+
}
|
| 1701 |
+
__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value)
|
| 1702 |
+
{
|
| 1703 |
+
asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
|
| 1704 |
+
}
|
| 1705 |
+
__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value)
|
| 1706 |
+
{
|
| 1707 |
+
asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
|
| 1708 |
+
}
|
| 1709 |
+
__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
|
| 1710 |
+
{
|
| 1711 |
+
asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
|
| 1712 |
+
}
|
| 1713 |
+
#undef __LDG_PTR
|
| 1714 |
+
#endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/
|
| 1715 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
| 1716 |
+
/******************************************************************************
|
| 1717 |
+
* __half2 comparison *
|
| 1718 |
+
******************************************************************************/
|
| 1719 |
+
#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
|
| 1720 |
+
__half2 val; \
|
| 1721 |
+
asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
|
| 1722 |
+
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
|
| 1723 |
+
return val; \
|
| 1724 |
+
} /* while(0) */
|
| 1725 |
+
__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b)
|
| 1726 |
+
{
|
| 1727 |
+
__COMPARISON_OP_HALF2_MACRO(set.eq)
|
| 1728 |
+
}
|
| 1729 |
+
__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b)
|
| 1730 |
+
{
|
| 1731 |
+
__COMPARISON_OP_HALF2_MACRO(set.ne)
|
| 1732 |
+
}
|
| 1733 |
+
__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b)
|
| 1734 |
+
{
|
| 1735 |
+
__COMPARISON_OP_HALF2_MACRO(set.le)
|
| 1736 |
+
}
|
| 1737 |
+
__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b)
|
| 1738 |
+
{
|
| 1739 |
+
__COMPARISON_OP_HALF2_MACRO(set.ge)
|
| 1740 |
+
}
|
| 1741 |
+
__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b)
|
| 1742 |
+
{
|
| 1743 |
+
__COMPARISON_OP_HALF2_MACRO(set.lt)
|
| 1744 |
+
}
|
| 1745 |
+
__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b)
|
| 1746 |
+
{
|
| 1747 |
+
__COMPARISON_OP_HALF2_MACRO(set.gt)
|
| 1748 |
+
}
|
| 1749 |
+
__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b)
|
| 1750 |
+
{
|
| 1751 |
+
__COMPARISON_OP_HALF2_MACRO(set.equ)
|
| 1752 |
+
}
|
| 1753 |
+
__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b)
|
| 1754 |
+
{
|
| 1755 |
+
__COMPARISON_OP_HALF2_MACRO(set.neu)
|
| 1756 |
+
}
|
| 1757 |
+
__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b)
|
| 1758 |
+
{
|
| 1759 |
+
__COMPARISON_OP_HALF2_MACRO(set.leu)
|
| 1760 |
+
}
|
| 1761 |
+
__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b)
|
| 1762 |
+
{
|
| 1763 |
+
__COMPARISON_OP_HALF2_MACRO(set.geu)
|
| 1764 |
+
}
|
| 1765 |
+
__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b)
|
| 1766 |
+
{
|
| 1767 |
+
__COMPARISON_OP_HALF2_MACRO(set.ltu)
|
| 1768 |
+
}
|
| 1769 |
+
__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
|
| 1770 |
+
{
|
| 1771 |
+
__COMPARISON_OP_HALF2_MACRO(set.gtu)
|
| 1772 |
+
}
|
| 1773 |
+
#undef __COMPARISON_OP_HALF2_MACRO
|
| 1774 |
+
#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
|
| 1775 |
+
__half2 val; \
|
| 1776 |
+
bool retval; \
|
| 1777 |
+
asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
|
| 1778 |
+
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
|
| 1779 |
+
if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\
|
| 1780 |
+
retval = true; \
|
| 1781 |
+
} else { \
|
| 1782 |
+
retval = false; \
|
| 1783 |
+
}\
|
| 1784 |
+
return retval;\
|
| 1785 |
+
} /* while(0) */
|
| 1786 |
+
__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b)
|
| 1787 |
+
{
|
| 1788 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.eq)
|
| 1789 |
+
}
|
| 1790 |
+
__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b)
|
| 1791 |
+
{
|
| 1792 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.ne)
|
| 1793 |
+
}
|
| 1794 |
+
__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b)
|
| 1795 |
+
{
|
| 1796 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.le)
|
| 1797 |
+
}
|
| 1798 |
+
__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b)
|
| 1799 |
+
{
|
| 1800 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.ge)
|
| 1801 |
+
}
|
| 1802 |
+
__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b)
|
| 1803 |
+
{
|
| 1804 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.lt)
|
| 1805 |
+
}
|
| 1806 |
+
__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b)
|
| 1807 |
+
{
|
| 1808 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.gt)
|
| 1809 |
+
}
|
| 1810 |
+
__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b)
|
| 1811 |
+
{
|
| 1812 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.equ)
|
| 1813 |
+
}
|
| 1814 |
+
__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b)
|
| 1815 |
+
{
|
| 1816 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.neu)
|
| 1817 |
+
}
|
| 1818 |
+
__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b)
|
| 1819 |
+
{
|
| 1820 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.leu)
|
| 1821 |
+
}
|
| 1822 |
+
__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b)
|
| 1823 |
+
{
|
| 1824 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.geu)
|
| 1825 |
+
}
|
| 1826 |
+
__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b)
|
| 1827 |
+
{
|
| 1828 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu)
|
| 1829 |
+
}
|
| 1830 |
+
__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
|
| 1831 |
+
{
|
| 1832 |
+
__BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu)
|
| 1833 |
+
}
|
| 1834 |
+
#undef __BOOL_COMPARISON_OP_HALF2_MACRO
|
| 1835 |
+
/******************************************************************************
|
| 1836 |
+
* __half comparison *
|
| 1837 |
+
******************************************************************************/
|
| 1838 |
+
#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
|
| 1839 |
+
unsigned short val; \
|
| 1840 |
+
asm( "{ .reg .pred __$temp3;\n" \
|
| 1841 |
+
" setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \
|
| 1842 |
+
" selp.u16 %0, 1, 0, __$temp3;}" \
|
| 1843 |
+
: "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
|
| 1844 |
+
return (val != 0U) ? true : false; \
|
| 1845 |
+
} /* while(0) */
|
| 1846 |
+
__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b)
|
| 1847 |
+
{
|
| 1848 |
+
__COMPARISON_OP_HALF_MACRO(eq)
|
| 1849 |
+
}
|
| 1850 |
+
__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b)
|
| 1851 |
+
{
|
| 1852 |
+
__COMPARISON_OP_HALF_MACRO(ne)
|
| 1853 |
+
}
|
| 1854 |
+
__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b)
|
| 1855 |
+
{
|
| 1856 |
+
__COMPARISON_OP_HALF_MACRO(le)
|
| 1857 |
+
}
|
| 1858 |
+
__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b)
|
| 1859 |
+
{
|
| 1860 |
+
__COMPARISON_OP_HALF_MACRO(ge)
|
| 1861 |
+
}
|
| 1862 |
+
__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b)
|
| 1863 |
+
{
|
| 1864 |
+
__COMPARISON_OP_HALF_MACRO(lt)
|
| 1865 |
+
}
|
| 1866 |
+
__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b)
|
| 1867 |
+
{
|
| 1868 |
+
__COMPARISON_OP_HALF_MACRO(gt)
|
| 1869 |
+
}
|
| 1870 |
+
__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b)
|
| 1871 |
+
{
|
| 1872 |
+
__COMPARISON_OP_HALF_MACRO(equ)
|
| 1873 |
+
}
|
| 1874 |
+
__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b)
|
| 1875 |
+
{
|
| 1876 |
+
__COMPARISON_OP_HALF_MACRO(neu)
|
| 1877 |
+
}
|
| 1878 |
+
__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b)
|
| 1879 |
+
{
|
| 1880 |
+
__COMPARISON_OP_HALF_MACRO(leu)
|
| 1881 |
+
}
|
| 1882 |
+
__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b)
|
| 1883 |
+
{
|
| 1884 |
+
__COMPARISON_OP_HALF_MACRO(geu)
|
| 1885 |
+
}
|
| 1886 |
+
__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b)
|
| 1887 |
+
{
|
| 1888 |
+
__COMPARISON_OP_HALF_MACRO(ltu)
|
| 1889 |
+
}
|
| 1890 |
+
__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b)
|
| 1891 |
+
{
|
| 1892 |
+
__COMPARISON_OP_HALF_MACRO(gtu)
|
| 1893 |
+
}
|
| 1894 |
+
#undef __COMPARISON_OP_HALF_MACRO
|
| 1895 |
+
/******************************************************************************
|
| 1896 |
+
* __half2 arithmetic *
|
| 1897 |
+
******************************************************************************/
|
| 1898 |
+
__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
|
| 1899 |
+
{
|
| 1900 |
+
__BINARY_OP_HALF2_MACRO(add)
|
| 1901 |
+
}
|
| 1902 |
+
__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b)
|
| 1903 |
+
{
|
| 1904 |
+
__BINARY_OP_HALF2_MACRO(sub)
|
| 1905 |
+
}
|
| 1906 |
+
__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b)
|
| 1907 |
+
{
|
| 1908 |
+
__BINARY_OP_HALF2_MACRO(mul)
|
| 1909 |
+
}
|
| 1910 |
+
__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b)
|
| 1911 |
+
{
|
| 1912 |
+
__BINARY_OP_HALF2_MACRO(add.sat)
|
| 1913 |
+
}
|
| 1914 |
+
__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b)
|
| 1915 |
+
{
|
| 1916 |
+
__BINARY_OP_HALF2_MACRO(sub.sat)
|
| 1917 |
+
}
|
| 1918 |
+
__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
|
| 1919 |
+
{
|
| 1920 |
+
__BINARY_OP_HALF2_MACRO(mul.sat)
|
| 1921 |
+
}
|
| 1922 |
+
__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b)
|
| 1923 |
+
{
|
| 1924 |
+
__BINARY_OP_HALF2_MACRO(add.rn)
|
| 1925 |
+
}
|
| 1926 |
+
__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b)
|
| 1927 |
+
{
|
| 1928 |
+
__BINARY_OP_HALF2_MACRO(sub.rn)
|
| 1929 |
+
}
|
| 1930 |
+
__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b)
|
| 1931 |
+
{
|
| 1932 |
+
__BINARY_OP_HALF2_MACRO(mul.rn)
|
| 1933 |
+
}
|
| 1934 |
+
__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
|
| 1935 |
+
{
|
| 1936 |
+
__TERNARY_OP_HALF2_MACRO(fma.rn)
|
| 1937 |
+
}
|
| 1938 |
+
__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c)
|
| 1939 |
+
{
|
| 1940 |
+
__TERNARY_OP_HALF2_MACRO(fma.rn.sat)
|
| 1941 |
+
}
|
| 1942 |
+
__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) {
|
| 1943 |
+
__half ha = __low2half(a);
|
| 1944 |
+
__half hb = __low2half(b);
|
| 1945 |
+
|
| 1946 |
+
const __half v1 = __hdiv(ha, hb);
|
| 1947 |
+
|
| 1948 |
+
ha = __high2half(a);
|
| 1949 |
+
hb = __high2half(b);
|
| 1950 |
+
|
| 1951 |
+
const __half v2 = __hdiv(ha, hb);
|
| 1952 |
+
|
| 1953 |
+
return __halves2half2(v1, v2);
|
| 1954 |
+
}
|
| 1955 |
+
/******************************************************************************
|
| 1956 |
+
* __half arithmetic *
|
| 1957 |
+
******************************************************************************/
|
| 1958 |
+
__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b)
|
| 1959 |
+
{
|
| 1960 |
+
__BINARY_OP_HALF_MACRO(add)
|
| 1961 |
+
}
|
| 1962 |
+
__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b)
|
| 1963 |
+
{
|
| 1964 |
+
__BINARY_OP_HALF_MACRO(sub)
|
| 1965 |
+
}
|
| 1966 |
+
__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b)
|
| 1967 |
+
{
|
| 1968 |
+
__BINARY_OP_HALF_MACRO(mul)
|
| 1969 |
+
}
|
| 1970 |
+
__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b)
|
| 1971 |
+
{
|
| 1972 |
+
__BINARY_OP_HALF_MACRO(add.sat)
|
| 1973 |
+
}
|
| 1974 |
+
__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b)
|
| 1975 |
+
{
|
| 1976 |
+
__BINARY_OP_HALF_MACRO(sub.sat)
|
| 1977 |
+
}
|
| 1978 |
+
__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
|
| 1979 |
+
{
|
| 1980 |
+
__BINARY_OP_HALF_MACRO(mul.sat)
|
| 1981 |
+
}
|
| 1982 |
+
__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b)
|
| 1983 |
+
{
|
| 1984 |
+
__BINARY_OP_HALF_MACRO(add.rn)
|
| 1985 |
+
}
|
| 1986 |
+
__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b)
|
| 1987 |
+
{
|
| 1988 |
+
__BINARY_OP_HALF_MACRO(sub.rn)
|
| 1989 |
+
}
|
| 1990 |
+
__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b)
|
| 1991 |
+
{
|
| 1992 |
+
__BINARY_OP_HALF_MACRO(mul.rn)
|
| 1993 |
+
}
|
| 1994 |
+
__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
|
| 1995 |
+
{
|
| 1996 |
+
__TERNARY_OP_HALF_MACRO(fma.rn)
|
| 1997 |
+
}
|
| 1998 |
+
__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c)
|
| 1999 |
+
{
|
| 2000 |
+
__TERNARY_OP_HALF_MACRO(fma.rn.sat)
|
| 2001 |
+
}
|
| 2002 |
+
__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
|
| 2003 |
+
__half v;
|
| 2004 |
+
__half abs;
|
| 2005 |
+
__half den;
|
| 2006 |
+
__HALF_TO_US(den) = 0x008FU;
|
| 2007 |
+
|
| 2008 |
+
float rcp;
|
| 2009 |
+
const float fa = __half2float(a);
|
| 2010 |
+
const float fb = __half2float(b);
|
| 2011 |
+
|
| 2012 |
+
asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb));
|
| 2013 |
+
|
| 2014 |
+
float fv = rcp * fa;
|
| 2015 |
+
|
| 2016 |
+
v = __float2half(fv);
|
| 2017 |
+
__HALF_TO_US(abs) = static_cast<unsigned short>(static_cast<unsigned int>(__HALF_TO_CUS(v)) & 0x00007FFFU);
|
| 2018 |
+
if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000U))) {
|
| 2019 |
+
const float err = __fmaf_rn(-fb, fv, fa);
|
| 2020 |
+
fv = __fmaf_rn(rcp, err, fv);
|
| 2021 |
+
v = __float2half(fv);
|
| 2022 |
+
}
|
| 2023 |
+
return v;
|
| 2024 |
+
}
|
| 2025 |
+
|
| 2026 |
+
/******************************************************************************
|
| 2027 |
+
* __half2 functions *
|
| 2028 |
+
******************************************************************************/
|
| 2029 |
+
#define __SPEC_CASE2(i,r, spc, ulp) \
|
| 2030 |
+
"{.reg.b32 spc, ulp, p;\n"\
|
| 2031 |
+
" mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
|
| 2032 |
+
" mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
|
| 2033 |
+
" set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
|
| 2034 |
+
" fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
|
| 2035 |
+
#define __SPEC_CASE(i,r, spc, ulp) \
|
| 2036 |
+
"{.reg.b16 spc, ulp, p;\n"\
|
| 2037 |
+
" mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
|
| 2038 |
+
" mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
|
| 2039 |
+
" set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
|
| 2040 |
+
" fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
|
| 2041 |
+
#define __APPROX_FCAST(fun) /* do */ {\
|
| 2042 |
+
__half val;\
|
| 2043 |
+
asm("{.reg.b32 f; \n"\
|
| 2044 |
+
" .reg.b16 r; \n"\
|
| 2045 |
+
" mov.b16 r,%1; \n"\
|
| 2046 |
+
" cvt.f32.f16 f,r; \n"\
|
| 2047 |
+
" " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\
|
| 2048 |
+
" cvt.rn.f16.f32 r,f; \n"\
|
| 2049 |
+
" mov.b16 %0,r; \n"\
|
| 2050 |
+
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
|
| 2051 |
+
return val;\
|
| 2052 |
+
} /* while(0) */
|
| 2053 |
+
#define __APPROX_FCAST2(fun) /* do */ {\
|
| 2054 |
+
__half2 val;\
|
| 2055 |
+
asm("{.reg.b16 hl, hu; \n"\
|
| 2056 |
+
" .reg.b32 fl, fu; \n"\
|
| 2057 |
+
" mov.b32 {hl, hu}, %1; \n"\
|
| 2058 |
+
" cvt.f32.f16 fl, hl; \n"\
|
| 2059 |
+
" cvt.f32.f16 fu, hu; \n"\
|
| 2060 |
+
" " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\
|
| 2061 |
+
" " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\
|
| 2062 |
+
" cvt.rn.f16.f32 hl, fl; \n"\
|
| 2063 |
+
" cvt.rn.f16.f32 hu, fu; \n"\
|
| 2064 |
+
" mov.b32 %0, {hl, hu}; \n"\
|
| 2065 |
+
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \
|
| 2066 |
+
return val;\
|
| 2067 |
+
} /* while(0) */
|
| 2068 |
+
static __device__ __forceinline__ float __float_simpl_sinf(float a);
|
| 2069 |
+
static __device__ __forceinline__ float __float_simpl_cosf(float a);
|
| 2070 |
+
__CUDA_FP16_DECL__ __half hsin(const __half a) {
|
| 2071 |
+
const float sl = __float_simpl_sinf(__half2float(a));
|
| 2072 |
+
__half r = __float2half_rn(sl);
|
| 2073 |
+
asm("{\n\t"
|
| 2074 |
+
" .reg.b16 i,r,t; \n\t"
|
| 2075 |
+
" mov.b16 r, %0; \n\t"
|
| 2076 |
+
" mov.b16 i, %1; \n\t"
|
| 2077 |
+
" and.b16 t, r, 0x8000U; \n\t"
|
| 2078 |
+
" abs.f16 r, r; \n\t"
|
| 2079 |
+
" abs.f16 i, i; \n\t"
|
| 2080 |
+
__SPEC_CASE(i, r, 0X32B3U, 0x0800U)
|
| 2081 |
+
__SPEC_CASE(i, r, 0X5CB0U, 0x9000U)
|
| 2082 |
+
" or.b16 r,r,t; \n\t"
|
| 2083 |
+
" mov.b16 %0, r; \n"
|
| 2084 |
+
"}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
|
| 2085 |
+
return r;
|
| 2086 |
+
}
|
| 2087 |
+
__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) {
|
| 2088 |
+
const float sl = __float_simpl_sinf(__half2float(a.x));
|
| 2089 |
+
const float sh = __float_simpl_sinf(__half2float(a.y));
|
| 2090 |
+
__half2 r = __floats2half2_rn(sl, sh);
|
| 2091 |
+
asm("{\n\t"
|
| 2092 |
+
" .reg.b32 i,r,t; \n\t"
|
| 2093 |
+
" mov.b32 r, %0; \n\t"
|
| 2094 |
+
" mov.b32 i, %1; \n\t"
|
| 2095 |
+
" and.b32 t, r, 0x80008000U; \n\t"
|
| 2096 |
+
" abs.f16x2 r, r; \n\t"
|
| 2097 |
+
" abs.f16x2 i, i; \n\t"
|
| 2098 |
+
__SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U)
|
| 2099 |
+
__SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U)
|
| 2100 |
+
" or.b32 r, r, t; \n\t"
|
| 2101 |
+
" mov.b32 %0, r; \n"
|
| 2102 |
+
"}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
|
| 2103 |
+
return r;
|
| 2104 |
+
}
|
| 2105 |
+
__CUDA_FP16_DECL__ __half hcos(const __half a) {
|
| 2106 |
+
const float cl = __float_simpl_cosf(__half2float(a));
|
| 2107 |
+
__half r = __float2half_rn(cl);
|
| 2108 |
+
asm("{\n\t"
|
| 2109 |
+
" .reg.b16 i,r; \n\t"
|
| 2110 |
+
" mov.b16 r, %0; \n\t"
|
| 2111 |
+
" mov.b16 i, %1; \n\t"
|
| 2112 |
+
" abs.f16 i, i; \n\t"
|
| 2113 |
+
__SPEC_CASE(i, r, 0X2B7CU, 0x1000U)
|
| 2114 |
+
" mov.b16 %0, r; \n"
|
| 2115 |
+
"}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
|
| 2116 |
+
return r;
|
| 2117 |
+
}
|
| 2118 |
+
__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) {
|
| 2119 |
+
const float cl = __float_simpl_cosf(__half2float(a.x));
|
| 2120 |
+
const float ch = __float_simpl_cosf(__half2float(a.y));
|
| 2121 |
+
__half2 r = __floats2half2_rn(cl, ch);
|
| 2122 |
+
asm("{\n\t"
|
| 2123 |
+
" .reg.b32 i,r; \n\t"
|
| 2124 |
+
" mov.b32 r, %0; \n\t"
|
| 2125 |
+
" mov.b32 i, %1; \n\t"
|
| 2126 |
+
" abs.f16x2 i, i; \n\t"
|
| 2127 |
+
__SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U)
|
| 2128 |
+
" mov.b32 %0, r; \n"
|
| 2129 |
+
"}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
|
| 2130 |
+
return r;
|
| 2131 |
+
}
|
| 2132 |
+
static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
|
| 2133 |
+
{
|
| 2134 |
+
const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F);
|
| 2135 |
+
const unsigned q = __float_as_uint(ar);
|
| 2136 |
+
const float j = __fsub_rn(ar, 12582912.0F);
|
| 2137 |
+
float t = __fmaf_rn(j, -1.5707962512969971e+000F, a);
|
| 2138 |
+
t = __fmaf_rn(j, -7.5497894158615964e-008F, t);
|
| 2139 |
+
*quadrant = q;
|
| 2140 |
+
return t;
|
| 2141 |
+
}
|
| 2142 |
+
static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
|
| 2143 |
+
{
|
| 2144 |
+
float z;
|
| 2145 |
+
const float x2 = x*x;
|
| 2146 |
+
float a8;
|
| 2147 |
+
float a6;
|
| 2148 |
+
float a4;
|
| 2149 |
+
float a2;
|
| 2150 |
+
float a1;
|
| 2151 |
+
float a0;
|
| 2152 |
+
|
| 2153 |
+
if ((i & 1U) != 0U) {
|
| 2154 |
+
// cos
|
| 2155 |
+
a8 = 2.44331571e-5F;
|
| 2156 |
+
a6 = -1.38873163e-3F;
|
| 2157 |
+
a4 = 4.16666457e-2F;
|
| 2158 |
+
a2 = -5.00000000e-1F;
|
| 2159 |
+
a1 = x2;
|
| 2160 |
+
a0 = 1.0F;
|
| 2161 |
+
}
|
| 2162 |
+
else {
|
| 2163 |
+
// sin
|
| 2164 |
+
a8 = -1.95152959e-4F;
|
| 2165 |
+
a6 = 8.33216087e-3F;
|
| 2166 |
+
a4 = -1.66666546e-1F;
|
| 2167 |
+
a2 = 0.0F;
|
| 2168 |
+
a1 = x;
|
| 2169 |
+
a0 = x;
|
| 2170 |
+
}
|
| 2171 |
+
|
| 2172 |
+
z = __fmaf_rn(a8, x2, a6);
|
| 2173 |
+
z = __fmaf_rn(z, x2, a4);
|
| 2174 |
+
z = __fmaf_rn(z, x2, a2);
|
| 2175 |
+
z = __fmaf_rn(z, a1, a0);
|
| 2176 |
+
|
| 2177 |
+
if ((i & 2U) != 0U) {
|
| 2178 |
+
z = -z;
|
| 2179 |
+
}
|
| 2180 |
+
return z;
|
| 2181 |
+
}
|
| 2182 |
+
static __device__ __forceinline__ float __float_simpl_sinf(float a)
|
| 2183 |
+
{
|
| 2184 |
+
float z;
|
| 2185 |
+
unsigned i;
|
| 2186 |
+
a = __internal_trig_reduction_kernel(a, &i);
|
| 2187 |
+
z = __internal_sin_cos_kernel(a, i);
|
| 2188 |
+
return z;
|
| 2189 |
+
}
|
| 2190 |
+
static __device__ __forceinline__ float __float_simpl_cosf(float a)
|
| 2191 |
+
{
|
| 2192 |
+
float z;
|
| 2193 |
+
unsigned i;
|
| 2194 |
+
a = __internal_trig_reduction_kernel(a, &i);
|
| 2195 |
+
z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U);
|
| 2196 |
+
return z;
|
| 2197 |
+
}
|
| 2198 |
+
|
| 2199 |
+
__CUDA_FP16_DECL__ __half hexp(const __half a) {
|
| 2200 |
+
__half val;
|
| 2201 |
+
asm("{.reg.b32 f, C, nZ; \n"
|
| 2202 |
+
" .reg.b16 h,r; \n"
|
| 2203 |
+
" mov.b16 h,%1; \n"
|
| 2204 |
+
" cvt.f32.f16 f,h; \n"
|
| 2205 |
+
" mov.b32 C, 0x3fb8aa3bU; \n"
|
| 2206 |
+
" mov.b32 nZ, 0x80000000U;\n"
|
| 2207 |
+
" fma.rn.f32 f,f,C,nZ; \n"
|
| 2208 |
+
" ex2.approx.ftz.f32 f,f; \n"
|
| 2209 |
+
" cvt.rn.f16.f32 r,f; \n"
|
| 2210 |
+
__SPEC_CASE(h, r, 0X1F79U, 0x9400U)
|
| 2211 |
+
__SPEC_CASE(h, r, 0X25CFU, 0x9400U)
|
| 2212 |
+
__SPEC_CASE(h, r, 0XC13BU, 0x0400U)
|
| 2213 |
+
__SPEC_CASE(h, r, 0XC1EFU, 0x0200U)
|
| 2214 |
+
" mov.b16 %0,r; \n"
|
| 2215 |
+
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
| 2216 |
+
return val;
|
| 2217 |
+
}
|
| 2218 |
+
__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) {
|
| 2219 |
+
__half2 val;
|
| 2220 |
+
asm("{.reg.b16 hl, hu; \n"
|
| 2221 |
+
" .reg.b32 h,r,fl,fu,C,nZ; \n"
|
| 2222 |
+
" mov.b32 {hl, hu}, %1; \n"
|
| 2223 |
+
" mov.b32 h, %1; \n"
|
| 2224 |
+
" cvt.f32.f16 fl, hl; \n"
|
| 2225 |
+
" cvt.f32.f16 fu, hu; \n"
|
| 2226 |
+
" mov.b32 C, 0x3fb8aa3bU; \n"
|
| 2227 |
+
" mov.b32 nZ, 0x80000000U;\n"
|
| 2228 |
+
" fma.rn.f32 fl,fl,C,nZ; \n"
|
| 2229 |
+
" fma.rn.f32 fu,fu,C,nZ; \n"
|
| 2230 |
+
" ex2.approx.ftz.f32 fl, fl; \n"
|
| 2231 |
+
" ex2.approx.ftz.f32 fu, fu; \n"
|
| 2232 |
+
" cvt.rn.f16.f32 hl, fl; \n"
|
| 2233 |
+
" cvt.rn.f16.f32 hu, fu; \n"
|
| 2234 |
+
" mov.b32 r, {hl, hu}; \n"
|
| 2235 |
+
__SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U)
|
| 2236 |
+
__SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U)
|
| 2237 |
+
__SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U)
|
| 2238 |
+
__SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U)
|
| 2239 |
+
" mov.b32 %0, r; \n"
|
| 2240 |
+
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
|
| 2241 |
+
return val;
|
| 2242 |
+
}
|
| 2243 |
+
__CUDA_FP16_DECL__ __half hexp2(const __half a) {
|
| 2244 |
+
__half val;
|
| 2245 |
+
asm("{.reg.b32 f, ULP; \n"
|
| 2246 |
+
" .reg.b16 r; \n"
|
| 2247 |
+
" mov.b16 r,%1; \n"
|
| 2248 |
+
" cvt.f32.f16 f,r; \n"
|
| 2249 |
+
" ex2.approx.ftz.f32 f,f; \n"
|
| 2250 |
+
" mov.b32 ULP, 0x33800000U;\n"
|
| 2251 |
+
" fma.rn.f32 f,f,ULP,f; \n"
|
| 2252 |
+
" cvt.rn.f16.f32 r,f; \n"
|
| 2253 |
+
" mov.b16 %0,r; \n"
|
| 2254 |
+
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
| 2255 |
+
return val;
|
| 2256 |
+
}
|
| 2257 |
+
__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
|
| 2258 |
+
__half2 val;
|
| 2259 |
+
asm("{.reg.b16 hl, hu; \n"
|
| 2260 |
+
" .reg.b32 fl, fu, ULP; \n"
|
| 2261 |
+
" mov.b32 {hl, hu}, %1; \n"
|
| 2262 |
+
" cvt.f32.f16 fl, hl; \n"
|
| 2263 |
+
" cvt.f32.f16 fu, hu; \n"
|
| 2264 |
+
" ex2.approx.ftz.f32 fl, fl; \n"
|
| 2265 |
+
" ex2.approx.ftz.f32 fu, fu; \n"
|
| 2266 |
+
" mov.b32 ULP, 0x33800000U;\n"
|
| 2267 |
+
" fma.rn.f32 fl,fl,ULP,fl; \n"
|
| 2268 |
+
" fma.rn.f32 fu,fu,ULP,fu; \n"
|
| 2269 |
+
" cvt.rn.f16.f32 hl, fl; \n"
|
| 2270 |
+
" cvt.rn.f16.f32 hu, fu; \n"
|
| 2271 |
+
" mov.b32 %0, {hl, hu}; \n"
|
| 2272 |
+
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
|
| 2273 |
+
return val;
|
| 2274 |
+
}
|
| 2275 |
+
__CUDA_FP16_DECL__ __half hexp10(const __half a) {
|
| 2276 |
+
__half val;
|
| 2277 |
+
asm("{.reg.b16 h,r; \n"
|
| 2278 |
+
" .reg.b32 f, C, nZ; \n"
|
| 2279 |
+
" mov.b16 h, %1; \n"
|
| 2280 |
+
" cvt.f32.f16 f, h; \n"
|
| 2281 |
+
" mov.b32 C, 0x40549A78U; \n"
|
| 2282 |
+
" mov.b32 nZ, 0x80000000U;\n"
|
| 2283 |
+
" fma.rn.f32 f,f,C,nZ; \n"
|
| 2284 |
+
" ex2.approx.ftz.f32 f, f; \n"
|
| 2285 |
+
" cvt.rn.f16.f32 r, f; \n"
|
| 2286 |
+
__SPEC_CASE(h, r, 0x34DEU, 0x9800U)
|
| 2287 |
+
__SPEC_CASE(h, r, 0x9766U, 0x9000U)
|
| 2288 |
+
__SPEC_CASE(h, r, 0x9972U, 0x1000U)
|
| 2289 |
+
__SPEC_CASE(h, r, 0xA5C4U, 0x1000U)
|
| 2290 |
+
__SPEC_CASE(h, r, 0xBF0AU, 0x8100U)
|
| 2291 |
+
" mov.b16 %0, r; \n"
|
| 2292 |
+
"}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
| 2293 |
+
return val;
|
| 2294 |
+
}
|
| 2295 |
+
__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) {
|
| 2296 |
+
__half2 val;
|
| 2297 |
+
asm("{.reg.b16 hl, hu; \n"
|
| 2298 |
+
" .reg.b32 h,r,fl,fu,C,nZ; \n"
|
| 2299 |
+
" mov.b32 {hl, hu}, %1; \n"
|
| 2300 |
+
" mov.b32 h, %1; \n"
|
| 2301 |
+
" cvt.f32.f16 fl, hl; \n"
|
| 2302 |
+
" cvt.f32.f16 fu, hu; \n"
|
| 2303 |
+
" mov.b32 C, 0x40549A78U; \n"
|
| 2304 |
+
" mov.b32 nZ, 0x80000000U;\n"
|
| 2305 |
+
" fma.rn.f32 fl,fl,C,nZ; \n"
|
| 2306 |
+
" fma.rn.f32 fu,fu,C,nZ; \n"
|
| 2307 |
+
" ex2.approx.ftz.f32 fl, fl; \n"
|
| 2308 |
+
" ex2.approx.ftz.f32 fu, fu; \n"
|
| 2309 |
+
" cvt.rn.f16.f32 hl, fl; \n"
|
| 2310 |
+
" cvt.rn.f16.f32 hu, fu; \n"
|
| 2311 |
+
" mov.b32 r, {hl, hu}; \n"
|
| 2312 |
+
__SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U)
|
| 2313 |
+
__SPEC_CASE2(h, r, 0x97669766U, 0x90009000U)
|
| 2314 |
+
__SPEC_CASE2(h, r, 0x99729972U, 0x10001000U)
|
| 2315 |
+
__SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U)
|
| 2316 |
+
__SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U)
|
| 2317 |
+
" mov.b32 %0, r; \n"
|
| 2318 |
+
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
|
| 2319 |
+
return val;
|
| 2320 |
+
}
|
| 2321 |
+
__CUDA_FP16_DECL__ __half hlog2(const __half a) {
|
| 2322 |
+
__half val;
|
| 2323 |
+
asm("{.reg.b16 h, r; \n"
|
| 2324 |
+
" .reg.b32 f; \n"
|
| 2325 |
+
" mov.b16 h, %1; \n"
|
| 2326 |
+
" cvt.f32.f16 f, h; \n"
|
| 2327 |
+
" lg2.approx.ftz.f32 f, f; \n"
|
| 2328 |
+
" cvt.rn.f16.f32 r, f; \n"
|
| 2329 |
+
__SPEC_CASE(r, r, 0xA2E2U, 0x8080U)
|
| 2330 |
+
__SPEC_CASE(r, r, 0xBF46U, 0x9400U)
|
| 2331 |
+
" mov.b16 %0, r; \n"
|
| 2332 |
+
"}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
| 2333 |
+
return val;
|
| 2334 |
+
}
|
| 2335 |
+
__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) {
|
| 2336 |
+
__half2 val;
|
| 2337 |
+
asm("{.reg.b16 hl, hu; \n"
|
| 2338 |
+
" .reg.b32 fl, fu, r, p; \n"
|
| 2339 |
+
" mov.b32 {hl, hu}, %1; \n"
|
| 2340 |
+
" cvt.f32.f16 fl, hl; \n"
|
| 2341 |
+
" cvt.f32.f16 fu, hu; \n"
|
| 2342 |
+
" lg2.approx.ftz.f32 fl, fl; \n"
|
| 2343 |
+
" lg2.approx.ftz.f32 fu, fu; \n"
|
| 2344 |
+
" cvt.rn.f16.f32 hl, fl; \n"
|
| 2345 |
+
" cvt.rn.f16.f32 hu, fu; \n"
|
| 2346 |
+
" mov.b32 r, {hl, hu}; \n"
|
| 2347 |
+
__SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U)
|
| 2348 |
+
__SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U)
|
| 2349 |
+
" mov.b32 %0, r; \n"
|
| 2350 |
+
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
|
| 2351 |
+
return val;
|
| 2352 |
+
}
|
| 2353 |
+
__CUDA_FP16_DECL__ __half hlog(const __half a) {
|
| 2354 |
+
__half val;
|
| 2355 |
+
asm("{.reg.b32 f, C; \n"
|
| 2356 |
+
" .reg.b16 r,h; \n"
|
| 2357 |
+
" mov.b16 h,%1; \n"
|
| 2358 |
+
" cvt.f32.f16 f,h; \n"
|
| 2359 |
+
" lg2.approx.ftz.f32 f,f; \n"
|
| 2360 |
+
" mov.b32 C, 0x3f317218U; \n"
|
| 2361 |
+
" mul.f32 f,f,C; \n"
|
| 2362 |
+
" cvt.rn.f16.f32 r,f; \n"
|
| 2363 |
+
__SPEC_CASE(h, r, 0X160DU, 0x9C00U)
|
| 2364 |
+
__SPEC_CASE(h, r, 0X3BFEU, 0x8010U)
|
| 2365 |
+
__SPEC_CASE(h, r, 0X3C0BU, 0x8080U)
|
| 2366 |
+
__SPEC_CASE(h, r, 0X6051U, 0x1C00U)
|
| 2367 |
+
" mov.b16 %0,r; \n"
|
| 2368 |
+
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
| 2369 |
+
return val;
|
| 2370 |
+
}
|
| 2371 |
+
__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) {
|
| 2372 |
+
__half2 val;
|
| 2373 |
+
asm("{.reg.b16 hl, hu; \n"
|
| 2374 |
+
" .reg.b32 r, fl, fu, C, h; \n"
|
| 2375 |
+
" mov.b32 {hl, hu}, %1; \n"
|
| 2376 |
+
" mov.b32 h, %1; \n"
|
| 2377 |
+
" cvt.f32.f16 fl, hl; \n"
|
| 2378 |
+
" cvt.f32.f16 fu, hu; \n"
|
| 2379 |
+
" lg2.approx.ftz.f32 fl, fl; \n"
|
| 2380 |
+
" lg2.approx.ftz.f32 fu, fu; \n"
|
| 2381 |
+
" mov.b32 C, 0x3f317218U; \n"
|
| 2382 |
+
" mul.f32 fl,fl,C; \n"
|
| 2383 |
+
" mul.f32 fu,fu,C; \n"
|
| 2384 |
+
" cvt.rn.f16.f32 hl, fl; \n"
|
| 2385 |
+
" cvt.rn.f16.f32 hu, fu; \n"
|
| 2386 |
+
" mov.b32 r, {hl, hu}; \n"
|
| 2387 |
+
__SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U)
|
| 2388 |
+
__SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U)
|
| 2389 |
+
__SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U)
|
| 2390 |
+
__SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U)
|
| 2391 |
+
" mov.b32 %0, r; \n"
|
| 2392 |
+
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
|
| 2393 |
+
return val;
|
| 2394 |
+
}
|
| 2395 |
+
__CUDA_FP16_DECL__ __half hlog10(const __half a) {
|
| 2396 |
+
__half val;
|
| 2397 |
+
asm("{.reg.b16 h, r; \n"
|
| 2398 |
+
" .reg.b32 f, C; \n"
|
| 2399 |
+
" mov.b16 h, %1; \n"
|
| 2400 |
+
" cvt.f32.f16 f, h; \n"
|
| 2401 |
+
" lg2.approx.ftz.f32 f, f; \n"
|
| 2402 |
+
" mov.b32 C, 0x3E9A209BU; \n"
|
| 2403 |
+
" mul.f32 f,f,C; \n"
|
| 2404 |
+
" cvt.rn.f16.f32 r, f; \n"
|
| 2405 |
+
__SPEC_CASE(h, r, 0x338FU, 0x1000U)
|
| 2406 |
+
__SPEC_CASE(h, r, 0x33F8U, 0x9000U)
|
| 2407 |
+
__SPEC_CASE(h, r, 0x57E1U, 0x9800U)
|
| 2408 |
+
__SPEC_CASE(h, r, 0x719DU, 0x9C00U)
|
| 2409 |
+
" mov.b16 %0, r; \n"
|
| 2410 |
+
"}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
|
| 2411 |
+
return val;
|
| 2412 |
+
}
|
| 2413 |
+
__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) {
|
| 2414 |
+
__half2 val;
|
| 2415 |
+
asm("{.reg.b16 hl, hu; \n"
|
| 2416 |
+
" .reg.b32 r, fl, fu, C, h; \n"
|
| 2417 |
+
" mov.b32 {hl, hu}, %1; \n"
|
| 2418 |
+
" mov.b32 h, %1; \n"
|
| 2419 |
+
" cvt.f32.f16 fl, hl; \n"
|
| 2420 |
+
" cvt.f32.f16 fu, hu; \n"
|
| 2421 |
+
" lg2.approx.ftz.f32 fl, fl; \n"
|
| 2422 |
+
" lg2.approx.ftz.f32 fu, fu; \n"
|
| 2423 |
+
" mov.b32 C, 0x3E9A209BU; \n"
|
| 2424 |
+
" mul.f32 fl,fl,C; \n"
|
| 2425 |
+
" mul.f32 fu,fu,C; \n"
|
| 2426 |
+
" cvt.rn.f16.f32 hl, fl; \n"
|
| 2427 |
+
" cvt.rn.f16.f32 hu, fu; \n"
|
| 2428 |
+
" mov.b32 r, {hl, hu}; \n"
|
| 2429 |
+
__SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U)
|
| 2430 |
+
__SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U)
|
| 2431 |
+
__SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U)
|
| 2432 |
+
__SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U)
|
| 2433 |
+
" mov.b32 %0, r; \n"
|
| 2434 |
+
"}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
|
| 2435 |
+
return val;
|
| 2436 |
+
}
|
| 2437 |
+
#undef __SPEC_CASE2
|
| 2438 |
+
#undef __SPEC_CASE
|
| 2439 |
+
__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) {
|
| 2440 |
+
__APPROX_FCAST2(rcp)
|
| 2441 |
+
}
|
| 2442 |
+
__CUDA_FP16_DECL__ __half hrcp(const __half a) {
|
| 2443 |
+
__APPROX_FCAST(rcp)
|
| 2444 |
+
}
|
| 2445 |
+
__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) {
|
| 2446 |
+
__APPROX_FCAST2(rsqrt)
|
| 2447 |
+
}
|
| 2448 |
+
__CUDA_FP16_DECL__ __half hrsqrt(const __half a) {
|
| 2449 |
+
__APPROX_FCAST(rsqrt)
|
| 2450 |
+
}
|
| 2451 |
+
__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) {
|
| 2452 |
+
__APPROX_FCAST2(sqrt)
|
| 2453 |
+
}
|
| 2454 |
+
__CUDA_FP16_DECL__ __half hsqrt(const __half a) {
|
| 2455 |
+
__APPROX_FCAST(sqrt)
|
| 2456 |
+
}
|
| 2457 |
+
#undef __APPROX_FCAST
|
| 2458 |
+
#undef __APPROX_FCAST2
|
| 2459 |
+
__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a)
|
| 2460 |
+
{
|
| 2461 |
+
__half2 r;
|
| 2462 |
+
asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}"
|
| 2463 |
+
:"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a)));
|
| 2464 |
+
return r;
|
| 2465 |
+
}
|
| 2466 |
+
__CUDA_FP16_DECL__ bool __hisnan(const __half a)
|
| 2467 |
+
{
|
| 2468 |
+
__half r;
|
| 2469 |
+
asm("{set.nan.f16.f16 %0,%1,%2;\n}"
|
| 2470 |
+
:"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a)));
|
| 2471 |
+
return __HALF_TO_CUS(r) != 0U;
|
| 2472 |
+
}
|
| 2473 |
+
__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a)
|
| 2474 |
+
{
|
| 2475 |
+
__half2 r;
|
| 2476 |
+
asm("{neg.f16x2 %0,%1;\n}"
|
| 2477 |
+
:"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
|
| 2478 |
+
return r;
|
| 2479 |
+
}
|
| 2480 |
+
__CUDA_FP16_DECL__ __half __hneg(const __half a)
|
| 2481 |
+
{
|
| 2482 |
+
__half r;
|
| 2483 |
+
asm("{neg.f16 %0,%1;\n}"
|
| 2484 |
+
:"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
|
| 2485 |
+
return r;
|
| 2486 |
+
}
|
| 2487 |
+
__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a)
|
| 2488 |
+
{
|
| 2489 |
+
__half2 r;
|
| 2490 |
+
asm("{abs.f16x2 %0,%1;\n}"
|
| 2491 |
+
:"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
|
| 2492 |
+
return r;
|
| 2493 |
+
}
|
| 2494 |
+
__CUDA_FP16_DECL__ __half __habs(const __half a)
|
| 2495 |
+
{
|
| 2496 |
+
__half r;
|
| 2497 |
+
asm("{abs.f16 %0,%1;\n}"
|
| 2498 |
+
:"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
|
| 2499 |
+
return r;
|
| 2500 |
+
}
|
| 2501 |
+
|
| 2502 |
+
__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c)
|
| 2503 |
+
{
|
| 2504 |
+
// fast version of complex multiply-accumulate
|
| 2505 |
+
// (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
|
| 2506 |
+
// acc.re = (c.re + a.re*b.re) - a.im*b.im
|
| 2507 |
+
// acc.im = (c.im + a.re*b.im) + a.im*b.re
|
| 2508 |
+
__half real_tmp = __hfma(a.x, b.x, c.x);
|
| 2509 |
+
__half img_tmp = __hfma(a.x, b.y, c.y);
|
| 2510 |
+
real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
|
| 2511 |
+
img_tmp = __hfma(a.y, b.x, img_tmp);
|
| 2512 |
+
return make_half2(real_tmp, img_tmp);
|
| 2513 |
+
}
|
| 2514 |
+
|
| 2515 |
+
#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
|
| 2516 |
+
|
| 2517 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
|
| 2518 |
+
__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
|
| 2519 |
+
{
|
| 2520 |
+
__BINARY_OP_HALF_MACRO(max.NaN)
|
| 2521 |
+
}
|
| 2522 |
+
__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b)
|
| 2523 |
+
{
|
| 2524 |
+
__BINARY_OP_HALF_MACRO(min.NaN)
|
| 2525 |
+
}
|
| 2526 |
+
__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
|
| 2527 |
+
{
|
| 2528 |
+
__TERNARY_OP_HALF_MACRO(fma.rn.relu)
|
| 2529 |
+
}
|
| 2530 |
+
|
| 2531 |
+
__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
|
| 2532 |
+
{
|
| 2533 |
+
__BINARY_OP_HALF2_MACRO(max.NaN)
|
| 2534 |
+
}
|
| 2535 |
+
__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b)
|
| 2536 |
+
{
|
| 2537 |
+
__BINARY_OP_HALF2_MACRO(min.NaN)
|
| 2538 |
+
}
|
| 2539 |
+
__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c)
|
| 2540 |
+
{
|
| 2541 |
+
__TERNARY_OP_HALF2_MACRO(fma.rn.relu)
|
| 2542 |
+
}
|
| 2543 |
+
#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/
|
| 2544 |
+
|
| 2545 |
+
/* Define __PTR for atomicAdd prototypes below, undef after done */
|
| 2546 |
+
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
|
| 2547 |
+
#define __PTR "l"
|
| 2548 |
+
#else
|
| 2549 |
+
#define __PTR "r"
|
| 2550 |
+
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
|
| 2551 |
+
|
| 2552 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
| 2553 |
+
|
| 2554 |
+
__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) {
|
| 2555 |
+
__half2 r;
|
| 2556 |
+
asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n"
|
| 2557 |
+
: "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val))
|
| 2558 |
+
: "memory");
|
| 2559 |
+
return r;
|
| 2560 |
+
}
|
| 2561 |
+
|
| 2562 |
+
#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/
|
| 2563 |
+
|
| 2564 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
|
| 2565 |
+
|
| 2566 |
+
__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) {
|
| 2567 |
+
__half r;
|
| 2568 |
+
asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
|
| 2569 |
+
: "=h"(__HALF_TO_US(r))
|
| 2570 |
+
: __PTR(address), "h"(__HALF_TO_CUS(val))
|
| 2571 |
+
: "memory");
|
| 2572 |
+
return r;
|
| 2573 |
+
}
|
| 2574 |
+
|
| 2575 |
+
#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/
|
| 2576 |
+
|
| 2577 |
+
#undef __PTR
|
| 2578 |
+
|
| 2579 |
+
#undef __CUDA_FP16_DECL__
|
| 2580 |
+
#endif /* defined(__CUDACC__) */
|
| 2581 |
+
#endif /* defined(__cplusplus) */
|
| 2582 |
+
|
| 2583 |
+
#undef __TERNARY_OP_HALF2_MACRO
|
| 2584 |
+
#undef __TERNARY_OP_HALF_MACRO
|
| 2585 |
+
#undef __BINARY_OP_HALF2_MACRO
|
| 2586 |
+
#undef __BINARY_OP_HALF_MACRO
|
| 2587 |
+
|
| 2588 |
+
#undef __CUDA_HOSTDEVICE_FP16_DECL__
|
| 2589 |
+
#undef __CUDA_FP16_DECL__
|
| 2590 |
+
|
| 2591 |
+
#undef __HALF_TO_US
|
| 2592 |
+
#undef __HALF_TO_CUS
|
| 2593 |
+
#undef __HALF2_TO_UI
|
| 2594 |
+
#undef __HALF2_TO_CUI
|
| 2595 |
+
|
| 2596 |
+
/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
|
| 2597 |
+
/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
|
| 2598 |
+
#if defined(__cplusplus) && !defined(CUDA_NO_HALF)
|
| 2599 |
+
typedef __half half;
|
| 2600 |
+
typedef __half2 half2;
|
| 2601 |
+
// for consistency with __nv_bfloat16
|
| 2602 |
+
typedef __half __nv_half;
|
| 2603 |
+
typedef __half2 __nv_half2;
|
| 2604 |
+
typedef __half_raw __nv_half_raw;
|
| 2605 |
+
typedef __half2_raw __nv_half2_raw;
|
| 2606 |
+
typedef __half nv_half;
|
| 2607 |
+
typedef __half2 nv_half2;
|
| 2608 |
+
#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */
|
| 2609 |
+
|
| 2610 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
|
| 2611 |
+
#undef __CPP_VERSION_AT_LEAST_11_FP16
|
| 2612 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
|
| 2613 |
+
|
| 2614 |
+
#endif /* end of include guard: __CUDA_FP16_HPP__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2022 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#ifndef __CUDA_FP8_H__
|
| 51 |
+
#define __CUDA_FP8_H__
|
| 52 |
+
|
| 53 |
+
/* Set up function decorations */
|
| 54 |
+
#if defined(__CUDACC__)
|
| 55 |
+
#define __CUDA_FP8_DECL__ static __device__ __inline__
|
| 56 |
+
#define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
|
| 57 |
+
#define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
|
| 58 |
+
#else /* !defined(__CUDACC__) */
|
| 59 |
+
#if defined(__GNUC__)
|
| 60 |
+
#define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
|
| 61 |
+
#else
|
| 62 |
+
#define __CUDA_HOSTDEVICE_FP8_DECL__ static
|
| 63 |
+
#endif /* defined(__GNUC__) */
|
| 64 |
+
#define __CUDA_HOSTDEVICE_FP8__
|
| 65 |
+
#endif /* defined(__CUDACC_) */
|
| 66 |
+
|
| 67 |
+
#if !defined(_MSC_VER) && __cplusplus >= 201103L
|
| 68 |
+
#define __CPP_VERSION_AT_LEAST_11_FP8
|
| 69 |
+
#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
|
| 70 |
+
#define __CPP_VERSION_AT_LEAST_11_FP8
|
| 71 |
+
#endif
|
| 72 |
+
|
| 73 |
+
/* bring in __half_raw data type */
|
| 74 |
+
#include "cuda_fp16.h"
|
| 75 |
+
/* bring in __nv_bfloat16_raw data type */
|
| 76 |
+
#include "cuda_bf16.h"
|
| 77 |
+
/* bring in float2, double4, etc vector types */
|
| 78 |
+
#include "vector_types.h"
|
| 79 |
+
|
| 80 |
+
/**
|
| 81 |
+
* \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
|
| 82 |
+
* This section describes fp8 intrinsic functions.
|
| 83 |
+
* To use these functions, include the header file \p cuda_fp8.h in your
|
| 84 |
+
* program.
|
| 85 |
+
*/
|
| 86 |
+
|
| 87 |
+
/**
|
| 88 |
+
* \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
|
| 89 |
+
* \ingroup CUDA_MATH_INTRINSIC_FP8
|
| 90 |
+
* To use these functions, include the header file \p cuda_fp8.h in your
|
| 91 |
+
* program.
|
| 92 |
+
*/
|
| 93 |
+
|
| 94 |
+
/**
|
| 95 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 96 |
+
* \brief 8-bit \p unsigned \p integer
|
| 97 |
+
* type abstraction used to for \p fp8 floating-point
|
| 98 |
+
* numbers storage.
|
| 99 |
+
*/
|
| 100 |
+
typedef unsigned char __nv_fp8_storage_t;
|
| 101 |
+
|
| 102 |
+
/**
|
| 103 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 104 |
+
* \brief 16-bit \p unsigned \p integer
|
| 105 |
+
* type abstraction used to for storage of pairs of
|
| 106 |
+
* \p fp8 floating-point numbers.
|
| 107 |
+
*/
|
| 108 |
+
typedef unsigned short int __nv_fp8x2_storage_t;
|
| 109 |
+
|
| 110 |
+
/**
|
| 111 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 112 |
+
* \brief 32-bit \p unsigned \p integer
|
| 113 |
+
* type abstraction used to for storage of tetrads of
|
| 114 |
+
* \p fp8 floating-point numbers.
|
| 115 |
+
*/
|
| 116 |
+
typedef unsigned int __nv_fp8x4_storage_t;
|
| 117 |
+
|
| 118 |
+
/**
|
| 119 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 120 |
+
* \brief Enumerates the modes applicable when
|
| 121 |
+
* performing a narrowing conversion to \p fp8 destination types.
|
| 122 |
+
*/
|
| 123 |
+
typedef enum __nv_saturation_t {
|
| 124 |
+
/**
|
| 125 |
+
* Means no saturation to finite is performed when conversion
|
| 126 |
+
* results in rounding values outside the range of destination
|
| 127 |
+
* type.
|
| 128 |
+
* NOTE: for fp8 type of e4m3 kind, the results that are larger
|
| 129 |
+
* than the maximum representable finite number of the target
|
| 130 |
+
* format become NaN.
|
| 131 |
+
*/
|
| 132 |
+
__NV_NOSAT,
|
| 133 |
+
/**
|
| 134 |
+
* Means input larger than the maximum representable
|
| 135 |
+
* finite number MAXNORM of the target format round to the
|
| 136 |
+
* MAXNORM of the same sign as input.
|
| 137 |
+
*/
|
| 138 |
+
__NV_SATFINITE,
|
| 139 |
+
} __nv_saturation_t;
|
| 140 |
+
|
| 141 |
+
/**
|
| 142 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 143 |
+
* \brief Enumerates the possible
|
| 144 |
+
* interpretations of the 8-bit values when referring to them as
|
| 145 |
+
* \p fp8 types.
|
| 146 |
+
*/
|
| 147 |
+
typedef enum __nv_fp8_interpretation_t {
|
| 148 |
+
__NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
|
| 149 |
+
__NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
|
| 150 |
+
} __nv_fp8_interpretation_t;
|
| 151 |
+
|
| 152 |
+
/* Forward-declaration of C-style APIs */
|
| 153 |
+
|
| 154 |
+
/**
|
| 155 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 156 |
+
* \brief Converts input \p double precision \p x to \p fp8 type of the
|
| 157 |
+
* requested kind using round-to-nearest-even rounding and requested saturation
|
| 158 |
+
* mode.
|
| 159 |
+
*
|
| 160 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 161 |
+
* \p fp8_interpretation parameter,
|
| 162 |
+
* using round-to-nearest-even rounding and
|
| 163 |
+
* saturation mode specified by \p saturate parameter.
|
| 164 |
+
*
|
| 165 |
+
* \returns
|
| 166 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 167 |
+
*/
|
| 168 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
|
| 169 |
+
__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
|
| 170 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 171 |
+
|
| 172 |
+
/**
|
| 173 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 174 |
+
* \brief Converts input vector of two \p double precision numbers packed
|
| 175 |
+
* in \p double2 \p x into a vector of two values of \p fp8 type of
|
| 176 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 177 |
+
* saturation mode.
|
| 178 |
+
*
|
| 179 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 180 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 181 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 182 |
+
* parameter.
|
| 183 |
+
*
|
| 184 |
+
* \returns
|
| 185 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 186 |
+
*/
|
| 187 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
|
| 188 |
+
__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
|
| 189 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 190 |
+
|
| 191 |
+
/**
|
| 192 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 193 |
+
* \brief Converts input \p single precision \p x to \p fp8 type of the
|
| 194 |
+
* requested kind using round-to-nearest-even rounding and requested saturation
|
| 195 |
+
* mode.
|
| 196 |
+
*
|
| 197 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 198 |
+
* \p fp8_interpretation parameter,
|
| 199 |
+
* using round-to-nearest-even rounding and
|
| 200 |
+
* saturation mode specified by \p saturate parameter.
|
| 201 |
+
*
|
| 202 |
+
* \returns
|
| 203 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 204 |
+
*/
|
| 205 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
|
| 206 |
+
__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
|
| 207 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 208 |
+
|
| 209 |
+
/**
|
| 210 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 211 |
+
* \brief Converts input vector of two \p single precision numbers packed
|
| 212 |
+
* in \p float2 \p x into a vector of two values of \p fp8 type of
|
| 213 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 214 |
+
* saturation mode.
|
| 215 |
+
*
|
| 216 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 217 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 218 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 219 |
+
* parameter.
|
| 220 |
+
*
|
| 221 |
+
* \returns
|
| 222 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 223 |
+
*/
|
| 224 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
|
| 225 |
+
__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
|
| 226 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 227 |
+
|
| 228 |
+
/**
|
| 229 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 230 |
+
* \brief Converts input \p half precision \p x to \p fp8 type of the requested
|
| 231 |
+
* kind using round-to-nearest-even rounding and requested saturation mode.
|
| 232 |
+
*
|
| 233 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 234 |
+
* \p fp8_interpretation parameter,
|
| 235 |
+
* using round-to-nearest-even rounding and
|
| 236 |
+
* saturation mode specified by \p saturate parameter.
|
| 237 |
+
*
|
| 238 |
+
* \returns
|
| 239 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 240 |
+
*/
|
| 241 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
|
| 242 |
+
__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
|
| 243 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 244 |
+
|
| 245 |
+
/**
|
| 246 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 247 |
+
* \brief Converts input vector of two \p half precision numbers packed
|
| 248 |
+
* in \p __half2_raw \p x into a vector of two values of \p fp8 type of
|
| 249 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 250 |
+
* saturation mode.
|
| 251 |
+
*
|
| 252 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 253 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 254 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 255 |
+
* parameter.
|
| 256 |
+
*
|
| 257 |
+
* \returns
|
| 258 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 259 |
+
*/
|
| 260 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
|
| 261 |
+
const __half2_raw x, const __nv_saturation_t saturate,
|
| 262 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 263 |
+
|
| 264 |
+
/**
|
| 265 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 266 |
+
* \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
|
| 267 |
+
* requested kind using round-to-nearest-even rounding and requested saturation
|
| 268 |
+
* mode.
|
| 269 |
+
*
|
| 270 |
+
* \details Converts input \p x to \p fp8 type of the kind specified by
|
| 271 |
+
* \p fp8_interpretation parameter,
|
| 272 |
+
* using round-to-nearest-even rounding and
|
| 273 |
+
* saturation mode specified by \p saturate parameter.
|
| 274 |
+
*
|
| 275 |
+
* \returns
|
| 276 |
+
* - The \p __nv_fp8_storage_t value holds the result of conversion.
|
| 277 |
+
*/
|
| 278 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
|
| 279 |
+
const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
|
| 280 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 281 |
+
|
| 282 |
+
/**
|
| 283 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 284 |
+
* \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
|
| 285 |
+
* in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
|
| 286 |
+
* the requested kind using round-to-nearest-even rounding and requested
|
| 287 |
+
* saturation mode.
|
| 288 |
+
*
|
| 289 |
+
* \details Converts input vector \p x to a vector of two \p fp8 values of the
|
| 290 |
+
* kind specified by \p fp8_interpretation parameter, using
|
| 291 |
+
* round-to-nearest-even rounding and saturation mode specified by \p saturate
|
| 292 |
+
* parameter.
|
| 293 |
+
*
|
| 294 |
+
* \returns
|
| 295 |
+
* - The \p __nv_fp8x2_storage_t value holds the result of conversion.
|
| 296 |
+
*/
|
| 297 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
|
| 298 |
+
__nv_cvt_bfloat16raw2_to_fp8x2(
|
| 299 |
+
const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
|
| 300 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 301 |
+
|
| 302 |
+
/**
|
| 303 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 304 |
+
* \brief Converts input \p fp8 \p x of the specified kind
|
| 305 |
+
* to \p half precision.
|
| 306 |
+
*
|
| 307 |
+
* \details Converts input \p x of \p fp8 type of the kind specified by
|
| 308 |
+
* \p fp8_interpretation parameter
|
| 309 |
+
* to \p half precision.
|
| 310 |
+
*
|
| 311 |
+
* \returns
|
| 312 |
+
* - The \p __half_raw value holds the result of conversion.
|
| 313 |
+
*/
|
| 314 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
|
| 315 |
+
__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
|
| 316 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 317 |
+
/**
|
| 318 |
+
* \ingroup CUDA_MATH_FP8_MISC
|
| 319 |
+
* \brief Converts input vector of two \p fp8 values of the specified kind
|
| 320 |
+
* to a vector of two \p half precision values packed in \p __half2_raw
|
| 321 |
+
* structure.
|
| 322 |
+
*
|
| 323 |
+
* \details Converts input vector \p x of \p fp8 type of the kind specified by
|
| 324 |
+
* \p fp8_interpretation parameter
|
| 325 |
+
* to a vector of two \p half precision values and returns as \p __half2_raw
|
| 326 |
+
* structure.
|
| 327 |
+
*
|
| 328 |
+
* \returns
|
| 329 |
+
* - The \p __half2_raw value holds the result of conversion.
|
| 330 |
+
*/
|
| 331 |
+
__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
|
| 332 |
+
__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
|
| 333 |
+
const __nv_fp8_interpretation_t fp8_interpretation);
|
| 334 |
+
|
| 335 |
+
#if defined(__cplusplus)
|
| 336 |
+
|
| 337 |
+
#define __CUDA_FP8_TYPES_EXIST__
|
| 338 |
+
|
| 339 |
+
/* Forward-declaration of structures defined in "cuda_fp8.hpp" */
|
| 340 |
+
struct __nv_fp8_e5m2;
|
| 341 |
+
struct __nv_fp8x2_e5m2;
|
| 342 |
+
struct __nv_fp8x4_e5m2;
|
| 343 |
+
|
| 344 |
+
struct __nv_fp8_e4m3;
|
| 345 |
+
struct __nv_fp8x2_e4m3;
|
| 346 |
+
struct __nv_fp8x4_e4m3;
|
| 347 |
+
|
| 348 |
+
#endif /* defined(__cplusplus) */
|
| 349 |
+
|
| 350 |
+
#include "cuda_fp8.hpp"
|
| 351 |
+
|
| 352 |
+
#undef __CUDA_FP8_DECL__
|
| 353 |
+
#undef __CUDA_HOSTDEVICE_FP8__
|
| 354 |
+
#undef __CUDA_HOSTDEVICE_FP8_DECL__
|
| 355 |
+
|
| 356 |
+
#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
|
| 357 |
+
#undef __CPP_VERSION_AT_LEAST_11_FP8
|
| 358 |
+
#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
|
| 359 |
+
|
| 360 |
+
#endif /* end of include guard: __CUDA_FP8_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h
ADDED
|
@@ -0,0 +1,508 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_GL_INTEROP_H__)
|
| 51 |
+
#define __CUDA_GL_INTEROP_H__
|
| 52 |
+
|
| 53 |
+
#include "cuda_runtime_api.h"
|
| 54 |
+
|
| 55 |
+
#if defined(__APPLE__)
|
| 56 |
+
|
| 57 |
+
#include <OpenGL/gl.h>
|
| 58 |
+
|
| 59 |
+
#else /* __APPLE__ */
|
| 60 |
+
|
| 61 |
+
#if defined(__arm__) || defined(__aarch64__)
|
| 62 |
+
#ifndef GL_VERSION
|
| 63 |
+
#error Please include the appropriate gl headers before including cuda_gl_interop.h
|
| 64 |
+
#endif
|
| 65 |
+
#else
|
| 66 |
+
#include <GL/gl.h>
|
| 67 |
+
#endif
|
| 68 |
+
|
| 69 |
+
#endif /* __APPLE__ */
|
| 70 |
+
|
| 71 |
+
/** \cond impl_private */
|
| 72 |
+
#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
|
| 73 |
+
#define __CUDA_DEPRECATED
|
| 74 |
+
#elif defined(_MSC_VER)
|
| 75 |
+
#define __CUDA_DEPRECATED __declspec(deprecated)
|
| 76 |
+
#elif defined(__GNUC__)
|
| 77 |
+
#define __CUDA_DEPRECATED __attribute__((deprecated))
|
| 78 |
+
#else
|
| 79 |
+
#define __CUDA_DEPRECATED
|
| 80 |
+
#endif
|
| 81 |
+
/** \endcond impl_private */
|
| 82 |
+
|
| 83 |
+
#if defined(__cplusplus)
|
| 84 |
+
extern "C" {
|
| 85 |
+
#endif /* __cplusplus */
|
| 86 |
+
|
| 87 |
+
/**
|
| 88 |
+
* \addtogroup CUDART_OPENGL OpenGL Interoperability
|
| 89 |
+
* This section describes the OpenGL interoperability functions of the CUDA
|
| 90 |
+
* runtime application programming interface. Note that mapping of OpenGL
|
| 91 |
+
* resources is performed with the graphics API agnostic, resource mapping
|
| 92 |
+
* interface described in \ref CUDART_INTEROP "Graphics Interopability".
|
| 93 |
+
*
|
| 94 |
+
* @{
|
| 95 |
+
*/
|
| 96 |
+
|
| 97 |
+
/**
|
| 98 |
+
* CUDA devices corresponding to the current OpenGL context
|
| 99 |
+
*/
|
| 100 |
+
enum cudaGLDeviceList
|
| 101 |
+
{
|
| 102 |
+
cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
|
| 103 |
+
cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
|
| 104 |
+
cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
|
| 105 |
+
};
|
| 106 |
+
|
| 107 |
+
/**
|
| 108 |
+
* \brief Gets the CUDA devices associated with the current OpenGL context
|
| 109 |
+
*
|
| 110 |
+
* Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
|
| 111 |
+
* corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
|
| 112 |
+
* at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
|
| 113 |
+
* the current OpenGL context. If any of the GPUs being used by the current OpenGL
|
| 114 |
+
* context are not CUDA capable then the call will return ::cudaErrorNoDevice.
|
| 115 |
+
*
|
| 116 |
+
* \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
|
| 117 |
+
* current OpenGL context
|
| 118 |
+
* \param pCudaDevices - Returned CUDA devices corresponding to the current
|
| 119 |
+
* OpenGL context
|
| 120 |
+
* \param cudaDeviceCount - The size of the output device array \p pCudaDevices
|
| 121 |
+
* \param deviceList - The set of devices to return. This set may be
|
| 122 |
+
* ::cudaGLDeviceListAll for all devices,
|
| 123 |
+
* ::cudaGLDeviceListCurrentFrame for the devices used to
|
| 124 |
+
* render the current frame (in SLI), or
|
| 125 |
+
* ::cudaGLDeviceListNextFrame for the devices used to
|
| 126 |
+
* render the next frame (in SLI).
|
| 127 |
+
*
|
| 128 |
+
* \return
|
| 129 |
+
* ::cudaSuccess,
|
| 130 |
+
* ::cudaErrorNoDevice,
|
| 131 |
+
* ::cudaErrorInvalidGraphicsContext,
|
| 132 |
+
* ::cudaErrorUnknown
|
| 133 |
+
*
|
| 134 |
+
* \note This function is not supported on Mac OS X.
|
| 135 |
+
* \notefnerr
|
| 136 |
+
*
|
| 137 |
+
* \sa
|
| 138 |
+
* ::cudaGraphicsUnregisterResource,
|
| 139 |
+
* ::cudaGraphicsMapResources,
|
| 140 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 141 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 142 |
+
* ::cuGLGetDevices
|
| 143 |
+
*/
|
| 144 |
+
extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
|
| 145 |
+
|
| 146 |
+
/**
|
| 147 |
+
* \brief Register an OpenGL texture or renderbuffer object
|
| 148 |
+
*
|
| 149 |
+
* Registers the texture or renderbuffer object specified by \p image for access by CUDA.
|
| 150 |
+
* A handle to the registered object is returned as \p resource.
|
| 151 |
+
*
|
| 152 |
+
* \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
|
| 153 |
+
* ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
|
| 154 |
+
* or ::GL_RENDERBUFFER.
|
| 155 |
+
*
|
| 156 |
+
* The register flags \p flags specify the intended usage, as follows:
|
| 157 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 158 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 159 |
+
* read from and written to by CUDA. This is the default value.
|
| 160 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 161 |
+
* will not write to this resource.
|
| 162 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 163 |
+
* CUDA will not read from this resource and will write over the
|
| 164 |
+
* entire contents of the resource, so none of the data previously
|
| 165 |
+
* stored in the resource will be preserved.
|
| 166 |
+
* - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
|
| 167 |
+
* bind this resource to a surface reference.
|
| 168 |
+
* - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
|
| 169 |
+
* texture gather operations on this resource.
|
| 170 |
+
*
|
| 171 |
+
* The following image formats are supported. For brevity's sake, the list is abbreviated.
|
| 172 |
+
* For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
|
| 173 |
+
* {GL_R8, GL_R16, GL_RG8, GL_RG16} :
|
| 174 |
+
* - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
|
| 175 |
+
* - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
|
| 176 |
+
* - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
|
| 177 |
+
* {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
|
| 178 |
+
*
|
| 179 |
+
* The following image classes are currently disallowed:
|
| 180 |
+
* - Textures with borders
|
| 181 |
+
* - Multisampled renderbuffers
|
| 182 |
+
*
|
| 183 |
+
* \param resource - Pointer to the returned object handle
|
| 184 |
+
* \param image - name of texture or renderbuffer object to be registered
|
| 185 |
+
* \param target - Identifies the type of object specified by \p image
|
| 186 |
+
* \param flags - Register flags
|
| 187 |
+
*
|
| 188 |
+
* \return
|
| 189 |
+
* ::cudaSuccess,
|
| 190 |
+
* ::cudaErrorInvalidDevice,
|
| 191 |
+
* ::cudaErrorInvalidValue,
|
| 192 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 193 |
+
* ::cudaErrorUnknown
|
| 194 |
+
* \notefnerr
|
| 195 |
+
*
|
| 196 |
+
* \sa
|
| 197 |
+
* ::cudaGraphicsUnregisterResource,
|
| 198 |
+
* ::cudaGraphicsMapResources,
|
| 199 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 200 |
+
* ::cuGraphicsGLRegisterImage
|
| 201 |
+
*/
|
| 202 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
|
| 203 |
+
|
| 204 |
+
/**
|
| 205 |
+
* \brief Registers an OpenGL buffer object
|
| 206 |
+
*
|
| 207 |
+
* Registers the buffer object specified by \p buffer for access by
|
| 208 |
+
* CUDA. A handle to the registered object is returned as \p
|
| 209 |
+
* resource. The register flags \p flags specify the intended usage,
|
| 210 |
+
* as follows:
|
| 211 |
+
*
|
| 212 |
+
* - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
|
| 213 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 214 |
+
* read from and written to by CUDA. This is the default value.
|
| 215 |
+
* - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
|
| 216 |
+
* will not write to this resource.
|
| 217 |
+
* - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
|
| 218 |
+
* CUDA will not read from this resource and will write over the
|
| 219 |
+
* entire contents of the resource, so none of the data previously
|
| 220 |
+
* stored in the resource will be preserved.
|
| 221 |
+
*
|
| 222 |
+
* \param resource - Pointer to the returned object handle
|
| 223 |
+
* \param buffer - name of buffer object to be registered
|
| 224 |
+
* \param flags - Register flags
|
| 225 |
+
*
|
| 226 |
+
* \return
|
| 227 |
+
* ::cudaSuccess,
|
| 228 |
+
* ::cudaErrorInvalidDevice,
|
| 229 |
+
* ::cudaErrorInvalidValue,
|
| 230 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 231 |
+
* ::cudaErrorUnknown
|
| 232 |
+
* \notefnerr
|
| 233 |
+
*
|
| 234 |
+
* \sa
|
| 235 |
+
* ::cudaGraphicsUnregisterResource,
|
| 236 |
+
* ::cudaGraphicsMapResources,
|
| 237 |
+
* ::cudaGraphicsResourceGetMappedPointer,
|
| 238 |
+
* ::cuGraphicsGLRegisterBuffer
|
| 239 |
+
*/
|
| 240 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
|
| 241 |
+
|
| 242 |
+
#ifdef _WIN32
|
| 243 |
+
#ifndef WGL_NV_gpu_affinity
|
| 244 |
+
typedef void* HGPUNV;
|
| 245 |
+
#endif
|
| 246 |
+
|
| 247 |
+
/**
|
| 248 |
+
* \brief Gets the CUDA device associated with hGpu
|
| 249 |
+
*
|
| 250 |
+
* Returns the CUDA device associated with a hGpu, if applicable.
|
| 251 |
+
*
|
| 252 |
+
* \param device - Returns the device associated with hGpu, or -1 if hGpu is
|
| 253 |
+
* not a compute device.
|
| 254 |
+
* \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
|
| 255 |
+
*
|
| 256 |
+
* \return
|
| 257 |
+
* ::cudaSuccess
|
| 258 |
+
* \notefnerr
|
| 259 |
+
*
|
| 260 |
+
* \sa
|
| 261 |
+
* ::WGL_NV_gpu_affinity,
|
| 262 |
+
* ::cuWGLGetDevice
|
| 263 |
+
*/
|
| 264 |
+
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
|
| 265 |
+
#endif
|
| 266 |
+
|
| 267 |
+
/** @} */ /* END CUDART_OPENGL */
|
| 268 |
+
|
| 269 |
+
/**
|
| 270 |
+
* \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
|
| 271 |
+
* This section describes deprecated OpenGL interoperability functionality.
|
| 272 |
+
*
|
| 273 |
+
* @{
|
| 274 |
+
*/
|
| 275 |
+
|
| 276 |
+
/**
|
| 277 |
+
* CUDA GL Map Flags
|
| 278 |
+
*/
|
| 279 |
+
enum cudaGLMapFlags
|
| 280 |
+
{
|
| 281 |
+
cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
|
| 282 |
+
cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
|
| 283 |
+
cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
|
| 284 |
+
};
|
| 285 |
+
|
| 286 |
+
/**
|
| 287 |
+
* \brief Sets a CUDA device to use OpenGL interoperability
|
| 288 |
+
*
|
| 289 |
+
* \deprecated This function is deprecated as of CUDA 5.0.
|
| 290 |
+
*
|
| 291 |
+
* This function is deprecated and should no longer be used. It is
|
| 292 |
+
* no longer necessary to associate a CUDA device with an OpenGL
|
| 293 |
+
* context in order to achieve maximum interoperability performance.
|
| 294 |
+
*
|
| 295 |
+
* \param device - Device to use for OpenGL interoperability
|
| 296 |
+
*
|
| 297 |
+
* \return
|
| 298 |
+
* ::cudaSuccess,
|
| 299 |
+
* ::cudaErrorInvalidDevice,
|
| 300 |
+
* ::cudaErrorSetOnActiveProcess
|
| 301 |
+
* \notefnerr
|
| 302 |
+
*
|
| 303 |
+
* \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
|
| 304 |
+
*/
|
| 305 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
|
| 306 |
+
|
| 307 |
+
/**
|
| 308 |
+
* \brief Registers a buffer object for access by CUDA
|
| 309 |
+
*
|
| 310 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 311 |
+
*
|
| 312 |
+
* Registers the buffer object of ID \p bufObj for access by
|
| 313 |
+
* CUDA. This function must be called before CUDA can map the buffer
|
| 314 |
+
* object. The OpenGL context used to create the buffer, or another
|
| 315 |
+
* context from the same share group, must be bound to the current
|
| 316 |
+
* thread when this is called.
|
| 317 |
+
*
|
| 318 |
+
* \param bufObj - Buffer object ID to register
|
| 319 |
+
*
|
| 320 |
+
* \return
|
| 321 |
+
* ::cudaSuccess,
|
| 322 |
+
* ::cudaErrorInitializationError
|
| 323 |
+
* \notefnerr
|
| 324 |
+
*
|
| 325 |
+
* \sa ::cudaGraphicsGLRegisterBuffer
|
| 326 |
+
*/
|
| 327 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
|
| 328 |
+
|
| 329 |
+
/**
|
| 330 |
+
* \brief Maps a buffer object for access by CUDA
|
| 331 |
+
*
|
| 332 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 333 |
+
*
|
| 334 |
+
* Maps the buffer object of ID \p bufObj into the address space of
|
| 335 |
+
* CUDA and returns in \p *devPtr the base pointer of the resulting
|
| 336 |
+
* mapping. The buffer must have previously been registered by
|
| 337 |
+
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
|
| 338 |
+
* by CUDA, any OpenGL operation which references the buffer will
|
| 339 |
+
* result in undefined behavior. The OpenGL context used to create
|
| 340 |
+
* the buffer, or another context from the same share group, must be
|
| 341 |
+
* bound to the current thread when this is called.
|
| 342 |
+
*
|
| 343 |
+
* All streams in the current thread are synchronized with the current
|
| 344 |
+
* GL context.
|
| 345 |
+
*
|
| 346 |
+
* \param devPtr - Returned device pointer to CUDA object
|
| 347 |
+
* \param bufObj - Buffer object ID to map
|
| 348 |
+
*
|
| 349 |
+
* \return
|
| 350 |
+
* ::cudaSuccess,
|
| 351 |
+
* ::cudaErrorMapBufferObjectFailed
|
| 352 |
+
* \notefnerr
|
| 353 |
+
*
|
| 354 |
+
* \sa ::cudaGraphicsMapResources
|
| 355 |
+
*/
|
| 356 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
|
| 357 |
+
|
| 358 |
+
/**
|
| 359 |
+
* \brief Unmaps a buffer object for access by CUDA
|
| 360 |
+
*
|
| 361 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 362 |
+
*
|
| 363 |
+
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
|
| 364 |
+
* a buffer is unmapped, the base address returned by
|
| 365 |
+
* ::cudaGLMapBufferObject() is invalid and subsequent references to
|
| 366 |
+
* the address result in undefined behavior. The OpenGL context used
|
| 367 |
+
* to create the buffer, or another context from the same share group,
|
| 368 |
+
* must be bound to the current thread when this is called.
|
| 369 |
+
*
|
| 370 |
+
* All streams in the current thread are synchronized with the current
|
| 371 |
+
* GL context.
|
| 372 |
+
*
|
| 373 |
+
* \param bufObj - Buffer object to unmap
|
| 374 |
+
*
|
| 375 |
+
* \return
|
| 376 |
+
* ::cudaSuccess,
|
| 377 |
+
* ::cudaErrorUnmapBufferObjectFailed
|
| 378 |
+
* \notefnerr
|
| 379 |
+
*
|
| 380 |
+
* \sa ::cudaGraphicsUnmapResources
|
| 381 |
+
*/
|
| 382 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
|
| 383 |
+
|
| 384 |
+
/**
|
| 385 |
+
* \brief Unregisters a buffer object for access by CUDA
|
| 386 |
+
*
|
| 387 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 388 |
+
*
|
| 389 |
+
* Unregisters the buffer object of ID \p bufObj for access by CUDA
|
| 390 |
+
* and releases any CUDA resources associated with the buffer. Once a
|
| 391 |
+
* buffer is unregistered, it may no longer be mapped by CUDA. The GL
|
| 392 |
+
* context used to create the buffer, or another context from the
|
| 393 |
+
* same share group, must be bound to the current thread when this is
|
| 394 |
+
* called.
|
| 395 |
+
*
|
| 396 |
+
* \param bufObj - Buffer object to unregister
|
| 397 |
+
*
|
| 398 |
+
* \return
|
| 399 |
+
* ::cudaSuccess
|
| 400 |
+
* \notefnerr
|
| 401 |
+
*
|
| 402 |
+
* \sa ::cudaGraphicsUnregisterResource
|
| 403 |
+
*/
|
| 404 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
|
| 405 |
+
|
| 406 |
+
/**
|
| 407 |
+
* \brief Set usage flags for mapping an OpenGL buffer
|
| 408 |
+
*
|
| 409 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 410 |
+
*
|
| 411 |
+
* Set flags for mapping the OpenGL buffer \p bufObj
|
| 412 |
+
*
|
| 413 |
+
* Changes to flags will take effect the next time \p bufObj is mapped.
|
| 414 |
+
* The \p flags argument may be any of the following:
|
| 415 |
+
*
|
| 416 |
+
* - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
|
| 417 |
+
* be used. It is therefore assumed that this buffer will be read from and
|
| 418 |
+
* written to by CUDA kernels. This is the default value.
|
| 419 |
+
* - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
|
| 420 |
+
* buffer will not write to the buffer.
|
| 421 |
+
* - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
|
| 422 |
+
* this buffer will not read from the buffer and will write over the
|
| 423 |
+
* entire contents of the buffer, so none of the data previously stored in
|
| 424 |
+
* the buffer will be preserved.
|
| 425 |
+
*
|
| 426 |
+
* If \p bufObj has not been registered for use with CUDA, then
|
| 427 |
+
* ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
|
| 428 |
+
* mapped for access by CUDA, then ::cudaErrorUnknown is returned.
|
| 429 |
+
*
|
| 430 |
+
* \param bufObj - Registered buffer object to set flags for
|
| 431 |
+
* \param flags - Parameters for buffer mapping
|
| 432 |
+
*
|
| 433 |
+
* \return
|
| 434 |
+
* ::cudaSuccess,
|
| 435 |
+
* ::cudaErrorInvalidValue,
|
| 436 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 437 |
+
* ::cudaErrorUnknown
|
| 438 |
+
* \notefnerr
|
| 439 |
+
*
|
| 440 |
+
* \sa ::cudaGraphicsResourceSetMapFlags
|
| 441 |
+
*/
|
| 442 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
|
| 443 |
+
|
| 444 |
+
/**
|
| 445 |
+
* \brief Maps a buffer object for access by CUDA
|
| 446 |
+
*
|
| 447 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 448 |
+
*
|
| 449 |
+
* Maps the buffer object of ID \p bufObj into the address space of
|
| 450 |
+
* CUDA and returns in \p *devPtr the base pointer of the resulting
|
| 451 |
+
* mapping. The buffer must have previously been registered by
|
| 452 |
+
* calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
|
| 453 |
+
* by CUDA, any OpenGL operation which references the buffer will
|
| 454 |
+
* result in undefined behavior. The OpenGL context used to create
|
| 455 |
+
* the buffer, or another context from the same share group, must be
|
| 456 |
+
* bound to the current thread when this is called.
|
| 457 |
+
*
|
| 458 |
+
* Stream /p stream is synchronized with the current GL context.
|
| 459 |
+
*
|
| 460 |
+
* \param devPtr - Returned device pointer to CUDA object
|
| 461 |
+
* \param bufObj - Buffer object ID to map
|
| 462 |
+
* \param stream - Stream to synchronize
|
| 463 |
+
*
|
| 464 |
+
* \return
|
| 465 |
+
* ::cudaSuccess,
|
| 466 |
+
* ::cudaErrorMapBufferObjectFailed
|
| 467 |
+
* \notefnerr
|
| 468 |
+
*
|
| 469 |
+
* \sa ::cudaGraphicsMapResources
|
| 470 |
+
*/
|
| 471 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
|
| 472 |
+
|
| 473 |
+
/**
|
| 474 |
+
* \brief Unmaps a buffer object for access by CUDA
|
| 475 |
+
*
|
| 476 |
+
* \deprecated This function is deprecated as of CUDA 3.0.
|
| 477 |
+
*
|
| 478 |
+
* Unmaps the buffer object of ID \p bufObj for access by CUDA. When
|
| 479 |
+
* a buffer is unmapped, the base address returned by
|
| 480 |
+
* ::cudaGLMapBufferObject() is invalid and subsequent references to
|
| 481 |
+
* the address result in undefined behavior. The OpenGL context used
|
| 482 |
+
* to create the buffer, or another context from the same share group,
|
| 483 |
+
* must be bound to the current thread when this is called.
|
| 484 |
+
*
|
| 485 |
+
* Stream /p stream is synchronized with the current GL context.
|
| 486 |
+
*
|
| 487 |
+
* \param bufObj - Buffer object to unmap
|
| 488 |
+
* \param stream - Stream to synchronize
|
| 489 |
+
*
|
| 490 |
+
* \return
|
| 491 |
+
* ::cudaSuccess,
|
| 492 |
+
* ::cudaErrorUnmapBufferObjectFailed
|
| 493 |
+
* \notefnerr
|
| 494 |
+
*
|
| 495 |
+
* \sa ::cudaGraphicsUnmapResources
|
| 496 |
+
*/
|
| 497 |
+
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
|
| 498 |
+
|
| 499 |
+
/** @} */ /* END CUDART_OPENGL_DEPRECATED */
|
| 500 |
+
|
| 501 |
+
#if defined(__cplusplus)
|
| 502 |
+
}
|
| 503 |
+
#endif /* __cplusplus */
|
| 504 |
+
|
| 505 |
+
#undef __CUDA_DEPRECATED
|
| 506 |
+
|
| 507 |
+
#endif /* __CUDA_GL_INTEROP_H__ */
|
| 508 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_VDPAU_INTEROP_H__)
|
| 51 |
+
#define __CUDA_VDPAU_INTEROP_H__
|
| 52 |
+
|
| 53 |
+
#include "cuda_runtime_api.h"
|
| 54 |
+
|
| 55 |
+
#include <vdpau/vdpau.h>
|
| 56 |
+
|
| 57 |
+
#if defined(__cplusplus)
|
| 58 |
+
extern "C" {
|
| 59 |
+
#endif /* __cplusplus */
|
| 60 |
+
|
| 61 |
+
/**
|
| 62 |
+
* \addtogroup CUDART_VDPAU VDPAU Interoperability
|
| 63 |
+
* This section describes the VDPAU interoperability functions of the CUDA
|
| 64 |
+
* runtime application programming interface.
|
| 65 |
+
*
|
| 66 |
+
* @{
|
| 67 |
+
*/
|
| 68 |
+
|
| 69 |
+
/**
|
| 70 |
+
* \brief Gets the CUDA device associated with a VdpDevice.
|
| 71 |
+
*
|
| 72 |
+
* Returns the CUDA device associated with a VdpDevice, if applicable.
|
| 73 |
+
*
|
| 74 |
+
* \param device - Returns the device associated with vdpDevice, or -1 if
|
| 75 |
+
* the device associated with vdpDevice is not a compute device.
|
| 76 |
+
* \param vdpDevice - A VdpDevice handle
|
| 77 |
+
* \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
|
| 78 |
+
*
|
| 79 |
+
* \return
|
| 80 |
+
* ::cudaSuccess
|
| 81 |
+
* \notefnerr
|
| 82 |
+
*
|
| 83 |
+
* \sa
|
| 84 |
+
* ::cudaVDPAUSetVDPAUDevice,
|
| 85 |
+
* ::cuVDPAUGetDevice
|
| 86 |
+
*/
|
| 87 |
+
extern __host__ cudaError_t CUDARTAPI cudaVDPAUGetDevice(int *device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 88 |
+
|
| 89 |
+
/**
|
| 90 |
+
* \brief Sets a CUDA device to use VDPAU interoperability
|
| 91 |
+
*
|
| 92 |
+
* Records \p vdpDevice as the VdpDevice for VDPAU interoperability
|
| 93 |
+
* with the CUDA device \p device and sets \p device as the current
|
| 94 |
+
* device for the calling host thread.
|
| 95 |
+
*
|
| 96 |
+
* If \p device has already been initialized then this call will fail
|
| 97 |
+
* with the error ::cudaErrorSetOnActiveProcess. In this case it is
|
| 98 |
+
* necessary to reset \p device using ::cudaDeviceReset() before
|
| 99 |
+
* VDPAU interoperability on \p device may be enabled.
|
| 100 |
+
*
|
| 101 |
+
* \param device - Device to use for VDPAU interoperability
|
| 102 |
+
* \param vdpDevice - The VdpDevice to interoperate with
|
| 103 |
+
* \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
|
| 104 |
+
*
|
| 105 |
+
* \return
|
| 106 |
+
* ::cudaSuccess,
|
| 107 |
+
* ::cudaErrorInvalidDevice,
|
| 108 |
+
* ::cudaErrorSetOnActiveProcess
|
| 109 |
+
* \notefnerr
|
| 110 |
+
*
|
| 111 |
+
* \sa ::cudaGraphicsVDPAURegisterVideoSurface,
|
| 112 |
+
* ::cudaGraphicsVDPAURegisterOutputSurface,
|
| 113 |
+
* ::cudaDeviceReset
|
| 114 |
+
*/
|
| 115 |
+
extern __host__ cudaError_t CUDARTAPI cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
|
| 116 |
+
|
| 117 |
+
/**
|
| 118 |
+
* \brief Register a VdpVideoSurface object
|
| 119 |
+
*
|
| 120 |
+
* Registers the VdpVideoSurface specified by \p vdpSurface for access by CUDA.
|
| 121 |
+
* A handle to the registered object is returned as \p resource.
|
| 122 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 123 |
+
*
|
| 124 |
+
* - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
|
| 125 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 126 |
+
* read from and written to by CUDA. This is the default value.
|
| 127 |
+
* - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
|
| 128 |
+
* will not write to this resource.
|
| 129 |
+
* - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
|
| 130 |
+
* CUDA will not read from this resource and will write over the
|
| 131 |
+
* entire contents of the resource, so none of the data previously
|
| 132 |
+
* stored in the resource will be preserved.
|
| 133 |
+
*
|
| 134 |
+
* \param resource - Pointer to the returned object handle
|
| 135 |
+
* \param vdpSurface - VDPAU object to be registered
|
| 136 |
+
* \param flags - Map flags
|
| 137 |
+
*
|
| 138 |
+
* \return
|
| 139 |
+
* ::cudaSuccess,
|
| 140 |
+
* ::cudaErrorInvalidDevice,
|
| 141 |
+
* ::cudaErrorInvalidValue,
|
| 142 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 143 |
+
* ::cudaErrorUnknown
|
| 144 |
+
* \notefnerr
|
| 145 |
+
*
|
| 146 |
+
* \sa
|
| 147 |
+
* ::cudaVDPAUSetVDPAUDevice,
|
| 148 |
+
* ::cudaGraphicsUnregisterResource,
|
| 149 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 150 |
+
* ::cuGraphicsVDPAURegisterVideoSurface
|
| 151 |
+
*/
|
| 152 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterVideoSurface(struct cudaGraphicsResource **resource, VdpVideoSurface vdpSurface, unsigned int flags);
|
| 153 |
+
|
| 154 |
+
/**
|
| 155 |
+
* \brief Register a VdpOutputSurface object
|
| 156 |
+
*
|
| 157 |
+
* Registers the VdpOutputSurface specified by \p vdpSurface for access by CUDA.
|
| 158 |
+
* A handle to the registered object is returned as \p resource.
|
| 159 |
+
* The surface's intended usage is specified using \p flags, as follows:
|
| 160 |
+
*
|
| 161 |
+
* - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
|
| 162 |
+
* resource will be used. It is therefore assumed that this resource will be
|
| 163 |
+
* read from and written to by CUDA. This is the default value.
|
| 164 |
+
* - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
|
| 165 |
+
* will not write to this resource.
|
| 166 |
+
* - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
|
| 167 |
+
* CUDA will not read from this resource and will write over the
|
| 168 |
+
* entire contents of the resource, so none of the data previously
|
| 169 |
+
* stored in the resource will be preserved.
|
| 170 |
+
*
|
| 171 |
+
* \param resource - Pointer to the returned object handle
|
| 172 |
+
* \param vdpSurface - VDPAU object to be registered
|
| 173 |
+
* \param flags - Map flags
|
| 174 |
+
*
|
| 175 |
+
* \return
|
| 176 |
+
* ::cudaSuccess,
|
| 177 |
+
* ::cudaErrorInvalidDevice,
|
| 178 |
+
* ::cudaErrorInvalidValue,
|
| 179 |
+
* ::cudaErrorInvalidResourceHandle,
|
| 180 |
+
* ::cudaErrorUnknown
|
| 181 |
+
* \notefnerr
|
| 182 |
+
*
|
| 183 |
+
* \sa
|
| 184 |
+
* ::cudaVDPAUSetVDPAUDevice,
|
| 185 |
+
* ::cudaGraphicsUnregisterResource,
|
| 186 |
+
* ::cudaGraphicsSubResourceGetMappedArray,
|
| 187 |
+
* ::cuGraphicsVDPAURegisterOutputSurface
|
| 188 |
+
*/
|
| 189 |
+
extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterOutputSurface(struct cudaGraphicsResource **resource, VdpOutputSurface vdpSurface, unsigned int flags);
|
| 190 |
+
|
| 191 |
+
/** @} */ /* END CUDART_VDPAU */
|
| 192 |
+
|
| 193 |
+
#if defined(__cplusplus)
|
| 194 |
+
}
|
| 195 |
+
#endif /* __cplusplus */
|
| 196 |
+
|
| 197 |
+
#endif /* __CUDA_VDPAU_INTEROP_H__ */
|
| 198 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
|
| 51 |
+
#if defined(_MSC_VER)
|
| 52 |
+
#pragma message("device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
|
| 53 |
+
#else
|
| 54 |
+
#warning "device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
|
| 55 |
+
#endif
|
| 56 |
+
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 57 |
+
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
#include "crt/device_functions.h"
|
| 61 |
+
|
| 62 |
+
#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__)
|
| 63 |
+
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
|
| 64 |
+
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
|
| 65 |
+
#endif
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_constants.h
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__MATH_CONSTANTS_H__)
|
| 51 |
+
#define __MATH_CONSTANTS_H__
|
| 52 |
+
|
| 53 |
+
/* single precision constants */
|
| 54 |
+
#define CUDART_INF_F __int_as_float(0x7f800000U)
|
| 55 |
+
#define CUDART_NAN_F __int_as_float(0x7fffffffU)
|
| 56 |
+
#define CUDART_MIN_DENORM_F __int_as_float(0x00000001U)
|
| 57 |
+
#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
|
| 58 |
+
#define CUDART_NEG_ZERO_F __int_as_float(0x80000000U)
|
| 59 |
+
#define CUDART_ZERO_F 0.0F
|
| 60 |
+
#define CUDART_ONE_F 1.0F
|
| 61 |
+
#define CUDART_SQRT_HALF_F 0.707106781F
|
| 62 |
+
#define CUDART_SQRT_HALF_HI_F 0.707106781F
|
| 63 |
+
#define CUDART_SQRT_HALF_LO_F 1.210161749e-08F
|
| 64 |
+
#define CUDART_SQRT_TWO_F 1.414213562F
|
| 65 |
+
#define CUDART_THIRD_F 0.333333333F
|
| 66 |
+
#define CUDART_PIO4_F 0.785398163F
|
| 67 |
+
#define CUDART_PIO2_F 1.570796327F
|
| 68 |
+
#define CUDART_3PIO4_F 2.356194490F
|
| 69 |
+
#define CUDART_2_OVER_PI_F 0.636619772F
|
| 70 |
+
#define CUDART_SQRT_2_OVER_PI_F 0.797884561F
|
| 71 |
+
#define CUDART_PI_F 3.141592654F
|
| 72 |
+
#define CUDART_L2E_F 1.442695041F
|
| 73 |
+
#define CUDART_L2T_F 3.321928094F
|
| 74 |
+
#define CUDART_LG2_F 0.301029996F
|
| 75 |
+
#define CUDART_LGE_F 0.434294482F
|
| 76 |
+
#define CUDART_LN2_F 0.693147181F
|
| 77 |
+
#define CUDART_LNT_F 2.302585093F
|
| 78 |
+
#define CUDART_LNPI_F 1.144729886F
|
| 79 |
+
#define CUDART_TWO_TO_M126_F 1.175494351e-38F
|
| 80 |
+
#define CUDART_TWO_TO_126_F 8.507059173e37F
|
| 81 |
+
#define CUDART_NORM_HUGE_F 3.402823466e38F
|
| 82 |
+
#define CUDART_TWO_TO_23_F 8388608.0F
|
| 83 |
+
#define CUDART_TWO_TO_24_F 16777216.0F
|
| 84 |
+
#define CUDART_TWO_TO_31_F 2147483648.0F
|
| 85 |
+
#define CUDART_TWO_TO_32_F 4294967296.0F
|
| 86 |
+
#define CUDART_REMQUO_BITS_F 3U
|
| 87 |
+
#define CUDART_REMQUO_MASK_F (~((~0U)<<CUDART_REMQUO_BITS_F))
|
| 88 |
+
#define CUDART_TRIG_PLOSS_F 105615.0F
|
| 89 |
+
|
| 90 |
+
/* double precision constants */
|
| 91 |
+
#define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL)
|
| 92 |
+
#define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL)
|
| 93 |
+
#define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
|
| 94 |
+
#define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
|
| 95 |
+
#define CUDART_ZERO 0.0
|
| 96 |
+
#define CUDART_ONE 1.0
|
| 97 |
+
#define CUDART_SQRT_TWO 1.4142135623730951e+0
|
| 98 |
+
#define CUDART_SQRT_HALF 7.0710678118654757e-1
|
| 99 |
+
#define CUDART_SQRT_HALF_HI 7.0710678118654757e-1
|
| 100 |
+
#define CUDART_SQRT_HALF_LO (-4.8336466567264567e-17)
|
| 101 |
+
#define CUDART_THIRD 3.3333333333333333e-1
|
| 102 |
+
#define CUDART_TWOTHIRD 6.6666666666666667e-1
|
| 103 |
+
#define CUDART_PIO4 7.8539816339744828e-1
|
| 104 |
+
#define CUDART_PIO4_HI 7.8539816339744828e-1
|
| 105 |
+
#define CUDART_PIO4_LO 3.0616169978683830e-17
|
| 106 |
+
#define CUDART_PIO2 1.5707963267948966e+0
|
| 107 |
+
#define CUDART_PIO2_HI 1.5707963267948966e+0
|
| 108 |
+
#define CUDART_PIO2_LO 6.1232339957367660e-17
|
| 109 |
+
#define CUDART_3PIO4 2.3561944901923448e+0
|
| 110 |
+
#define CUDART_2_OVER_PI 6.3661977236758138e-1
|
| 111 |
+
#define CUDART_PI 3.1415926535897931e+0
|
| 112 |
+
#define CUDART_PI_HI 3.1415926535897931e+0
|
| 113 |
+
#define CUDART_PI_LO 1.2246467991473532e-16
|
| 114 |
+
#define CUDART_SQRT_2PI 2.5066282746310007e+0
|
| 115 |
+
#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0
|
| 116 |
+
#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)
|
| 117 |
+
#define CUDART_SQRT_PIO2 1.2533141373155003e+0
|
| 118 |
+
#define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0
|
| 119 |
+
#define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17)
|
| 120 |
+
#define CUDART_SQRT_2OPI 7.9788456080286536e-1
|
| 121 |
+
#define CUDART_L2E 1.4426950408889634e+0
|
| 122 |
+
#define CUDART_L2E_HI 1.4426950408889634e+0
|
| 123 |
+
#define CUDART_L2E_LO 2.0355273740931033e-17
|
| 124 |
+
#define CUDART_L2T 3.3219280948873622e+0
|
| 125 |
+
#define CUDART_LG2 3.0102999566398120e-1
|
| 126 |
+
#define CUDART_LG2_HI 3.0102999566398120e-1
|
| 127 |
+
#define CUDART_LG2_LO (-2.8037281277851704e-18)
|
| 128 |
+
#define CUDART_LGE 4.3429448190325182e-1
|
| 129 |
+
#define CUDART_LGE_HI 4.3429448190325182e-1
|
| 130 |
+
#define CUDART_LGE_LO 1.09831965021676510e-17
|
| 131 |
+
#define CUDART_LN2 6.9314718055994529e-1
|
| 132 |
+
#define CUDART_LN2_HI 6.9314718055994529e-1
|
| 133 |
+
#define CUDART_LN2_LO 2.3190468138462996e-17
|
| 134 |
+
#define CUDART_LNT 2.3025850929940459e+0
|
| 135 |
+
#define CUDART_LNT_HI 2.3025850929940459e+0
|
| 136 |
+
#define CUDART_LNT_LO (-2.1707562233822494e-16)
|
| 137 |
+
#define CUDART_LNPI 1.1447298858494002e+0
|
| 138 |
+
#define CUDART_LN2_X_1024 7.0978271289338397e+2
|
| 139 |
+
#define CUDART_LN2_X_1025 7.1047586007394398e+2
|
| 140 |
+
#define CUDART_LN2_X_1075 7.4513321910194122e+2
|
| 141 |
+
#define CUDART_LG2_X_1024 3.0825471555991675e+2
|
| 142 |
+
#define CUDART_LG2_X_1075 3.2360724533877976e+2
|
| 143 |
+
#define CUDART_TWO_TO_23 8388608.0
|
| 144 |
+
#define CUDART_TWO_TO_52 4503599627370496.0
|
| 145 |
+
#define CUDART_TWO_TO_53 9007199254740992.0
|
| 146 |
+
#define CUDART_TWO_TO_54 18014398509481984.0
|
| 147 |
+
#define CUDART_TWO_TO_M54 5.5511151231257827e-17
|
| 148 |
+
#define CUDART_TWO_TO_M1022 2.22507385850720140e-308
|
| 149 |
+
#define CUDART_TRIG_PLOSS 2147483648.0
|
| 150 |
+
#define CUDART_DBL2INT_CVT 6755399441055744.0
|
| 151 |
+
|
| 152 |
+
#endif /* !__MATH_CONSTANTS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.h
ADDED
|
@@ -0,0 +1,1551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_20_INTRINSICS_H__)
|
| 51 |
+
#define __SM_20_INTRINSICS_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_20_INTRINSICS_DECL__ __device__
|
| 55 |
+
#else /* __CUDACC_RTC__ */
|
| 56 |
+
#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
/*******************************************************************************
|
| 62 |
+
* *
|
| 63 |
+
* *
|
| 64 |
+
* *
|
| 65 |
+
*******************************************************************************/
|
| 66 |
+
|
| 67 |
+
#include "cuda_runtime_api.h"
|
| 68 |
+
|
| 69 |
+
#ifndef __CUDA_ARCH__
|
| 70 |
+
#define __DEF_IF_HOST { }
|
| 71 |
+
#else /* !__CUDA_ARCH__ */
|
| 72 |
+
#define __DEF_IF_HOST ;
|
| 73 |
+
#endif /* __CUDA_ARCH__ */
|
| 74 |
+
|
| 75 |
+
#if defined(_WIN32)
|
| 76 |
+
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
|
| 77 |
+
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
|
| 78 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated))
|
| 79 |
+
#else
|
| 80 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
|
| 81 |
+
#endif
|
| 82 |
+
|
| 83 |
+
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
| 84 |
+
#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
|
| 85 |
+
"To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
|
| 86 |
+
#else
|
| 87 |
+
#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
|
| 88 |
+
#endif
|
| 89 |
+
|
| 90 |
+
extern "C"
|
| 91 |
+
{
|
| 92 |
+
extern __device__ __device_builtin__ void __threadfence_system(void);
|
| 93 |
+
/**
|
| 94 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 95 |
+
* \brief Divide two floating-point values in round-to-nearest-even mode.
|
| 96 |
+
*
|
| 97 |
+
* Divides two floating-point values \p x by \p y in round-to-nearest-even mode.
|
| 98 |
+
*
|
| 99 |
+
* \return Returns \p x / \p y.
|
| 100 |
+
*
|
| 101 |
+
* \note_accuracy_double
|
| 102 |
+
* \note_requires_fermi
|
| 103 |
+
*/
|
| 104 |
+
extern __device__ __device_builtin__ double __ddiv_rn(double x, double y);
|
| 105 |
+
/**
|
| 106 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 107 |
+
* \brief Divide two floating-point values in round-towards-zero mode.
|
| 108 |
+
*
|
| 109 |
+
* Divides two floating-point values \p x by \p y in round-towards-zero mode.
|
| 110 |
+
*
|
| 111 |
+
* \return Returns \p x / \p y.
|
| 112 |
+
*
|
| 113 |
+
* \note_accuracy_double
|
| 114 |
+
* \note_requires_fermi
|
| 115 |
+
*/
|
| 116 |
+
extern __device__ __device_builtin__ double __ddiv_rz(double x, double y);
|
| 117 |
+
/**
|
| 118 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 119 |
+
* \brief Divide two floating-point values in round-up mode.
|
| 120 |
+
*
|
| 121 |
+
* Divides two floating-point values \p x by \p y in round-up (to positive infinity) mode.
|
| 122 |
+
*
|
| 123 |
+
* \return Returns \p x / \p y.
|
| 124 |
+
*
|
| 125 |
+
* \note_accuracy_double
|
| 126 |
+
* \note_requires_fermi
|
| 127 |
+
*/
|
| 128 |
+
extern __device__ __device_builtin__ double __ddiv_ru(double x, double y);
|
| 129 |
+
/**
|
| 130 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 131 |
+
* \brief Divide two floating-point values in round-down mode.
|
| 132 |
+
*
|
| 133 |
+
* Divides two floating-point values \p x by \p y in round-down (to negative infinity) mode.
|
| 134 |
+
*
|
| 135 |
+
* \return Returns \p x / \p y.
|
| 136 |
+
*
|
| 137 |
+
* \note_accuracy_double
|
| 138 |
+
* \note_requires_fermi
|
| 139 |
+
*/
|
| 140 |
+
extern __device__ __device_builtin__ double __ddiv_rd(double x, double y);
|
| 141 |
+
/**
|
| 142 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 143 |
+
* \brief Compute
|
| 144 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 145 |
+
* \xmlonly
|
| 146 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 147 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 148 |
+
* <m:mfrac>
|
| 149 |
+
* <m:mn>1</m:mn>
|
| 150 |
+
* <m:mi>x</m:mi>
|
| 151 |
+
* </m:mfrac>
|
| 152 |
+
* </m:math>
|
| 153 |
+
* </d4p_MathML>
|
| 154 |
+
* \endxmlonly
|
| 155 |
+
* in round-to-nearest-even mode.
|
| 156 |
+
*
|
| 157 |
+
* Compute the reciprocal of \p x in round-to-nearest-even mode.
|
| 158 |
+
*
|
| 159 |
+
* \return Returns
|
| 160 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 161 |
+
* \xmlonly
|
| 162 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 163 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 164 |
+
* <m:mfrac>
|
| 165 |
+
* <m:mn>1</m:mn>
|
| 166 |
+
* <m:mi>x</m:mi>
|
| 167 |
+
* </m:mfrac>
|
| 168 |
+
* </m:math>
|
| 169 |
+
* </d4p_MathML>\endxmlonly.
|
| 170 |
+
*
|
| 171 |
+
* \note_accuracy_double
|
| 172 |
+
* \note_requires_fermi
|
| 173 |
+
*/
|
| 174 |
+
extern __device__ __device_builtin__ double __drcp_rn(double x);
|
| 175 |
+
/**
|
| 176 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 177 |
+
* \brief Compute
|
| 178 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 179 |
+
* \xmlonly
|
| 180 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 181 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 182 |
+
* <m:mfrac>
|
| 183 |
+
* <m:mn>1</m:mn>
|
| 184 |
+
* <m:mi>x</m:mi>
|
| 185 |
+
* </m:mfrac>
|
| 186 |
+
* </m:math>
|
| 187 |
+
* </d4p_MathML>
|
| 188 |
+
* \endxmlonly
|
| 189 |
+
* in round-towards-zero mode.
|
| 190 |
+
*
|
| 191 |
+
* Compute the reciprocal of \p x in round-towards-zero mode.
|
| 192 |
+
*
|
| 193 |
+
* \return Returns
|
| 194 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 195 |
+
* \xmlonly
|
| 196 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 197 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 198 |
+
* <m:mfrac>
|
| 199 |
+
* <m:mn>1</m:mn>
|
| 200 |
+
* <m:mi>x</m:mi>
|
| 201 |
+
* </m:mfrac>
|
| 202 |
+
* </m:math>
|
| 203 |
+
* </d4p_MathML>\endxmlonly.
|
| 204 |
+
*
|
| 205 |
+
* \note_accuracy_double
|
| 206 |
+
* \note_requires_fermi
|
| 207 |
+
*/
|
| 208 |
+
extern __device__ __device_builtin__ double __drcp_rz(double x);
|
| 209 |
+
/**
|
| 210 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 211 |
+
* \brief Compute
|
| 212 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 213 |
+
* \xmlonly
|
| 214 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 215 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 216 |
+
* <m:mfrac>
|
| 217 |
+
* <m:mn>1</m:mn>
|
| 218 |
+
* <m:mi>x</m:mi>
|
| 219 |
+
* </m:mfrac>
|
| 220 |
+
* </m:math>
|
| 221 |
+
* </d4p_MathML>
|
| 222 |
+
* \endxmlonly
|
| 223 |
+
* in round-up mode.
|
| 224 |
+
*
|
| 225 |
+
* Compute the reciprocal of \p x in round-up (to positive infinity) mode.
|
| 226 |
+
*
|
| 227 |
+
* \return Returns
|
| 228 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 229 |
+
* \xmlonly
|
| 230 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 231 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 232 |
+
* <m:mfrac>
|
| 233 |
+
* <m:mn>1</m:mn>
|
| 234 |
+
* <m:mi>x</m:mi>
|
| 235 |
+
* </m:mfrac>
|
| 236 |
+
* </m:math>
|
| 237 |
+
* </d4p_MathML>\endxmlonly.
|
| 238 |
+
*
|
| 239 |
+
* \note_accuracy_double
|
| 240 |
+
* \note_requires_fermi
|
| 241 |
+
*/
|
| 242 |
+
extern __device__ __device_builtin__ double __drcp_ru(double x);
|
| 243 |
+
/**
|
| 244 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 245 |
+
* \brief Compute
|
| 246 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 247 |
+
* \xmlonly
|
| 248 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 249 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 250 |
+
* <m:mfrac>
|
| 251 |
+
* <m:mn>1</m:mn>
|
| 252 |
+
* <m:mi>x</m:mi>
|
| 253 |
+
* </m:mfrac>
|
| 254 |
+
* </m:math>
|
| 255 |
+
* </d4p_MathML>
|
| 256 |
+
* \endxmlonly
|
| 257 |
+
* in round-down mode.
|
| 258 |
+
*
|
| 259 |
+
* Compute the reciprocal of \p x in round-down (to negative infinity) mode.
|
| 260 |
+
*
|
| 261 |
+
* \return Returns
|
| 262 |
+
* \latexonly $\frac{1}{x}$ \endlatexonly
|
| 263 |
+
* \xmlonly
|
| 264 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 265 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 266 |
+
* <m:mfrac>
|
| 267 |
+
* <m:mn>1</m:mn>
|
| 268 |
+
* <m:mi>x</m:mi>
|
| 269 |
+
* </m:mfrac>
|
| 270 |
+
* </m:math>
|
| 271 |
+
* </d4p_MathML>\endxmlonly.
|
| 272 |
+
*
|
| 273 |
+
* \note_accuracy_double
|
| 274 |
+
* \note_requires_fermi
|
| 275 |
+
*/
|
| 276 |
+
extern __device__ __device_builtin__ double __drcp_rd(double x);
|
| 277 |
+
/**
|
| 278 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 279 |
+
* \brief Compute
|
| 280 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 281 |
+
* \xmlonly
|
| 282 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 283 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 284 |
+
* <m:msqrt>
|
| 285 |
+
* <m:mi>x</m:mi>
|
| 286 |
+
* </m:msqrt>
|
| 287 |
+
* </m:math>
|
| 288 |
+
* </d4p_MathML>
|
| 289 |
+
* \endxmlonly
|
| 290 |
+
* in round-to-nearest-even mode.
|
| 291 |
+
*
|
| 292 |
+
* Compute the square root of \p x in round-to-nearest-even mode.
|
| 293 |
+
*
|
| 294 |
+
* \return Returns
|
| 295 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 296 |
+
* \xmlonly
|
| 297 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 298 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 299 |
+
* <m:msqrt>
|
| 300 |
+
* <m:mi>x</m:mi>
|
| 301 |
+
* </m:msqrt>
|
| 302 |
+
* </m:math>
|
| 303 |
+
* </d4p_MathML>\endxmlonly.
|
| 304 |
+
*
|
| 305 |
+
* \note_accuracy_double
|
| 306 |
+
* \note_requires_fermi
|
| 307 |
+
*/
|
| 308 |
+
extern __device__ __device_builtin__ double __dsqrt_rn(double x);
|
| 309 |
+
/**
|
| 310 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 311 |
+
* \brief Compute
|
| 312 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 313 |
+
* \xmlonly
|
| 314 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 315 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 316 |
+
* <m:msqrt>
|
| 317 |
+
* <m:mi>x</m:mi>
|
| 318 |
+
* </m:msqrt>
|
| 319 |
+
* </m:math>
|
| 320 |
+
* </d4p_MathML>
|
| 321 |
+
* \endxmlonly
|
| 322 |
+
* in round-towards-zero mode.
|
| 323 |
+
*
|
| 324 |
+
* Compute the square root of \p x in round-towards-zero mode.
|
| 325 |
+
*
|
| 326 |
+
* \return Returns
|
| 327 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 328 |
+
* \xmlonly
|
| 329 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 330 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 331 |
+
* <m:msqrt>
|
| 332 |
+
* <m:mi>x</m:mi>
|
| 333 |
+
* </m:msqrt>
|
| 334 |
+
* </m:math>
|
| 335 |
+
* </d4p_MathML>\endxmlonly.
|
| 336 |
+
*
|
| 337 |
+
* \note_accuracy_double
|
| 338 |
+
* \note_requires_fermi
|
| 339 |
+
*/
|
| 340 |
+
extern __device__ __device_builtin__ double __dsqrt_rz(double x);
|
| 341 |
+
/**
|
| 342 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 343 |
+
* \brief Compute
|
| 344 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 345 |
+
* \xmlonly
|
| 346 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 347 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 348 |
+
* <m:msqrt>
|
| 349 |
+
* <m:mi>x</m:mi>
|
| 350 |
+
* </m:msqrt>
|
| 351 |
+
* </m:math>
|
| 352 |
+
* </d4p_MathML>
|
| 353 |
+
* \endxmlonly
|
| 354 |
+
* in round-up mode.
|
| 355 |
+
*
|
| 356 |
+
* Compute the square root of \p x in round-up (to positive infinity) mode.
|
| 357 |
+
*
|
| 358 |
+
* \return Returns
|
| 359 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 360 |
+
* \xmlonly
|
| 361 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 362 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 363 |
+
* <m:msqrt>
|
| 364 |
+
* <m:mi>x</m:mi>
|
| 365 |
+
* </m:msqrt>
|
| 366 |
+
* </m:math>
|
| 367 |
+
* </d4p_MathML>\endxmlonly.
|
| 368 |
+
*
|
| 369 |
+
* \note_accuracy_double
|
| 370 |
+
* \note_requires_fermi
|
| 371 |
+
*/
|
| 372 |
+
extern __device__ __device_builtin__ double __dsqrt_ru(double x);
|
| 373 |
+
/**
|
| 374 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 375 |
+
* \brief Compute
|
| 376 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 377 |
+
* \xmlonly
|
| 378 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 379 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 380 |
+
* <m:msqrt>
|
| 381 |
+
* <m:mi>x</m:mi>
|
| 382 |
+
* </m:msqrt>
|
| 383 |
+
* </m:math>
|
| 384 |
+
* </d4p_MathML>
|
| 385 |
+
* \endxmlonly
|
| 386 |
+
* in round-down mode.
|
| 387 |
+
*
|
| 388 |
+
* Compute the square root of \p x in round-down (to negative infinity) mode.
|
| 389 |
+
*
|
| 390 |
+
* \return Returns
|
| 391 |
+
* \latexonly $\sqrt{x}$ \endlatexonly
|
| 392 |
+
* \xmlonly
|
| 393 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 394 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 395 |
+
* <m:msqrt>
|
| 396 |
+
* <m:mi>x</m:mi>
|
| 397 |
+
* </m:msqrt>
|
| 398 |
+
* </m:math>
|
| 399 |
+
* </d4p_MathML>\endxmlonly.
|
| 400 |
+
*
|
| 401 |
+
* \note_accuracy_double
|
| 402 |
+
* \note_requires_fermi
|
| 403 |
+
*/
|
| 404 |
+
extern __device__ __device_builtin__ double __dsqrt_rd(double x);
|
| 405 |
+
extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int __ballot(int);
|
| 406 |
+
extern __device__ __device_builtin__ int __syncthreads_count(int);
|
| 407 |
+
extern __device__ __device_builtin__ int __syncthreads_and(int);
|
| 408 |
+
extern __device__ __device_builtin__ int __syncthreads_or(int);
|
| 409 |
+
extern __device__ __device_builtin__ long long int clock64(void);
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
/**
|
| 413 |
+
* \ingroup CUDA_MATH_INTRINSIC_SINGLE
|
| 414 |
+
* \brief Compute fused multiply-add operation in round-to-nearest-even mode, ignore \p -ftz=true compiler flag
|
| 415 |
+
*
|
| 416 |
+
* Behavior is the same as ::__fmaf_rn(\p x, \p y, \p z), the difference is in
|
| 417 |
+
* handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
|
| 418 |
+
*/
|
| 419 |
+
extern __device__ __device_builtin__ float __fmaf_ieee_rn(float x, float y, float z);
|
| 420 |
+
|
| 421 |
+
/**
|
| 422 |
+
* \ingroup CUDA_MATH_INTRINSIC_SINGLE
|
| 423 |
+
* \brief Compute fused multiply-add operation in round-down mode, ignore \p -ftz=true compiler flag
|
| 424 |
+
*
|
| 425 |
+
* Behavior is the same as ::__fmaf_rd(\p x, \p y, \p z), the difference is in
|
| 426 |
+
* handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
|
| 427 |
+
*/
|
| 428 |
+
extern __device__ __device_builtin__ float __fmaf_ieee_rd(float x, float y, float z);
|
| 429 |
+
|
| 430 |
+
/**
|
| 431 |
+
* \ingroup CUDA_MATH_INTRINSIC_SINGLE
|
| 432 |
+
* \brief Compute fused multiply-add operation in round-up mode, ignore \p -ftz=true compiler flag
|
| 433 |
+
*
|
| 434 |
+
* Behavior is the same as ::__fmaf_ru(\p x, \p y, \p z), the difference is in
|
| 435 |
+
* handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
|
| 436 |
+
*/
|
| 437 |
+
extern __device__ __device_builtin__ float __fmaf_ieee_ru(float x, float y, float z);
|
| 438 |
+
|
| 439 |
+
/**
|
| 440 |
+
* \ingroup CUDA_MATH_INTRINSIC_SINGLE
|
| 441 |
+
* \brief Compute fused multiply-add operation in round-towards-zero mode, ignore \p -ftz=true compiler flag
|
| 442 |
+
*
|
| 443 |
+
* Behavior is the same as ::__fmaf_rz(\p x, \p y, \p z), the difference is in
|
| 444 |
+
* handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
|
| 445 |
+
*/
|
| 446 |
+
extern __device__ __device_builtin__ float __fmaf_ieee_rz(float x, float y, float z);
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
// SM_13 intrinsics
|
| 450 |
+
|
| 451 |
+
/**
|
| 452 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 453 |
+
* \brief Reinterpret bits in a double as a 64-bit signed integer.
|
| 454 |
+
*
|
| 455 |
+
* Reinterpret the bits in the double-precision floating-point value \p x
|
| 456 |
+
* as a signed 64-bit integer.
|
| 457 |
+
* \return Returns reinterpreted value.
|
| 458 |
+
*/
|
| 459 |
+
extern __device__ __device_builtin__ long long int __double_as_longlong(double x);
|
| 460 |
+
/**
|
| 461 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 462 |
+
* \brief Reinterpret bits in a 64-bit signed integer as a double.
|
| 463 |
+
*
|
| 464 |
+
* Reinterpret the bits in the 64-bit signed integer value \p x as
|
| 465 |
+
* a double-precision floating-point value.
|
| 466 |
+
* \return Returns reinterpreted value.
|
| 467 |
+
*/
|
| 468 |
+
extern __device__ __device_builtin__ double __longlong_as_double(long long int x);
|
| 469 |
+
/**
|
| 470 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 471 |
+
* \brief Compute
|
| 472 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 473 |
+
* \xmlonly
|
| 474 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 475 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 476 |
+
* <m:mi>x</m:mi>
|
| 477 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 478 |
+
* <m:mi>y</m:mi>
|
| 479 |
+
* <m:mo>+</m:mo>
|
| 480 |
+
* <m:mi>z</m:mi>
|
| 481 |
+
* </m:math>
|
| 482 |
+
* </d4p_MathML>
|
| 483 |
+
* \endxmlonly
|
| 484 |
+
* as a single operation in round-to-nearest-even mode.
|
| 485 |
+
*
|
| 486 |
+
* Computes the value of
|
| 487 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 488 |
+
* \xmlonly
|
| 489 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 490 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 491 |
+
* <m:mi>x</m:mi>
|
| 492 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 493 |
+
* <m:mi>y</m:mi>
|
| 494 |
+
* <m:mo>+</m:mo>
|
| 495 |
+
* <m:mi>z</m:mi>
|
| 496 |
+
* </m:math>
|
| 497 |
+
* </d4p_MathML>
|
| 498 |
+
* \endxmlonly
|
| 499 |
+
* as a single ternary operation, rounding the
|
| 500 |
+
* result once in round-to-nearest-even mode.
|
| 501 |
+
*
|
| 502 |
+
* \return Returns the rounded value of
|
| 503 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 504 |
+
* \xmlonly
|
| 505 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 506 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 507 |
+
* <m:mi>x</m:mi>
|
| 508 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 509 |
+
* <m:mi>y</m:mi>
|
| 510 |
+
* <m:mo>+</m:mo>
|
| 511 |
+
* <m:mi>z</m:mi>
|
| 512 |
+
* </m:math>
|
| 513 |
+
* </d4p_MathML>
|
| 514 |
+
* \endxmlonly
|
| 515 |
+
* as a single operation.
|
| 516 |
+
* - fmaf(
|
| 517 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 518 |
+
* \xmlonly
|
| 519 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 520 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 521 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 522 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 523 |
+
* </m:math>
|
| 524 |
+
* </d4p_MathML>
|
| 525 |
+
* \endxmlonly
|
| 526 |
+
* ,
|
| 527 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 528 |
+
* \xmlonly
|
| 529 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 530 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 531 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 532 |
+
* <m:mn>0</m:mn>
|
| 533 |
+
* </m:math>
|
| 534 |
+
* </d4p_MathML>
|
| 535 |
+
* \endxmlonly
|
| 536 |
+
* , \p z) returns NaN.
|
| 537 |
+
* - fmaf(
|
| 538 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 539 |
+
* \xmlonly
|
| 540 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 541 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 542 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 543 |
+
* <m:mn>0</m:mn>
|
| 544 |
+
* </m:math>
|
| 545 |
+
* </d4p_MathML>
|
| 546 |
+
* \endxmlonly
|
| 547 |
+
* ,
|
| 548 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 549 |
+
* \xmlonly
|
| 550 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 551 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 552 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 553 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 554 |
+
* </m:math>
|
| 555 |
+
* </d4p_MathML>
|
| 556 |
+
* \endxmlonly
|
| 557 |
+
* , \p z) returns NaN.
|
| 558 |
+
* - fmaf(\p x, \p y,
|
| 559 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 560 |
+
* \xmlonly
|
| 561 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 562 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 563 |
+
* <m:mo>-</m:mo>
|
| 564 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 565 |
+
* </m:math>
|
| 566 |
+
* </d4p_MathML>
|
| 567 |
+
* \endxmlonly
|
| 568 |
+
* ) returns NaN if
|
| 569 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 570 |
+
* \xmlonly
|
| 571 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 572 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 573 |
+
* <m:mi>x</m:mi>
|
| 574 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 575 |
+
* <m:mi>y</m:mi>
|
| 576 |
+
* </m:math>
|
| 577 |
+
* </d4p_MathML>
|
| 578 |
+
* \endxmlonly
|
| 579 |
+
* is an exact
|
| 580 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 581 |
+
* \xmlonly
|
| 582 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 583 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 584 |
+
* <m:mo>+</m:mo>
|
| 585 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 586 |
+
* </m:math>
|
| 587 |
+
* </d4p_MathML>
|
| 588 |
+
* \endxmlonly
|
| 589 |
+
* .
|
| 590 |
+
* - fmaf(\p x, \p y,
|
| 591 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 592 |
+
* \xmlonly
|
| 593 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 594 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 595 |
+
* <m:mo>+</m:mo>
|
| 596 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 597 |
+
* </m:math>
|
| 598 |
+
* </d4p_MathML>
|
| 599 |
+
* \endxmlonly
|
| 600 |
+
* ) returns NaN if
|
| 601 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 602 |
+
* \xmlonly
|
| 603 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 604 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 605 |
+
* <m:mi>x</m:mi>
|
| 606 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 607 |
+
* <m:mi>y</m:mi>
|
| 608 |
+
* </m:math>
|
| 609 |
+
* </d4p_MathML>
|
| 610 |
+
* \endxmlonly
|
| 611 |
+
* is an exact
|
| 612 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 613 |
+
* \xmlonly
|
| 614 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 615 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 616 |
+
* <m:mo>-</m:mo>
|
| 617 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 618 |
+
* </m:math>
|
| 619 |
+
* </d4p_MathML>
|
| 620 |
+
* \endxmlonly
|
| 621 |
+
* .
|
| 622 |
+
*
|
| 623 |
+
* \note_accuracy_double
|
| 624 |
+
*/
|
| 625 |
+
extern __device__ __device_builtin__ double __fma_rn(double x, double y, double z);
|
| 626 |
+
/**
|
| 627 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 628 |
+
* \brief Compute
|
| 629 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 630 |
+
* \xmlonly
|
| 631 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 632 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 633 |
+
* <m:mi>x</m:mi>
|
| 634 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 635 |
+
* <m:mi>y</m:mi>
|
| 636 |
+
* <m:mo>+</m:mo>
|
| 637 |
+
* <m:mi>z</m:mi>
|
| 638 |
+
* </m:math>
|
| 639 |
+
* </d4p_MathML>
|
| 640 |
+
* \endxmlonly
|
| 641 |
+
* as a single operation in round-towards-zero mode.
|
| 642 |
+
*
|
| 643 |
+
* Computes the value of
|
| 644 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 645 |
+
* \xmlonly
|
| 646 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 647 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 648 |
+
* <m:mi>x</m:mi>
|
| 649 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 650 |
+
* <m:mi>y</m:mi>
|
| 651 |
+
* <m:mo>+</m:mo>
|
| 652 |
+
* <m:mi>z</m:mi>
|
| 653 |
+
* </m:math>
|
| 654 |
+
* </d4p_MathML>
|
| 655 |
+
* \endxmlonly
|
| 656 |
+
* as a single ternary operation, rounding the
|
| 657 |
+
* result once in round-towards-zero mode.
|
| 658 |
+
*
|
| 659 |
+
* \return Returns the rounded value of
|
| 660 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 661 |
+
* \xmlonly
|
| 662 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 663 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 664 |
+
* <m:mi>x</m:mi>
|
| 665 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 666 |
+
* <m:mi>y</m:mi>
|
| 667 |
+
* <m:mo>+</m:mo>
|
| 668 |
+
* <m:mi>z</m:mi>
|
| 669 |
+
* </m:math>
|
| 670 |
+
* </d4p_MathML>
|
| 671 |
+
* \endxmlonly
|
| 672 |
+
* as a single operation.
|
| 673 |
+
* - fmaf(
|
| 674 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 675 |
+
* \xmlonly
|
| 676 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 677 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 678 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 679 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 680 |
+
* </m:math>
|
| 681 |
+
* </d4p_MathML>
|
| 682 |
+
* \endxmlonly
|
| 683 |
+
* ,
|
| 684 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 685 |
+
* \xmlonly
|
| 686 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 687 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 688 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 689 |
+
* <m:mn>0</m:mn>
|
| 690 |
+
* </m:math>
|
| 691 |
+
* </d4p_MathML>
|
| 692 |
+
* \endxmlonly
|
| 693 |
+
* , \p z) returns NaN.
|
| 694 |
+
* - fmaf(
|
| 695 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 696 |
+
* \xmlonly
|
| 697 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 698 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 699 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 700 |
+
* <m:mn>0</m:mn>
|
| 701 |
+
* </m:math>
|
| 702 |
+
* </d4p_MathML>
|
| 703 |
+
* \endxmlonly
|
| 704 |
+
* ,
|
| 705 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 706 |
+
* \xmlonly
|
| 707 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 708 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 709 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 710 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 711 |
+
* </m:math>
|
| 712 |
+
* </d4p_MathML>
|
| 713 |
+
* \endxmlonly
|
| 714 |
+
* , \p z) returns NaN.
|
| 715 |
+
* - fmaf(\p x, \p y,
|
| 716 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 717 |
+
* \xmlonly
|
| 718 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 719 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 720 |
+
* <m:mo>-</m:mo>
|
| 721 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 722 |
+
* </m:math>
|
| 723 |
+
* </d4p_MathML>
|
| 724 |
+
* \endxmlonly
|
| 725 |
+
* ) returns NaN if
|
| 726 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 727 |
+
* \xmlonly
|
| 728 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 729 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 730 |
+
* <m:mi>x</m:mi>
|
| 731 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 732 |
+
* <m:mi>y</m:mi>
|
| 733 |
+
* </m:math>
|
| 734 |
+
* </d4p_MathML>
|
| 735 |
+
* \endxmlonly
|
| 736 |
+
* is an exact
|
| 737 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 738 |
+
* \xmlonly
|
| 739 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 740 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 741 |
+
* <m:mo>+</m:mo>
|
| 742 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 743 |
+
* </m:math>
|
| 744 |
+
* </d4p_MathML>
|
| 745 |
+
* \endxmlonly
|
| 746 |
+
* .
|
| 747 |
+
 * - fma(\p x, \p y,
|
| 748 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 749 |
+
* \xmlonly
|
| 750 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 751 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 752 |
+
* <m:mo>+</m:mo>
|
| 753 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 754 |
+
* </m:math>
|
| 755 |
+
* </d4p_MathML>
|
| 756 |
+
* \endxmlonly
|
| 757 |
+
* ) returns NaN if
|
| 758 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 759 |
+
* \xmlonly
|
| 760 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 761 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 762 |
+
* <m:mi>x</m:mi>
|
| 763 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 764 |
+
* <m:mi>y</m:mi>
|
| 765 |
+
* </m:math>
|
| 766 |
+
* </d4p_MathML>
|
| 767 |
+
* \endxmlonly
|
| 768 |
+
* is an exact
|
| 769 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 770 |
+
* \xmlonly
|
| 771 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 772 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 773 |
+
* <m:mo>-</m:mo>
|
| 774 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 775 |
+
* </m:math>
|
| 776 |
+
* </d4p_MathML>
|
| 777 |
+
* \endxmlonly
|
| 778 |
+
* .
|
| 779 |
+
*
|
| 780 |
+
* \note_accuracy_double
|
| 781 |
+
*/
|
| 782 |
+
extern __device__ __device_builtin__ double __fma_rz(double x, double y, double z);
|
| 783 |
+
/**
|
| 784 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 785 |
+
* \brief Compute
|
| 786 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 787 |
+
* \xmlonly
|
| 788 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 789 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 790 |
+
* <m:mi>x</m:mi>
|
| 791 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 792 |
+
* <m:mi>y</m:mi>
|
| 793 |
+
* <m:mo>+</m:mo>
|
| 794 |
+
* <m:mi>z</m:mi>
|
| 795 |
+
* </m:math>
|
| 796 |
+
* </d4p_MathML>
|
| 797 |
+
* \endxmlonly
|
| 798 |
+
* as a single operation in round-up mode.
|
| 799 |
+
*
|
| 800 |
+
* Computes the value of
|
| 801 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 802 |
+
* \xmlonly
|
| 803 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 804 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 805 |
+
* <m:mi>x</m:mi>
|
| 806 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 807 |
+
* <m:mi>y</m:mi>
|
| 808 |
+
* <m:mo>+</m:mo>
|
| 809 |
+
* <m:mi>z</m:mi>
|
| 810 |
+
* </m:math>
|
| 811 |
+
* </d4p_MathML>
|
| 812 |
+
* \endxmlonly
|
| 813 |
+
* as a single ternary operation, rounding the
|
| 814 |
+
* result once in round-up (to positive infinity) mode.
|
| 815 |
+
*
|
| 816 |
+
* \return Returns the rounded value of
|
| 817 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 818 |
+
* \xmlonly
|
| 819 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 820 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 821 |
+
* <m:mi>x</m:mi>
|
| 822 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 823 |
+
* <m:mi>y</m:mi>
|
| 824 |
+
* <m:mo>+</m:mo>
|
| 825 |
+
* <m:mi>z</m:mi>
|
| 826 |
+
* </m:math>
|
| 827 |
+
* </d4p_MathML>
|
| 828 |
+
* \endxmlonly
|
| 829 |
+
* as a single operation.
|
| 830 |
+
 * - fma(
|
| 831 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 832 |
+
* \xmlonly
|
| 833 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 834 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 835 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 836 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 837 |
+
* </m:math>
|
| 838 |
+
* </d4p_MathML>
|
| 839 |
+
* \endxmlonly
|
| 840 |
+
* ,
|
| 841 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 842 |
+
* \xmlonly
|
| 843 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 844 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 845 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 846 |
+
* <m:mn>0</m:mn>
|
| 847 |
+
* </m:math>
|
| 848 |
+
* </d4p_MathML>
|
| 849 |
+
* \endxmlonly
|
| 850 |
+
* , \p z) returns NaN.
|
| 851 |
+
 * - fma(
|
| 852 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 853 |
+
* \xmlonly
|
| 854 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 855 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 856 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 857 |
+
* <m:mn>0</m:mn>
|
| 858 |
+
* </m:math>
|
| 859 |
+
* </d4p_MathML>
|
| 860 |
+
* \endxmlonly
|
| 861 |
+
* ,
|
| 862 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 863 |
+
* \xmlonly
|
| 864 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 865 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 866 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 867 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 868 |
+
* </m:math>
|
| 869 |
+
* </d4p_MathML>
|
| 870 |
+
* \endxmlonly
|
| 871 |
+
* , \p z) returns NaN.
|
| 872 |
+
 * - fma(\p x, \p y,
|
| 873 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 874 |
+
* \xmlonly
|
| 875 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 876 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 877 |
+
* <m:mo>-</m:mo>
|
| 878 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 879 |
+
* </m:math>
|
| 880 |
+
* </d4p_MathML>
|
| 881 |
+
* \endxmlonly
|
| 882 |
+
* ) returns NaN if
|
| 883 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 884 |
+
* \xmlonly
|
| 885 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 886 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 887 |
+
* <m:mi>x</m:mi>
|
| 888 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 889 |
+
* <m:mi>y</m:mi>
|
| 890 |
+
* </m:math>
|
| 891 |
+
* </d4p_MathML>
|
| 892 |
+
* \endxmlonly
|
| 893 |
+
* is an exact
|
| 894 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 895 |
+
* \xmlonly
|
| 896 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 897 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 898 |
+
* <m:mo>+</m:mo>
|
| 899 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 900 |
+
* </m:math>
|
| 901 |
+
* </d4p_MathML>
|
| 902 |
+
* \endxmlonly
|
| 903 |
+
* .
|
| 904 |
+
 * - fma(\p x, \p y,
|
| 905 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 906 |
+
* \xmlonly
|
| 907 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 908 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 909 |
+
* <m:mo>+</m:mo>
|
| 910 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 911 |
+
* </m:math>
|
| 912 |
+
* </d4p_MathML>
|
| 913 |
+
* \endxmlonly
|
| 914 |
+
* ) returns NaN if
|
| 915 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 916 |
+
* \xmlonly
|
| 917 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 918 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 919 |
+
* <m:mi>x</m:mi>
|
| 920 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 921 |
+
* <m:mi>y</m:mi>
|
| 922 |
+
* </m:math>
|
| 923 |
+
* </d4p_MathML>
|
| 924 |
+
* \endxmlonly
|
| 925 |
+
* is an exact
|
| 926 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 927 |
+
* \xmlonly
|
| 928 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 929 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 930 |
+
* <m:mo>-</m:mo>
|
| 931 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 932 |
+
* </m:math>
|
| 933 |
+
* </d4p_MathML>
|
| 934 |
+
* \endxmlonly
|
| 935 |
+
* .
|
| 936 |
+
*
|
| 937 |
+
* \note_accuracy_double
|
| 938 |
+
*/
|
| 939 |
+
extern __device__ __device_builtin__ double __fma_ru(double x, double y, double z);
|
| 940 |
+
/**
|
| 941 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 942 |
+
* \brief Compute
|
| 943 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 944 |
+
* \xmlonly
|
| 945 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 946 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 947 |
+
* <m:mi>x</m:mi>
|
| 948 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 949 |
+
* <m:mi>y</m:mi>
|
| 950 |
+
* <m:mo>+</m:mo>
|
| 951 |
+
* <m:mi>z</m:mi>
|
| 952 |
+
* </m:math>
|
| 953 |
+
* </d4p_MathML>
|
| 954 |
+
* \endxmlonly
|
| 955 |
+
* as a single operation in round-down mode.
|
| 956 |
+
*
|
| 957 |
+
* Computes the value of
|
| 958 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 959 |
+
* \xmlonly
|
| 960 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 961 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 962 |
+
* <m:mi>x</m:mi>
|
| 963 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 964 |
+
* <m:mi>y</m:mi>
|
| 965 |
+
* <m:mo>+</m:mo>
|
| 966 |
+
* <m:mi>z</m:mi>
|
| 967 |
+
* </m:math>
|
| 968 |
+
* </d4p_MathML>
|
| 969 |
+
* \endxmlonly
|
| 970 |
+
* as a single ternary operation, rounding the
|
| 971 |
+
* result once in round-down (to negative infinity) mode.
|
| 972 |
+
*
|
| 973 |
+
* \return Returns the rounded value of
|
| 974 |
+
* \latexonly $x \times y + z$ \endlatexonly
|
| 975 |
+
* \xmlonly
|
| 976 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 977 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 978 |
+
* <m:mi>x</m:mi>
|
| 979 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 980 |
+
* <m:mi>y</m:mi>
|
| 981 |
+
* <m:mo>+</m:mo>
|
| 982 |
+
* <m:mi>z</m:mi>
|
| 983 |
+
* </m:math>
|
| 984 |
+
* </d4p_MathML>
|
| 985 |
+
* \endxmlonly
|
| 986 |
+
* as a single operation.
|
| 987 |
+
 * - fma(
|
| 988 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 989 |
+
* \xmlonly
|
| 990 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 991 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 992 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 993 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 994 |
+
* </m:math>
|
| 995 |
+
* </d4p_MathML>
|
| 996 |
+
* \endxmlonly
|
| 997 |
+
* ,
|
| 998 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 999 |
+
* \xmlonly
|
| 1000 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1001 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1002 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 1003 |
+
* <m:mn>0</m:mn>
|
| 1004 |
+
* </m:math>
|
| 1005 |
+
* </d4p_MathML>
|
| 1006 |
+
* \endxmlonly
|
| 1007 |
+
* , \p z) returns NaN.
|
| 1008 |
+
 * - fma(
|
| 1009 |
+
* \latexonly $\pm 0$ \endlatexonly
|
| 1010 |
+
* \xmlonly
|
| 1011 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1012 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1013 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 1014 |
+
* <m:mn>0</m:mn>
|
| 1015 |
+
* </m:math>
|
| 1016 |
+
* </d4p_MathML>
|
| 1017 |
+
* \endxmlonly
|
| 1018 |
+
* ,
|
| 1019 |
+
* \latexonly $\pm \infty$ \endlatexonly
|
| 1020 |
+
* \xmlonly
|
| 1021 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1022 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1023 |
+
* <m:mo>±<!-- ± --></m:mo>
|
| 1024 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 1025 |
+
* </m:math>
|
| 1026 |
+
* </d4p_MathML>
|
| 1027 |
+
* \endxmlonly
|
| 1028 |
+
* , \p z) returns NaN.
|
| 1029 |
+
 * - fma(\p x, \p y,
|
| 1030 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 1031 |
+
* \xmlonly
|
| 1032 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1033 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1034 |
+
* <m:mo>-</m:mo>
|
| 1035 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 1036 |
+
* </m:math>
|
| 1037 |
+
* </d4p_MathML>
|
| 1038 |
+
* \endxmlonly
|
| 1039 |
+
* ) returns NaN if
|
| 1040 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 1041 |
+
* \xmlonly
|
| 1042 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1043 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1044 |
+
* <m:mi>x</m:mi>
|
| 1045 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 1046 |
+
* <m:mi>y</m:mi>
|
| 1047 |
+
* </m:math>
|
| 1048 |
+
* </d4p_MathML>
|
| 1049 |
+
* \endxmlonly
|
| 1050 |
+
* is an exact
|
| 1051 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 1052 |
+
* \xmlonly
|
| 1053 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1054 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1055 |
+
* <m:mo>+</m:mo>
|
| 1056 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 1057 |
+
* </m:math>
|
| 1058 |
+
* </d4p_MathML>
|
| 1059 |
+
* \endxmlonly
|
| 1060 |
+
* .
|
| 1061 |
+
 * - fma(\p x, \p y,
|
| 1062 |
+
* \latexonly $+\infty$ \endlatexonly
|
| 1063 |
+
* \xmlonly
|
| 1064 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1065 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1066 |
+
* <m:mo>+</m:mo>
|
| 1067 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 1068 |
+
* </m:math>
|
| 1069 |
+
* </d4p_MathML>
|
| 1070 |
+
* \endxmlonly
|
| 1071 |
+
* ) returns NaN if
|
| 1072 |
+
* \latexonly $x \times y$ \endlatexonly
|
| 1073 |
+
* \xmlonly
|
| 1074 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1075 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1076 |
+
* <m:mi>x</m:mi>
|
| 1077 |
+
* <m:mo>×<!-- &Multiply; --></m:mo>
|
| 1078 |
+
* <m:mi>y</m:mi>
|
| 1079 |
+
* </m:math>
|
| 1080 |
+
* </d4p_MathML>
|
| 1081 |
+
* \endxmlonly
|
| 1082 |
+
* is an exact
|
| 1083 |
+
* \latexonly $-\infty$ \endlatexonly
|
| 1084 |
+
* \xmlonly
|
| 1085 |
+
* <d4p_MathML outputclass="xmlonly">
|
| 1086 |
+
* <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
|
| 1087 |
+
* <m:mo>-</m:mo>
|
| 1088 |
+
* <m:mn>∞<!-- &Infinity; --></m:mn>
|
| 1089 |
+
* </m:math>
|
| 1090 |
+
* </d4p_MathML>
|
| 1091 |
+
* \endxmlonly
|
| 1092 |
+
* .
|
| 1093 |
+
*
|
| 1094 |
+
* \note_accuracy_double
|
| 1095 |
+
*/
|
| 1096 |
+
extern __device__ __device_builtin__ double __fma_rd(double x, double y, double z);
|
| 1097 |
+
/**
|
| 1098 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1099 |
+
* \brief Add two floating-point values in round-to-nearest-even mode.
|
| 1100 |
+
*
|
| 1101 |
+
* Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
|
| 1102 |
+
*
|
| 1103 |
+
* \return Returns \p x + \p y.
|
| 1104 |
+
*
|
| 1105 |
+
* \note_accuracy_double
|
| 1106 |
+
* \note_nofma
|
| 1107 |
+
*/
|
| 1108 |
+
extern __device__ __device_builtin__ double __dadd_rn(double x, double y);
|
| 1109 |
+
/**
|
| 1110 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1111 |
+
* \brief Add two floating-point values in round-towards-zero mode.
|
| 1112 |
+
*
|
| 1113 |
+
* Adds two floating-point values \p x and \p y in round-towards-zero mode.
|
| 1114 |
+
*
|
| 1115 |
+
* \return Returns \p x + \p y.
|
| 1116 |
+
*
|
| 1117 |
+
* \note_accuracy_double
|
| 1118 |
+
* \note_nofma
|
| 1119 |
+
*/
|
| 1120 |
+
extern __device__ __device_builtin__ double __dadd_rz(double x, double y);
|
| 1121 |
+
/**
|
| 1122 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1123 |
+
* \brief Add two floating-point values in round-up mode.
|
| 1124 |
+
*
|
| 1125 |
+
* Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
|
| 1126 |
+
*
|
| 1127 |
+
* \return Returns \p x + \p y.
|
| 1128 |
+
*
|
| 1129 |
+
* \note_accuracy_double
|
| 1130 |
+
* \note_nofma
|
| 1131 |
+
*/
|
| 1132 |
+
extern __device__ __device_builtin__ double __dadd_ru(double x, double y);
|
| 1133 |
+
/**
|
| 1134 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1135 |
+
* \brief Add two floating-point values in round-down mode.
|
| 1136 |
+
*
|
| 1137 |
+
* Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
|
| 1138 |
+
*
|
| 1139 |
+
* \return Returns \p x + \p y.
|
| 1140 |
+
*
|
| 1141 |
+
* \note_accuracy_double
|
| 1142 |
+
* \note_nofma
|
| 1143 |
+
*/
|
| 1144 |
+
extern __device__ __device_builtin__ double __dadd_rd(double x, double y);
|
| 1145 |
+
/**
|
| 1146 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1147 |
+
* \brief Subtract two floating-point values in round-to-nearest-even mode.
|
| 1148 |
+
*
|
| 1149 |
+
* Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
|
| 1150 |
+
*
|
| 1151 |
+
* \return Returns \p x - \p y.
|
| 1152 |
+
*
|
| 1153 |
+
* \note_accuracy_double
|
| 1154 |
+
* \note_nofma
|
| 1155 |
+
*/
|
| 1156 |
+
extern __device__ __device_builtin__ double __dsub_rn(double x, double y);
|
| 1157 |
+
/**
|
| 1158 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1159 |
+
* \brief Subtract two floating-point values in round-towards-zero mode.
|
| 1160 |
+
*
|
| 1161 |
+
* Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
|
| 1162 |
+
*
|
| 1163 |
+
* \return Returns \p x - \p y.
|
| 1164 |
+
*
|
| 1165 |
+
* \note_accuracy_double
|
| 1166 |
+
* \note_nofma
|
| 1167 |
+
*/
|
| 1168 |
+
extern __device__ __device_builtin__ double __dsub_rz(double x, double y);
|
| 1169 |
+
/**
|
| 1170 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1171 |
+
* \brief Subtract two floating-point values in round-up mode.
|
| 1172 |
+
*
|
| 1173 |
+
* Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
|
| 1174 |
+
*
|
| 1175 |
+
* \return Returns \p x - \p y.
|
| 1176 |
+
*
|
| 1177 |
+
* \note_accuracy_double
|
| 1178 |
+
* \note_nofma
|
| 1179 |
+
*/
|
| 1180 |
+
extern __device__ __device_builtin__ double __dsub_ru(double x, double y);
|
| 1181 |
+
/**
|
| 1182 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1183 |
+
* \brief Subtract two floating-point values in round-down mode.
|
| 1184 |
+
*
|
| 1185 |
+
* Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
|
| 1186 |
+
*
|
| 1187 |
+
* \return Returns \p x - \p y.
|
| 1188 |
+
*
|
| 1189 |
+
* \note_accuracy_double
|
| 1190 |
+
* \note_nofma
|
| 1191 |
+
*/
|
| 1192 |
+
extern __device__ __device_builtin__ double __dsub_rd(double x, double y);
|
| 1193 |
+
/**
|
| 1194 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1195 |
+
* \brief Multiply two floating-point values in round-to-nearest-even mode.
|
| 1196 |
+
*
|
| 1197 |
+
* Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
|
| 1198 |
+
*
|
| 1199 |
+
* \return Returns \p x * \p y.
|
| 1200 |
+
*
|
| 1201 |
+
* \note_accuracy_double
|
| 1202 |
+
* \note_nofma
|
| 1203 |
+
*/
|
| 1204 |
+
extern __device__ __device_builtin__ double __dmul_rn(double x, double y);
|
| 1205 |
+
/**
|
| 1206 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1207 |
+
* \brief Multiply two floating-point values in round-towards-zero mode.
|
| 1208 |
+
*
|
| 1209 |
+
* Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
|
| 1210 |
+
*
|
| 1211 |
+
* \return Returns \p x * \p y.
|
| 1212 |
+
*
|
| 1213 |
+
* \note_accuracy_double
|
| 1214 |
+
* \note_nofma
|
| 1215 |
+
*/
|
| 1216 |
+
extern __device__ __device_builtin__ double __dmul_rz(double x, double y);
|
| 1217 |
+
/**
|
| 1218 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1219 |
+
* \brief Multiply two floating-point values in round-up mode.
|
| 1220 |
+
*
|
| 1221 |
+
* Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
|
| 1222 |
+
*
|
| 1223 |
+
* \return Returns \p x * \p y.
|
| 1224 |
+
*
|
| 1225 |
+
* \note_accuracy_double
|
| 1226 |
+
* \note_nofma
|
| 1227 |
+
*/
|
| 1228 |
+
extern __device__ __device_builtin__ double __dmul_ru(double x, double y);
|
| 1229 |
+
/**
|
| 1230 |
+
* \ingroup CUDA_MATH_INTRINSIC_DOUBLE
|
| 1231 |
+
* \brief Multiply two floating-point values in round-down mode.
|
| 1232 |
+
*
|
| 1233 |
+
* Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
|
| 1234 |
+
*
|
| 1235 |
+
* \return Returns \p x * \p y.
|
| 1236 |
+
*
|
| 1237 |
+
* \note_accuracy_double
|
| 1238 |
+
* \note_nofma
|
| 1239 |
+
*/
|
| 1240 |
+
extern __device__ __device_builtin__ double __dmul_rd(double x, double y);
|
| 1241 |
+
/**
|
| 1242 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1243 |
+
* \brief Convert a double to a float in round-to-nearest-even mode.
|
| 1244 |
+
*
|
| 1245 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 1246 |
+
* floating-point value in round-to-nearest-even mode.
|
| 1247 |
+
* \return Returns converted value.
|
| 1248 |
+
*/
|
| 1249 |
+
extern __device__ __device_builtin__ float __double2float_rn(double x);
|
| 1250 |
+
/**
|
| 1251 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1252 |
+
* \brief Convert a double to a float in round-towards-zero mode.
|
| 1253 |
+
*
|
| 1254 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 1255 |
+
* floating-point value in round-towards-zero mode.
|
| 1256 |
+
* \return Returns converted value.
|
| 1257 |
+
*/
|
| 1258 |
+
extern __device__ __device_builtin__ float __double2float_rz(double x);
|
| 1259 |
+
/**
|
| 1260 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1261 |
+
* \brief Convert a double to a float in round-up mode.
|
| 1262 |
+
*
|
| 1263 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 1264 |
+
* floating-point value in round-up (to positive infinity) mode.
|
| 1265 |
+
* \return Returns converted value.
|
| 1266 |
+
*/
|
| 1267 |
+
extern __device__ __device_builtin__ float __double2float_ru(double x);
|
| 1268 |
+
/**
|
| 1269 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1270 |
+
* \brief Convert a double to a float in round-down mode.
|
| 1271 |
+
*
|
| 1272 |
+
* Convert the double-precision floating-point value \p x to a single-precision
|
| 1273 |
+
* floating-point value in round-down (to negative infinity) mode.
|
| 1274 |
+
* \return Returns converted value.
|
| 1275 |
+
*/
|
| 1276 |
+
extern __device__ __device_builtin__ float __double2float_rd(double x);
|
| 1277 |
+
/**
|
| 1278 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1279 |
+
* \brief Convert a double to a signed int in round-to-nearest-even mode.
|
| 1280 |
+
*
|
| 1281 |
+
* Convert the double-precision floating-point value \p x to a
|
| 1282 |
+
* signed integer value in round-to-nearest-even mode.
|
| 1283 |
+
* \return Returns converted value.
|
| 1284 |
+
*/
|
| 1285 |
+
extern __device__ __device_builtin__ int __double2int_rn(double x);
|
| 1286 |
+
/**
|
| 1287 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1288 |
+
* \brief Convert a double to a signed int in round-up mode.
|
| 1289 |
+
*
|
| 1290 |
+
* Convert the double-precision floating-point value \p x to a
|
| 1291 |
+
* signed integer value in round-up (to positive infinity) mode.
|
| 1292 |
+
* \return Returns converted value.
|
| 1293 |
+
*/
|
| 1294 |
+
extern __device__ __device_builtin__ int __double2int_ru(double x);
|
| 1295 |
+
/**
|
| 1296 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1297 |
+
* \brief Convert a double to a signed int in round-down mode.
|
| 1298 |
+
*
|
| 1299 |
+
* Convert the double-precision floating-point value \p x to a
|
| 1300 |
+
* signed integer value in round-down (to negative infinity) mode.
|
| 1301 |
+
* \return Returns converted value.
|
| 1302 |
+
*/
|
| 1303 |
+
extern __device__ __device_builtin__ int __double2int_rd(double x);
|
| 1304 |
+
/**
|
| 1305 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1306 |
+
* \brief Convert a double to an unsigned int in round-to-nearest-even mode.
|
| 1307 |
+
*
|
| 1308 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1309 |
+
* unsigned integer value in round-to-nearest-even mode.
|
| 1310 |
+
* \return Returns converted value.
|
| 1311 |
+
*/
|
| 1312 |
+
extern __device__ __device_builtin__ unsigned int __double2uint_rn(double x);
|
| 1313 |
+
/**
|
| 1314 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1315 |
+
* \brief Convert a double to an unsigned int in round-up mode.
|
| 1316 |
+
*
|
| 1317 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1318 |
+
* unsigned integer value in round-up (to positive infinity) mode.
|
| 1319 |
+
* \return Returns converted value.
|
| 1320 |
+
*/
|
| 1321 |
+
extern __device__ __device_builtin__ unsigned int __double2uint_ru(double x);
|
| 1322 |
+
/**
|
| 1323 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1324 |
+
* \brief Convert a double to an unsigned int in round-down mode.
|
| 1325 |
+
*
|
| 1326 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1327 |
+
* unsigned integer value in round-down (to negative infinity) mode.
|
| 1328 |
+
* \return Returns converted value.
|
| 1329 |
+
*/
|
| 1330 |
+
extern __device__ __device_builtin__ unsigned int __double2uint_rd(double x);
|
| 1331 |
+
/**
|
| 1332 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1333 |
+
* \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
|
| 1334 |
+
*
|
| 1335 |
+
* Convert the double-precision floating-point value \p x to a
|
| 1336 |
+
* signed 64-bit integer value in round-to-nearest-even mode.
|
| 1337 |
+
* \return Returns converted value.
|
| 1338 |
+
*/
|
| 1339 |
+
extern __device__ __device_builtin__ long long int __double2ll_rn(double x);
|
| 1340 |
+
/**
|
| 1341 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1342 |
+
* \brief Convert a double to a signed 64-bit int in round-up mode.
|
| 1343 |
+
*
|
| 1344 |
+
* Convert the double-precision floating-point value \p x to a
|
| 1345 |
+
* signed 64-bit integer value in round-up (to positive infinity) mode.
|
| 1346 |
+
* \return Returns converted value.
|
| 1347 |
+
*/
|
| 1348 |
+
extern __device__ __device_builtin__ long long int __double2ll_ru(double x);
|
| 1349 |
+
/**
|
| 1350 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1351 |
+
* \brief Convert a double to a signed 64-bit int in round-down mode.
|
| 1352 |
+
*
|
| 1353 |
+
* Convert the double-precision floating-point value \p x to a
|
| 1354 |
+
* signed 64-bit integer value in round-down (to negative infinity) mode.
|
| 1355 |
+
* \return Returns converted value.
|
| 1356 |
+
*/
|
| 1357 |
+
extern __device__ __device_builtin__ long long int __double2ll_rd(double x);
|
| 1358 |
+
/**
|
| 1359 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1360 |
+
* \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
|
| 1361 |
+
*
|
| 1362 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1363 |
+
* unsigned 64-bit integer value in round-to-nearest-even mode.
|
| 1364 |
+
* \return Returns converted value.
|
| 1365 |
+
*/
|
| 1366 |
+
extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
|
| 1367 |
+
/**
|
| 1368 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1369 |
+
* \brief Convert a double to an unsigned 64-bit int in round-up mode.
|
| 1370 |
+
*
|
| 1371 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1372 |
+
* unsigned 64-bit integer value in round-up (to positive infinity) mode.
|
| 1373 |
+
* \return Returns converted value.
|
| 1374 |
+
*/
|
| 1375 |
+
extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
|
| 1376 |
+
/**
|
| 1377 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1378 |
+
* \brief Convert a double to an unsigned 64-bit int in round-down mode.
|
| 1379 |
+
*
|
| 1380 |
+
* Convert the double-precision floating-point value \p x to an
|
| 1381 |
+
* unsigned 64-bit integer value in round-down (to negative infinity) mode.
|
| 1382 |
+
* \return Returns converted value.
|
| 1383 |
+
*/
|
| 1384 |
+
extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
|
| 1385 |
+
/**
|
| 1386 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1387 |
+
* \brief Convert a signed int to a double.
|
| 1388 |
+
*
|
| 1389 |
+
* Convert the signed integer value \p x to a double-precision floating-point value.
|
| 1390 |
+
* \return Returns converted value.
|
| 1391 |
+
*/
|
| 1392 |
+
extern __device__ __device_builtin__ double __int2double_rn(int x);
|
| 1393 |
+
/**
|
| 1394 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1395 |
+
* \brief Convert an unsigned int to a double.
|
| 1396 |
+
*
|
| 1397 |
+
* Convert the unsigned integer value \p x to a double-precision floating-point value.
|
| 1398 |
+
* \return Returns converted value.
|
| 1399 |
+
*/
|
| 1400 |
+
extern __device__ __device_builtin__ double __uint2double_rn(unsigned int x);
|
| 1401 |
+
/**
|
| 1402 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1403 |
+
* \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
|
| 1404 |
+
*
|
| 1405 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1406 |
+
* value in round-to-nearest-even mode.
|
| 1407 |
+
* \return Returns converted value.
|
| 1408 |
+
*/
|
| 1409 |
+
extern __device__ __device_builtin__ double __ll2double_rn(long long int x);
|
| 1410 |
+
/**
|
| 1411 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1412 |
+
* \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
|
| 1413 |
+
*
|
| 1414 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1415 |
+
* value in round-towards-zero mode.
|
| 1416 |
+
* \return Returns converted value.
|
| 1417 |
+
*/
|
| 1418 |
+
extern __device__ __device_builtin__ double __ll2double_rz(long long int x);
|
| 1419 |
+
/**
|
| 1420 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1421 |
+
* \brief Convert a signed 64-bit int to a double in round-up mode.
|
| 1422 |
+
*
|
| 1423 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1424 |
+
* value in round-up (to positive infinity) mode.
|
| 1425 |
+
* \return Returns converted value.
|
| 1426 |
+
*/
|
| 1427 |
+
extern __device__ __device_builtin__ double __ll2double_ru(long long int x);
|
| 1428 |
+
/**
|
| 1429 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1430 |
+
* \brief Convert a signed 64-bit int to a double in round-down mode.
|
| 1431 |
+
*
|
| 1432 |
+
* Convert the signed 64-bit integer value \p x to a double-precision floating-point
|
| 1433 |
+
* value in round-down (to negative infinity) mode.
|
| 1434 |
+
* \return Returns converted value.
|
| 1435 |
+
*/
|
| 1436 |
+
extern __device__ __device_builtin__ double __ll2double_rd(long long int x);
|
| 1437 |
+
/**
|
| 1438 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1439 |
+
* \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
|
| 1440 |
+
*
|
| 1441 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1442 |
+
* value in round-to-nearest-even mode.
|
| 1443 |
+
* \return Returns converted value.
|
| 1444 |
+
*/
|
| 1445 |
+
extern __device__ __device_builtin__ double __ull2double_rn(unsigned long long int x);
|
| 1446 |
+
/**
|
| 1447 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1448 |
+
* \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
|
| 1449 |
+
*
|
| 1450 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1451 |
+
* value in round-towards-zero mode.
|
| 1452 |
+
* \return Returns converted value.
|
| 1453 |
+
*/
|
| 1454 |
+
extern __device__ __device_builtin__ double __ull2double_rz(unsigned long long int x);
|
| 1455 |
+
/**
|
| 1456 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1457 |
+
* \brief Convert an unsigned 64-bit int to a double in round-up mode.
|
| 1458 |
+
*
|
| 1459 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1460 |
+
* value in round-up (to positive infinity) mode.
|
| 1461 |
+
* \return Returns converted value.
|
| 1462 |
+
*/
|
| 1463 |
+
extern __device__ __device_builtin__ double __ull2double_ru(unsigned long long int x);
|
| 1464 |
+
/**
|
| 1465 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1466 |
+
* \brief Convert an unsigned 64-bit int to a double in round-down mode.
|
| 1467 |
+
*
|
| 1468 |
+
* Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
|
| 1469 |
+
* value in round-down (to negative infinity) mode.
|
| 1470 |
+
* \return Returns converted value.
|
| 1471 |
+
*/
|
| 1472 |
+
extern __device__ __device_builtin__ double __ull2double_rd(unsigned long long int x);
|
| 1473 |
+
/**
|
| 1474 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1475 |
+
* \brief Reinterpret high 32 bits in a double as a signed integer.
|
| 1476 |
+
*
|
| 1477 |
+
* Reinterpret the high 32 bits in the double-precision floating-point value \p x
|
| 1478 |
+
* as a signed integer.
|
| 1479 |
+
* \return Returns reinterpreted value.
|
| 1480 |
+
*/
|
| 1481 |
+
extern __device__ __device_builtin__ int __double2hiint(double x);
|
| 1482 |
+
/**
|
| 1483 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1484 |
+
* \brief Reinterpret low 32 bits in a double as a signed integer.
|
| 1485 |
+
*
|
| 1486 |
+
* Reinterpret the low 32 bits in the double-precision floating-point value \p x
|
| 1487 |
+
* as a signed integer.
|
| 1488 |
+
* \return Returns reinterpreted value.
|
| 1489 |
+
*/
|
| 1490 |
+
extern __device__ __device_builtin__ int __double2loint(double x);
|
| 1491 |
+
/**
|
| 1492 |
+
* \ingroup CUDA_MATH_INTRINSIC_CAST
|
| 1493 |
+
* \brief Reinterpret high and low 32-bit integer values as a double.
|
| 1494 |
+
*
|
| 1495 |
+
* Reinterpret the integer value of \p hi as the high 32 bits of a
|
| 1496 |
+
* double-precision floating-point value and the integer value of \p lo
|
| 1497 |
+
* as the low 32 bits of the same double-precision floating-point value.
|
| 1498 |
+
* \return Returns reinterpreted value.
|
| 1499 |
+
*/
|
| 1500 |
+
extern __device__ __device_builtin__ double __hiloint2double(int hi, int lo);
|
| 1501 |
+
|
| 1502 |
+
|
| 1503 |
+
}
|
| 1504 |
+
|
| 1505 |
+
/*******************************************************************************
|
| 1506 |
+
* *
|
| 1507 |
+
* *
|
| 1508 |
+
* *
|
| 1509 |
+
*******************************************************************************/
|
| 1510 |
+
__SM_20_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int ballot(bool pred) __DEF_IF_HOST
|
| 1511 |
+
|
| 1512 |
+
__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) __DEF_IF_HOST
|
| 1513 |
+
|
| 1514 |
+
__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) __DEF_IF_HOST
|
| 1515 |
+
|
| 1516 |
+
__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) __DEF_IF_HOST
|
| 1517 |
+
|
| 1518 |
+
#undef __DEPRECATED__
|
| 1519 |
+
#undef __WSB_DEPRECATION_MESSAGE
|
| 1520 |
+
|
| 1521 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) __DEF_IF_HOST
|
| 1522 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) __DEF_IF_HOST
|
| 1523 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) __DEF_IF_HOST
|
| 1524 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) __DEF_IF_HOST
|
| 1525 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
|
| 1526 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) __DEF_IF_HOST
|
| 1527 |
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
|
| 1528 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *ptr) __DEF_IF_HOST
|
| 1529 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *ptr) __DEF_IF_HOST
|
| 1530 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *ptr) __DEF_IF_HOST
|
| 1531 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *ptr) __DEF_IF_HOST
|
| 1532 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
|
| 1533 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) __DEF_IF_HOST
|
| 1534 |
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
|
| 1535 |
+
|
| 1536 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) __DEF_IF_HOST
|
| 1537 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) __DEF_IF_HOST
|
| 1538 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) __DEF_IF_HOST
|
| 1539 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) __DEF_IF_HOST
|
| 1540 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
|
| 1541 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) __DEF_IF_HOST
|
| 1542 |
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
|
| 1543 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 1544 |
+
|
| 1545 |
+
#undef __DEF_IF_HOST
|
| 1546 |
+
#undef __SM_20_INTRINSICS_DECL__
|
| 1547 |
+
|
| 1548 |
+
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
|
| 1549 |
+
#include "sm_20_intrinsics.hpp"
|
| 1550 |
+
#endif /* !__CUDACC_RTC__ */
|
| 1551 |
+
#endif /* !__SM_20_INTRINSICS_H__ && defined(__CUDA_ARCH__) */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.hpp
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_20_INTRINSICS_HPP__)
|
| 51 |
+
#define __SM_20_INTRINSICS_HPP__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_20_INTRINSICS_DECL__ __device__
|
| 55 |
+
#else /* __CUDACC_RTC__ */
|
| 56 |
+
#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
/*******************************************************************************
|
| 62 |
+
* *
|
| 63 |
+
* *
|
| 64 |
+
* *
|
| 65 |
+
*******************************************************************************/
|
| 66 |
+
|
| 67 |
+
#include "cuda_runtime_api.h"
|
| 68 |
+
|
| 69 |
+
/*******************************************************************************
|
| 70 |
+
* *
|
| 71 |
+
* *
|
| 72 |
+
* *
|
| 73 |
+
*******************************************************************************/
|
| 74 |
+
|
| 75 |
+
__SM_20_INTRINSICS_DECL__ unsigned int ballot(bool pred)
|
| 76 |
+
{
|
| 77 |
+
return __ballot((int)pred);
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred)
|
| 81 |
+
{
|
| 82 |
+
return __syncthreads_count((int)pred);
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred)
|
| 86 |
+
{
|
| 87 |
+
return (bool)__syncthreads_and((int)pred);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred)
|
| 91 |
+
{
|
| 92 |
+
return (bool)__syncthreads_or((int)pred);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
extern "C" {
|
| 97 |
+
__device__ unsigned __nv_isGlobal_impl(const void *);
|
| 98 |
+
__device__ unsigned __nv_isShared_impl(const void *);
|
| 99 |
+
__device__ unsigned __nv_isConstant_impl(const void *);
|
| 100 |
+
__device__ unsigned __nv_isLocal_impl(const void *);
|
| 101 |
+
__device__ unsigned __nv_isGridConstant_impl(const void *);
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr)
|
| 105 |
+
{
|
| 106 |
+
return __nv_isGlobal_impl(ptr);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr)
|
| 110 |
+
{
|
| 111 |
+
return __nv_isShared_impl(ptr);
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr)
|
| 115 |
+
{
|
| 116 |
+
return __nv_isConstant_impl(ptr);
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr)
|
| 120 |
+
{
|
| 121 |
+
return __nv_isLocal_impl(ptr);
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
|
| 125 |
+
__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr)
|
| 126 |
+
{
|
| 127 |
+
return __nv_isGridConstant_impl(ptr);
|
| 128 |
+
}
|
| 129 |
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
|
| 130 |
+
|
| 131 |
+
extern "C" {
|
| 132 |
+
__device__ size_t __nv_cvta_generic_to_global_impl(const void *);
|
| 133 |
+
__device__ size_t __nv_cvta_generic_to_shared_impl(const void *);
|
| 134 |
+
__device__ size_t __nv_cvta_generic_to_constant_impl(const void *);
|
| 135 |
+
__device__ size_t __nv_cvta_generic_to_local_impl(const void *);
|
| 136 |
+
__device__ void * __nv_cvta_global_to_generic_impl(size_t);
|
| 137 |
+
__device__ void * __nv_cvta_shared_to_generic_impl(size_t);
|
| 138 |
+
__device__ void * __nv_cvta_constant_to_generic_impl(size_t);
|
| 139 |
+
__device__ void * __nv_cvta_local_to_generic_impl(size_t);
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *p)
|
| 143 |
+
{
|
| 144 |
+
return __nv_cvta_generic_to_global_impl(p);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *p)
|
| 148 |
+
{
|
| 149 |
+
return __nv_cvta_generic_to_shared_impl(p);
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *p)
|
| 153 |
+
{
|
| 154 |
+
return __nv_cvta_generic_to_constant_impl(p);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *p)
|
| 158 |
+
{
|
| 159 |
+
return __nv_cvta_generic_to_local_impl(p);
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits)
|
| 163 |
+
{
|
| 164 |
+
return __nv_cvta_global_to_generic_impl(rawbits);
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits)
|
| 168 |
+
{
|
| 169 |
+
return __nv_cvta_shared_to_generic_impl(rawbits);
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits)
|
| 173 |
+
{
|
| 174 |
+
return __nv_cvta_constant_to_generic_impl(rawbits);
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits)
|
| 178 |
+
{
|
| 179 |
+
return __nv_cvta_local_to_generic_impl(rawbits);
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
|
| 183 |
+
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
|
| 184 |
+
#define __CVTA_PTR_64 1
|
| 185 |
+
#endif
|
| 186 |
+
|
| 187 |
+
__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr)
|
| 188 |
+
{
|
| 189 |
+
#if __CVTA_PTR_64
|
| 190 |
+
unsigned long long ret;
|
| 191 |
+
asm("cvta.to.param.u64 %0, %1;" : "=l"(ret) : "l"(ptr));
|
| 192 |
+
#else /* !__CVTA_PTR_64 */
|
| 193 |
+
unsigned ret;
|
| 194 |
+
asm("cvta.to.param.u32 %0, %1;" : "=r"(ret) : "r"(ptr));
|
| 195 |
+
#endif /* __CVTA_PTR_64 */
|
| 196 |
+
return (size_t)ret;
|
| 197 |
+
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits)
|
| 201 |
+
{
|
| 202 |
+
void *ret;
|
| 203 |
+
#if __CVTA_PTR_64
|
| 204 |
+
unsigned long long in = rawbits;
|
| 205 |
+
asm("cvta.param.u64 %0, %1;" : "=l"(ret) : "l"(in));
|
| 206 |
+
#else /* !__CVTA_PTR_64 */
|
| 207 |
+
unsigned in = rawbits;
|
| 208 |
+
asm("cvta.param.u32 %0, %1;" : "=r"(ret) : "r"(in));
|
| 209 |
+
#endif /* __CVTA_PTR_64 */
|
| 210 |
+
return ret;
|
| 211 |
+
}
|
| 212 |
+
#undef __CVTA_PTR_64
|
| 213 |
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 217 |
+
|
| 218 |
+
#undef __SM_20_INTRINSICS_DECL__
|
| 219 |
+
|
| 220 |
+
#endif /* !__SM_20_INTRINSICS_HPP__ */
|
| 221 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.h
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_30_INTRINSICS_H__)
|
| 51 |
+
#define __SM_30_INTRINSICS_H__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_30_INTRINSICS_DECL__ __device__
|
| 55 |
+
#else /* !__CUDACC_RTC__ */
|
| 56 |
+
#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#include "cuda_runtime_api.h"
|
| 70 |
+
|
| 71 |
+
#ifndef __CUDA_ARCH__
|
| 72 |
+
#define __DEF_IF_HOST { }
|
| 73 |
+
#else /* !__CUDA_ARCH__ */
|
| 74 |
+
#define __DEF_IF_HOST ;
|
| 75 |
+
#endif /* __CUDA_ARCH__ */
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
/*******************************************************************************
|
| 79 |
+
* *
|
| 80 |
+
* Below are declarations of SM-3.0 intrinsics which are included as *
|
| 81 |
+
* source (instead of being built in to the compiler) *
|
| 82 |
+
* *
|
| 83 |
+
*******************************************************************************/
|
| 84 |
+
|
| 85 |
+
#if !defined warpSize && !defined __local_warpSize
|
| 86 |
+
#define warpSize 32
|
| 87 |
+
#define __local_warpSize
|
| 88 |
+
#endif
|
| 89 |
+
|
| 90 |
+
#if defined(_WIN32)
|
| 91 |
+
# define __DEPRECATED__(msg) __declspec(deprecated(msg))
|
| 92 |
+
#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
|
| 93 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated))
|
| 94 |
+
#else
|
| 95 |
+
# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
|
| 96 |
+
#endif
|
| 97 |
+
|
| 98 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
|
| 99 |
+
#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
|
| 100 |
+
#endif
|
| 101 |
+
|
| 102 |
+
__SM_30_INTRINSICS_DECL__ unsigned __fns(unsigned mask, unsigned base, int offset) __DEF_IF_HOST
|
| 103 |
+
__SM_30_INTRINSICS_DECL__ void __barrier_sync(unsigned id) __DEF_IF_HOST
|
| 104 |
+
__SM_30_INTRINSICS_DECL__ void __barrier_sync_count(unsigned id, unsigned cnt) __DEF_IF_HOST
|
| 105 |
+
__SM_30_INTRINSICS_DECL__ void __syncwarp(unsigned mask=0xFFFFFFFF) __DEF_IF_HOST
|
| 106 |
+
__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) __DEF_IF_HOST
|
| 107 |
+
__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) __DEF_IF_HOST
|
| 108 |
+
__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) __DEF_IF_HOST
|
| 109 |
+
__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) __DEF_IF_HOST
|
| 110 |
+
__SM_30_INTRINSICS_DECL__ unsigned __activemask() __DEF_IF_HOST
|
| 111 |
+
|
| 112 |
+
// Warp register exchange (shuffle) intrinsics.
|
| 113 |
+
// Notes:
|
| 114 |
+
// a) Warp size is hardcoded to 32 here, because the compiler does not know
|
| 115 |
+
// the "warpSize" constant at this time
|
| 116 |
+
// b) we cannot map the float __shfl to the int __shfl because it'll mess with
|
| 117 |
+
// the register number (especially if you're doing two shfls to move a double).
|
| 118 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
|
| 119 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) int __shfl(int var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 120 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned int __shfl(unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 121 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) int __shfl_up(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 122 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned int __shfl_up(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 123 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) int __shfl_down(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 124 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned int __shfl_down(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 125 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) int __shfl_xor(int var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 126 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned int __shfl_xor(unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 127 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) float __shfl(float var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 128 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) float __shfl_up(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 129 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) float __shfl_down(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 130 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) float __shfl_xor(float var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 131 |
+
#endif
|
| 132 |
+
|
| 133 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 134 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 135 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 136 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 137 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 138 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 139 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 140 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 141 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 142 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 143 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 144 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 145 |
+
|
| 146 |
+
// 64-bits SHFL
|
| 147 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
|
| 148 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long long __shfl(unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 149 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long long __shfl(long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 150 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long long __shfl_up(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 151 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 152 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long long __shfl_down(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 153 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 154 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long long __shfl_xor(long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 155 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 156 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) double __shfl(double var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 157 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) double __shfl_up(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 158 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) double __shfl_down(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 159 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) double __shfl_xor(double var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 160 |
+
#endif
|
| 161 |
+
|
| 162 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 163 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 164 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 165 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 166 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 167 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 168 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 169 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 170 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 171 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 172 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 173 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 174 |
+
|
| 175 |
+
// long needs some help to choose between 32-bits and 64-bits
|
| 176 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
|
| 177 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long __shfl(long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 178 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long __shfl(unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 179 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long __shfl_up(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 180 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long __shfl_up(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 181 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long __shfl_down(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 182 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long __shfl_down(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 183 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long __shfl_xor(long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 184 |
+
__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long __shfl_xor(unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 185 |
+
#endif
|
| 186 |
+
|
| 187 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 188 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
|
| 189 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 190 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 191 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 192 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
|
| 193 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 194 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
|
| 195 |
+
|
| 196 |
+
#undef __DEPRECATED__
|
| 197 |
+
#undef __WSB_DEPRECATION_MESSAGE
|
| 198 |
+
|
| 199 |
+
#if defined(__local_warpSize)
|
| 200 |
+
#undef warpSize
|
| 201 |
+
#undef __local_warpSize
|
| 202 |
+
#endif
|
| 203 |
+
|
| 204 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
|
| 205 |
+
|
| 206 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 207 |
+
|
| 208 |
+
#undef __DEF_IF_HOST
|
| 209 |
+
#undef __SM_30_INTRINSICS_DECL__
|
| 210 |
+
|
| 211 |
+
#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
|
| 212 |
+
#include "sm_30_intrinsics.hpp"
|
| 213 |
+
#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
|
| 214 |
+
|
| 215 |
+
#endif /* !__SM_30_INTRINSICS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.hpp
ADDED
|
@@ -0,0 +1,604 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_30_INTRINSICS_HPP__)
|
| 51 |
+
#define __SM_30_INTRINSICS_HPP__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_30_INTRINSICS_DECL__ __device__
|
| 55 |
+
#else /* !__CUDACC_RTC__ */
|
| 56 |
+
#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#include "cuda_runtime_api.h"
|
| 70 |
+
|
| 71 |
+
// In here are intrinsics which are built in to the compiler. These may be
|
| 72 |
+
// referenced by intrinsic implementations from this file.
|
| 73 |
+
extern "C"
|
| 74 |
+
{
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/*******************************************************************************
|
| 78 |
+
* *
|
| 79 |
+
* Below are implementations of SM-3.0 intrinsics which are included as *
|
| 80 |
+
* source (instead of being built in to the compiler) *
|
| 81 |
+
* *
|
| 82 |
+
*******************************************************************************/
|
| 83 |
+
|
| 84 |
+
#if !defined warpSize && !defined __local_warpSize
|
| 85 |
+
#define warpSize 32
|
| 86 |
+
#define __local_warpSize
|
| 87 |
+
#endif
|
| 88 |
+
|
| 89 |
+
__SM_30_INTRINSICS_DECL__
|
| 90 |
+
unsigned __fns(unsigned mask, unsigned base, int offset) {
|
| 91 |
+
extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset);
|
| 92 |
+
return __nvvm_fns(mask, base, offset);
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
__SM_30_INTRINSICS_DECL__
|
| 96 |
+
void __barrier_sync(unsigned id) {
|
| 97 |
+
extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id);
|
| 98 |
+
return __nvvm_barrier_sync(id);
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
__SM_30_INTRINSICS_DECL__
|
| 102 |
+
void __barrier_sync_count(unsigned id, unsigned cnt) {
|
| 103 |
+
extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt);
|
| 104 |
+
return __nvvm_barrier_sync_cnt(id, cnt);
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
__SM_30_INTRINSICS_DECL__
|
| 108 |
+
void __syncwarp(unsigned mask) {
|
| 109 |
+
extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask);
|
| 110 |
+
return __nvvm_bar_warp_sync(mask);
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
__SM_30_INTRINSICS_DECL__
|
| 114 |
+
int __all_sync(unsigned mask, int pred) {
|
| 115 |
+
extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred);
|
| 116 |
+
return __nvvm_vote_all_sync(mask, pred);
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
__SM_30_INTRINSICS_DECL__
|
| 120 |
+
int __any_sync(unsigned mask, int pred) {
|
| 121 |
+
extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred);
|
| 122 |
+
return __nvvm_vote_any_sync(mask, pred);
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
__SM_30_INTRINSICS_DECL__
|
| 126 |
+
int __uni_sync(unsigned mask, int pred) {
|
| 127 |
+
extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred);
|
| 128 |
+
return __nvvm_vote_uni_sync(mask, pred);
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
__SM_30_INTRINSICS_DECL__
|
| 132 |
+
unsigned __ballot_sync(unsigned mask, int pred) {
|
| 133 |
+
extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred);
|
| 134 |
+
return __nvvm_vote_ballot_sync(mask, pred);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
__SM_30_INTRINSICS_DECL__
|
| 138 |
+
unsigned __activemask() {
|
| 139 |
+
unsigned ret;
|
| 140 |
+
asm volatile ("activemask.b32 %0;" : "=r"(ret));
|
| 141 |
+
return ret;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
// These are removed starting with compute_70 and onwards
|
| 145 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
|
| 146 |
+
|
| 147 |
+
__SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) {
|
| 148 |
+
int ret;
|
| 149 |
+
int c = ((warpSize-width) << 8) | 0x1f;
|
| 150 |
+
asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(srcLane), "r"(c));
|
| 151 |
+
return ret;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) {
|
| 155 |
+
return (unsigned int) __shfl((int)var, srcLane, width);
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) {
|
| 159 |
+
int ret;
|
| 160 |
+
int c = (warpSize-width) << 8;
|
| 161 |
+
asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
|
| 162 |
+
return ret;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) {
|
| 166 |
+
return (unsigned int) __shfl_up((int)var, delta, width);
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) {
|
| 170 |
+
int ret;
|
| 171 |
+
int c = ((warpSize-width) << 8) | 0x1f;
|
| 172 |
+
asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
|
| 173 |
+
return ret;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) {
|
| 177 |
+
return (unsigned int) __shfl_down((int)var, delta, width);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) {
|
| 181 |
+
int ret;
|
| 182 |
+
int c = ((warpSize-width) << 8) | 0x1f;
|
| 183 |
+
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(laneMask), "r"(c));
|
| 184 |
+
return ret;
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) {
|
| 188 |
+
return (unsigned int) __shfl_xor((int)var, laneMask, width);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
__SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) {
|
| 192 |
+
float ret;
|
| 193 |
+
int c;
|
| 194 |
+
c = ((warpSize-width) << 8) | 0x1f;
|
| 195 |
+
asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
|
| 196 |
+
return ret;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) {
|
| 200 |
+
float ret;
|
| 201 |
+
int c;
|
| 202 |
+
c = (warpSize-width) << 8;
|
| 203 |
+
asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
|
| 204 |
+
return ret;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) {
|
| 208 |
+
float ret;
|
| 209 |
+
int c;
|
| 210 |
+
c = ((warpSize-width) << 8) | 0x1f;
|
| 211 |
+
asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
|
| 212 |
+
return ret;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) {
|
| 216 |
+
float ret;
|
| 217 |
+
int c;
|
| 218 |
+
c = ((warpSize-width) << 8) | 0x1f;
|
| 219 |
+
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
|
| 220 |
+
return ret;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
// 64-bits SHFL
|
| 224 |
+
|
| 225 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) {
|
| 226 |
+
int lo, hi;
|
| 227 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 228 |
+
hi = __shfl(hi, srcLane, width);
|
| 229 |
+
lo = __shfl(lo, srcLane, width);
|
| 230 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 231 |
+
return var;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) {
|
| 235 |
+
return (unsigned long long) __shfl((long long) var, srcLane, width);
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) {
|
| 239 |
+
int lo, hi;
|
| 240 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 241 |
+
hi = __shfl_up(hi, delta, width);
|
| 242 |
+
lo = __shfl_up(lo, delta, width);
|
| 243 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 244 |
+
return var;
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) {
|
| 248 |
+
return (unsigned long long) __shfl_up((long long) var, delta, width);
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) {
|
| 252 |
+
int lo, hi;
|
| 253 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 254 |
+
hi = __shfl_down(hi, delta, width);
|
| 255 |
+
lo = __shfl_down(lo, delta, width);
|
| 256 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 257 |
+
return var;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) {
|
| 261 |
+
return (unsigned long long) __shfl_down((long long) var, delta, width);
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) {
|
| 265 |
+
int lo, hi;
|
| 266 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 267 |
+
hi = __shfl_xor(hi, laneMask, width);
|
| 268 |
+
lo = __shfl_xor(lo, laneMask, width);
|
| 269 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 270 |
+
return var;
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) {
|
| 274 |
+
return (unsigned long long) __shfl_xor((long long) var, laneMask, width);
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
__SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) {
|
| 278 |
+
unsigned lo, hi;
|
| 279 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 280 |
+
hi = __shfl(hi, srcLane, width);
|
| 281 |
+
lo = __shfl(lo, srcLane, width);
|
| 282 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 283 |
+
return var;
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) {
|
| 287 |
+
unsigned lo, hi;
|
| 288 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 289 |
+
hi = __shfl_up(hi, delta, width);
|
| 290 |
+
lo = __shfl_up(lo, delta, width);
|
| 291 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 292 |
+
return var;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) {
|
| 296 |
+
unsigned lo, hi;
|
| 297 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 298 |
+
hi = __shfl_down(hi, delta, width);
|
| 299 |
+
lo = __shfl_down(lo, delta, width);
|
| 300 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 301 |
+
return var;
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) {
|
| 305 |
+
unsigned lo, hi;
|
| 306 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 307 |
+
hi = __shfl_xor(hi, laneMask, width);
|
| 308 |
+
lo = __shfl_xor(lo, laneMask, width);
|
| 309 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 310 |
+
return var;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
__SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) {
|
| 314 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 315 |
+
__shfl((long long) var, srcLane, width) :
|
| 316 |
+
__shfl((int) var, srcLane, width);
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) {
|
| 320 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 321 |
+
__shfl((unsigned long long) var, srcLane, width) :
|
| 322 |
+
__shfl((unsigned int) var, srcLane, width);
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) {
|
| 326 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 327 |
+
__shfl_up((long long) var, delta, width) :
|
| 328 |
+
__shfl_up((int) var, delta, width);
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) {
|
| 332 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 333 |
+
__shfl_up((unsigned long long) var, delta, width) :
|
| 334 |
+
__shfl_up((unsigned int) var, delta, width);
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) {
|
| 338 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 339 |
+
__shfl_down((long long) var, delta, width) :
|
| 340 |
+
__shfl_down((int) var, delta, width);
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) {
|
| 344 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 345 |
+
__shfl_down((unsigned long long) var, delta, width) :
|
| 346 |
+
__shfl_down((unsigned int) var, delta, width);
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) {
|
| 350 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 351 |
+
__shfl_xor((long long) var, laneMask, width) :
|
| 352 |
+
__shfl_xor((int) var, laneMask, width);
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) {
|
| 356 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 357 |
+
__shfl_xor((unsigned long long) var, laneMask, width) :
|
| 358 |
+
__shfl_xor((unsigned int) var, laneMask, width);
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
|
| 362 |
+
|
| 363 |
+
// Warp register exchange (shuffle) intrinsics.
|
| 364 |
+
// Notes:
|
| 365 |
+
// a) Warp size is hardcoded to 32 here, because the compiler does not know
|
| 366 |
+
// the "warpSize" constant at this time
|
| 367 |
+
// b) we cannot map the float __shfl to the int __shfl because it'll mess with
|
| 368 |
+
// the register number (especially if you're doing two shfls to move a double).
|
| 369 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) {
|
| 370 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 371 |
+
int ret;
|
| 372 |
+
int c = ((warpSize-width) << 8) | 0x1f;
|
| 373 |
+
ret = __nvvm_shfl_idx_sync(mask, var, srcLane, c);
|
| 374 |
+
return ret;
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) {
|
| 378 |
+
return (unsigned int) __shfl_sync(mask, (int)var, srcLane, width);
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) {
|
| 382 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 383 |
+
int ret;
|
| 384 |
+
int c = (warpSize-width) << 8;
|
| 385 |
+
ret = __nvvm_shfl_up_sync(mask, var, delta, c);
|
| 386 |
+
return ret;
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
|
| 390 |
+
return (unsigned int) __shfl_up_sync(mask, (int)var, delta, width);
|
| 391 |
+
}
|
| 392 |
+
|
| 393 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) {
|
| 394 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 395 |
+
int ret;
|
| 396 |
+
int c = ((warpSize-width) << 8) | 0x1f;
|
| 397 |
+
ret = __nvvm_shfl_down_sync(mask, var, delta, c);
|
| 398 |
+
return ret;
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
|
| 402 |
+
return (unsigned int) __shfl_down_sync(mask, (int)var, delta, width);
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
|
| 406 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 407 |
+
int ret;
|
| 408 |
+
int c = ((warpSize-width) << 8) | 0x1f;
|
| 409 |
+
ret = __nvvm_shfl_bfly_sync(mask, var, laneMask, c);
|
| 410 |
+
return ret;
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) {
|
| 414 |
+
return (unsigned int) __shfl_xor_sync(mask, (int)var, laneMask, width);
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) {
|
| 418 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 419 |
+
int ret;
|
| 420 |
+
int c;
|
| 421 |
+
c = ((warpSize-width) << 8) | 0x1f;
|
| 422 |
+
ret = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, c);
|
| 423 |
+
return __int_as_float(ret);
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) {
|
| 427 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 428 |
+
int ret;
|
| 429 |
+
int c;
|
| 430 |
+
c = (warpSize-width) << 8;
|
| 431 |
+
ret = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, c);
|
| 432 |
+
return __int_as_float(ret);
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) {
|
| 436 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 437 |
+
int ret;
|
| 438 |
+
int c;
|
| 439 |
+
c = ((warpSize-width) << 8) | 0x1f;
|
| 440 |
+
ret = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, c);
|
| 441 |
+
return __int_as_float(ret);
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
|
| 445 |
+
extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
|
| 446 |
+
int ret;
|
| 447 |
+
int c;
|
| 448 |
+
c = ((warpSize-width) << 8) | 0x1f;
|
| 449 |
+
ret = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, c);
|
| 450 |
+
return __int_as_float(ret);
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
// 64-bits SHFL
|
| 454 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) {
|
| 455 |
+
int lo, hi;
|
| 456 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 457 |
+
hi = __shfl_sync(mask, hi, srcLane, width);
|
| 458 |
+
lo = __shfl_sync(mask, lo, srcLane, width);
|
| 459 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 460 |
+
return var;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) {
|
| 464 |
+
return (unsigned long long) __shfl_sync(mask, (long long) var, srcLane, width);
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) {
|
| 468 |
+
int lo, hi;
|
| 469 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 470 |
+
hi = __shfl_up_sync(mask, hi, delta, width);
|
| 471 |
+
lo = __shfl_up_sync(mask, lo, delta, width);
|
| 472 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 473 |
+
return var;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
|
| 477 |
+
return (unsigned long long) __shfl_up_sync(mask, (long long) var, delta, width);
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) {
|
| 481 |
+
int lo, hi;
|
| 482 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 483 |
+
hi = __shfl_down_sync(mask, hi, delta, width);
|
| 484 |
+
lo = __shfl_down_sync(mask, lo, delta, width);
|
| 485 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 486 |
+
return var;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
|
| 490 |
+
return (unsigned long long) __shfl_down_sync(mask, (long long) var, delta, width);
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) {
|
| 494 |
+
int lo, hi;
|
| 495 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
|
| 496 |
+
hi = __shfl_xor_sync(mask, hi, laneMask, width);
|
| 497 |
+
lo = __shfl_xor_sync(mask, lo, laneMask, width);
|
| 498 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
|
| 499 |
+
return var;
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) {
|
| 503 |
+
return (unsigned long long) __shfl_xor_sync(mask, (long long) var, laneMask, width);
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) {
|
| 507 |
+
unsigned lo, hi;
|
| 508 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 509 |
+
hi = __shfl_sync(mask, hi, srcLane, width);
|
| 510 |
+
lo = __shfl_sync(mask, lo, srcLane, width);
|
| 511 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 512 |
+
return var;
|
| 513 |
+
}
|
| 514 |
+
|
| 515 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) {
|
| 516 |
+
unsigned lo, hi;
|
| 517 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 518 |
+
hi = __shfl_up_sync(mask, hi, delta, width);
|
| 519 |
+
lo = __shfl_up_sync(mask, lo, delta, width);
|
| 520 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 521 |
+
return var;
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) {
|
| 525 |
+
unsigned lo, hi;
|
| 526 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 527 |
+
hi = __shfl_down_sync(mask, hi, delta, width);
|
| 528 |
+
lo = __shfl_down_sync(mask, lo, delta, width);
|
| 529 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 530 |
+
return var;
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) {
|
| 534 |
+
unsigned lo, hi;
|
| 535 |
+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
|
| 536 |
+
hi = __shfl_xor_sync(mask, hi, laneMask, width);
|
| 537 |
+
lo = __shfl_xor_sync(mask, lo, laneMask, width);
|
| 538 |
+
asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
|
| 539 |
+
return var;
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
// long needs some help to choose between 32-bits and 64-bits
|
| 543 |
+
|
| 544 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) {
|
| 545 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 546 |
+
__shfl_sync(mask, (long long) var, srcLane, width) :
|
| 547 |
+
__shfl_sync(mask, (int) var, srcLane, width);
|
| 548 |
+
}
|
| 549 |
+
|
| 550 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) {
|
| 551 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 552 |
+
__shfl_sync(mask, (unsigned long long) var, srcLane, width) :
|
| 553 |
+
__shfl_sync(mask, (unsigned int) var, srcLane, width);
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) {
|
| 557 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 558 |
+
__shfl_up_sync(mask, (long long) var, delta, width) :
|
| 559 |
+
__shfl_up_sync(mask, (int) var, delta, width);
|
| 560 |
+
}
|
| 561 |
+
|
| 562 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
|
| 563 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 564 |
+
__shfl_up_sync(mask, (unsigned long long) var, delta, width) :
|
| 565 |
+
__shfl_up_sync(mask, (unsigned int) var, delta, width);
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) {
|
| 569 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 570 |
+
__shfl_down_sync(mask, (long long) var, delta, width) :
|
| 571 |
+
__shfl_down_sync(mask, (int) var, delta, width);
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
|
| 575 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 576 |
+
__shfl_down_sync(mask, (unsigned long long) var, delta, width) :
|
| 577 |
+
__shfl_down_sync(mask, (unsigned int) var, delta, width);
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) {
|
| 581 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 582 |
+
__shfl_xor_sync(mask, (long long) var, laneMask, width) :
|
| 583 |
+
__shfl_xor_sync(mask, (int) var, laneMask, width);
|
| 584 |
+
}
|
| 585 |
+
|
| 586 |
+
__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) {
|
| 587 |
+
return (sizeof(long) == sizeof(long long)) ?
|
| 588 |
+
__shfl_xor_sync(mask, (unsigned long long) var, laneMask, width) :
|
| 589 |
+
__shfl_xor_sync(mask, (unsigned int) var, laneMask, width);
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
#if defined(__local_warpSize)
|
| 593 |
+
#undef warpSize
|
| 594 |
+
#undef __local_warpSize
|
| 595 |
+
#endif
|
| 596 |
+
|
| 597 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
|
| 598 |
+
|
| 599 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 600 |
+
|
| 601 |
+
#undef __SM_30_INTRINSICS_DECL__
|
| 602 |
+
|
| 603 |
+
#endif /* !__SM_30_INTRINSICS_HPP__ */
|
| 604 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_atomic_functions.h
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.35.235 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_35_ATOMIC_FUNCTIONS_H__)
|
| 51 |
+
#define __SM_35_ATOMIC_FUNCTIONS_H__
|
| 52 |
+
|
| 53 |
+
/*******************************************************************************
|
| 54 |
+
* All sm_35 atomics are supported by sm_32 so simply include its header file *
|
| 55 |
+
*******************************************************************************/
|
| 56 |
+
#include "sm_32_atomic_functions.h"
|
| 57 |
+
|
| 58 |
+
#endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.hpp
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 2016 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SM_61_INTRINSICS_HPP__)
|
| 51 |
+
#define __SM_61_INTRINSICS_HPP__
|
| 52 |
+
|
| 53 |
+
#if defined(__CUDACC_RTC__)
|
| 54 |
+
#define __SM_61_INTRINSICS_DECL__ __device__
|
| 55 |
+
#else /* !__CUDACC_RTC__ */
|
| 56 |
+
#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
|
| 57 |
+
#endif /* __CUDACC_RTC__ */
|
| 58 |
+
|
| 59 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 60 |
+
|
| 61 |
+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
|
| 62 |
+
|
| 63 |
+
/*******************************************************************************
|
| 64 |
+
* *
|
| 65 |
+
* *
|
| 66 |
+
* *
|
| 67 |
+
*******************************************************************************/
|
| 68 |
+
|
| 69 |
+
#include "cuda_runtime_api.h"
|
| 70 |
+
|
| 71 |
+
/*******************************************************************************
|
| 72 |
+
* *
|
| 73 |
+
* Below are implementations of SM-6.1 intrinsics which are included as *
|
| 74 |
+
* source (instead of being built in to the compiler) *
|
| 75 |
+
* *
|
| 76 |
+
*******************************************************************************/
|
| 77 |
+
|
| 78 |
+
// 4a
|
| 79 |
+
__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) {
|
| 80 |
+
int ret;
|
| 81 |
+
asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
|
| 82 |
+
return ret;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) {
|
| 86 |
+
unsigned int ret;
|
| 87 |
+
asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
|
| 88 |
+
return ret;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) {
|
| 92 |
+
int ret;
|
| 93 |
+
asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
|
| 94 |
+
return ret;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) {
|
| 98 |
+
unsigned int ret;
|
| 99 |
+
asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
|
| 100 |
+
return ret;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
// 2a.lo
|
| 104 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) {
|
| 105 |
+
int ret;
|
| 106 |
+
asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
|
| 107 |
+
return ret;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) {
|
| 111 |
+
unsigned int ret;
|
| 112 |
+
asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
|
| 113 |
+
return ret;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) {
|
| 117 |
+
int ret;
|
| 118 |
+
asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
|
| 119 |
+
return ret;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) {
|
| 123 |
+
unsigned int ret;
|
| 124 |
+
asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
|
| 125 |
+
return ret;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
// 2a.hi
|
| 129 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) {
|
| 130 |
+
int ret;
|
| 131 |
+
asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
|
| 132 |
+
return ret;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) {
|
| 136 |
+
unsigned int ret;
|
| 137 |
+
asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
|
| 138 |
+
return ret;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) {
|
| 142 |
+
int ret;
|
| 143 |
+
asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
|
| 144 |
+
return ret;
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) {
|
| 148 |
+
unsigned int ret;
|
| 149 |
+
asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
|
| 150 |
+
return ret;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
|
| 155 |
+
|
| 156 |
+
#endif /* __cplusplus && __CUDACC__ */
|
| 157 |
+
|
| 158 |
+
#undef __SM_61_INTRINSICS_DECL__
|
| 159 |
+
|
| 160 |
+
#endif /* !__SM_61_INTRINSICS_HPP__ */
|
| 161 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_indirect_functions.h
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
#ifndef __SURFACE_INDIRECT_FUNCTIONS_H__
|
| 52 |
+
#define __SURFACE_INDIRECT_FUNCTIONS_H__
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 56 |
+
|
| 57 |
+
#include "cuda_runtime_api.h"
|
| 58 |
+
|
| 59 |
+
template<typename T> struct __nv_isurf_trait { };
|
| 60 |
+
template<> struct __nv_isurf_trait<char> { typedef void type; };
|
| 61 |
+
template<> struct __nv_isurf_trait<signed char> { typedef void type; };
|
| 62 |
+
template<> struct __nv_isurf_trait<char1> { typedef void type; };
|
| 63 |
+
template<> struct __nv_isurf_trait<unsigned char> { typedef void type; };
|
| 64 |
+
template<> struct __nv_isurf_trait<uchar1> { typedef void type; };
|
| 65 |
+
template<> struct __nv_isurf_trait<short> { typedef void type; };
|
| 66 |
+
template<> struct __nv_isurf_trait<short1> { typedef void type; };
|
| 67 |
+
template<> struct __nv_isurf_trait<unsigned short> { typedef void type; };
|
| 68 |
+
template<> struct __nv_isurf_trait<ushort1> { typedef void type; };
|
| 69 |
+
template<> struct __nv_isurf_trait<int> { typedef void type; };
|
| 70 |
+
template<> struct __nv_isurf_trait<int1> { typedef void type; };
|
| 71 |
+
template<> struct __nv_isurf_trait<unsigned int> { typedef void type; };
|
| 72 |
+
template<> struct __nv_isurf_trait<uint1> { typedef void type; };
|
| 73 |
+
template<> struct __nv_isurf_trait<long long> { typedef void type; };
|
| 74 |
+
template<> struct __nv_isurf_trait<longlong1> { typedef void type; };
|
| 75 |
+
template<> struct __nv_isurf_trait<unsigned long long> { typedef void type; };
|
| 76 |
+
template<> struct __nv_isurf_trait<ulonglong1> { typedef void type; };
|
| 77 |
+
template<> struct __nv_isurf_trait<float> { typedef void type; };
|
| 78 |
+
template<> struct __nv_isurf_trait<float1> { typedef void type; };
|
| 79 |
+
|
| 80 |
+
template<> struct __nv_isurf_trait<char2> { typedef void type; };
|
| 81 |
+
template<> struct __nv_isurf_trait<uchar2> { typedef void type; };
|
| 82 |
+
template<> struct __nv_isurf_trait<short2> { typedef void type; };
|
| 83 |
+
template<> struct __nv_isurf_trait<ushort2> { typedef void type; };
|
| 84 |
+
template<> struct __nv_isurf_trait<int2> { typedef void type; };
|
| 85 |
+
template<> struct __nv_isurf_trait<uint2> { typedef void type; };
|
| 86 |
+
template<> struct __nv_isurf_trait<longlong2> { typedef void type; };
|
| 87 |
+
template<> struct __nv_isurf_trait<ulonglong2> { typedef void type; };
|
| 88 |
+
template<> struct __nv_isurf_trait<float2> { typedef void type; };
|
| 89 |
+
|
| 90 |
+
template<> struct __nv_isurf_trait<char4> { typedef void type; };
|
| 91 |
+
template<> struct __nv_isurf_trait<uchar4> { typedef void type; };
|
| 92 |
+
template<> struct __nv_isurf_trait<short4> { typedef void type; };
|
| 93 |
+
template<> struct __nv_isurf_trait<ushort4> { typedef void type; };
|
| 94 |
+
template<> struct __nv_isurf_trait<int4> { typedef void type; };
|
| 95 |
+
template<> struct __nv_isurf_trait<uint4> { typedef void type; };
|
| 96 |
+
template<> struct __nv_isurf_trait<float4> { typedef void type; };
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
template <typename T>
|
| 100 |
+
static __device__ typename __nv_isurf_trait<T>::type surf1Dread(T *ptr, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 101 |
+
{
|
| 102 |
+
#ifdef __CUDA_ARCH__
|
| 103 |
+
__nv_tex_surf_handler("__isurf1Dread", ptr, obj, x, mode);
|
| 104 |
+
#endif /* __CUDA_ARCH__ */
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
template <class T>
|
| 108 |
+
static __device__ T surf1Dread(cudaSurfaceObject_t surfObject, int x, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
|
| 109 |
+
{
|
| 110 |
+
#ifdef __CUDA_ARCH__
|
| 111 |
+
T ret;
|
| 112 |
+
surf1Dread(&ret, surfObject, x, boundaryMode);
|
| 113 |
+
return ret;
|
| 114 |
+
#endif /* __CUDA_ARCH__ */
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
template <typename T>
|
| 118 |
+
static __device__ typename __nv_isurf_trait<T>::type surf2Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 119 |
+
{
|
| 120 |
+
#ifdef __CUDA_ARCH__
|
| 121 |
+
__nv_tex_surf_handler("__isurf2Dread", ptr, obj, x, y, mode);
|
| 122 |
+
#endif /* __CUDA_ARCH__ */
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
template <class T>
|
| 126 |
+
static __device__ T surf2Dread(cudaSurfaceObject_t surfObject, int x, int y, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
|
| 127 |
+
{
|
| 128 |
+
#ifdef __CUDA_ARCH__
|
| 129 |
+
T ret;
|
| 130 |
+
surf2Dread(&ret, surfObject, x, y, boundaryMode);
|
| 131 |
+
return ret;
|
| 132 |
+
#endif /* __CUDA_ARCH__ */
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
template <typename T>
|
| 137 |
+
static __device__ typename __nv_isurf_trait<T>::type surf3Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 138 |
+
{
|
| 139 |
+
#ifdef __CUDA_ARCH__
|
| 140 |
+
__nv_tex_surf_handler("__isurf3Dread", ptr, obj, x, y, z, mode);
|
| 141 |
+
#endif /* __CUDA_ARCH__ */
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
template <class T>
|
| 145 |
+
static __device__ T surf3Dread(cudaSurfaceObject_t surfObject, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
|
| 146 |
+
{
|
| 147 |
+
#ifdef __CUDA_ARCH__
|
| 148 |
+
T ret;
|
| 149 |
+
surf3Dread(&ret, surfObject, x, y, z, boundaryMode);
|
| 150 |
+
return ret;
|
| 151 |
+
#endif /* __CUDA_ARCH__ */
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
template <typename T>
|
| 155 |
+
static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 156 |
+
{
|
| 157 |
+
#ifdef __CUDA_ARCH__
|
| 158 |
+
__nv_tex_surf_handler("__isurf1DLayeredread", ptr, obj, x, layer, mode);
|
| 159 |
+
#endif /* __CUDA_ARCH__ */
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
template <class T>
|
| 163 |
+
static __device__ T surf1DLayeredread(cudaSurfaceObject_t surfObject, int x, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
|
| 164 |
+
{
|
| 165 |
+
#ifdef __CUDA_ARCH__
|
| 166 |
+
T ret;
|
| 167 |
+
surf1DLayeredread(&ret, surfObject, x, layer, boundaryMode);
|
| 168 |
+
return ret;
|
| 169 |
+
#endif /* __CUDA_ARCH__ */
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
template <typename T>
|
| 173 |
+
static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 174 |
+
{
|
| 175 |
+
#ifdef __CUDA_ARCH__
|
| 176 |
+
__nv_tex_surf_handler("__isurf2DLayeredread", ptr, obj, x, y, layer, mode);
|
| 177 |
+
#endif /* __CUDA_ARCH__ */
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
template <class T>
|
| 181 |
+
static __device__ T surf2DLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
|
| 182 |
+
{
|
| 183 |
+
#ifdef __CUDA_ARCH__
|
| 184 |
+
T ret;
|
| 185 |
+
surf2DLayeredread(&ret, surfObject, x, y, layer, boundaryMode);
|
| 186 |
+
return ret;
|
| 187 |
+
#endif /* __CUDA_ARCH__ */
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
template <typename T>
|
| 191 |
+
static __device__ typename __nv_isurf_trait<T>::type surfCubemapread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 192 |
+
{
|
| 193 |
+
#ifdef __CUDA_ARCH__
|
| 194 |
+
__nv_tex_surf_handler("__isurfCubemapread", ptr, obj, x, y, face, mode);
|
| 195 |
+
#endif /* __CUDA_ARCH__ */
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
template <class T>
|
| 199 |
+
static __device__ T surfCubemapread(cudaSurfaceObject_t surfObject, int x, int y, int face, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
|
| 200 |
+
{
|
| 201 |
+
#ifdef __CUDA_ARCH__
|
| 202 |
+
T ret;
|
| 203 |
+
surfCubemapread(&ret, surfObject, x, y, face, boundaryMode);
|
| 204 |
+
return ret;
|
| 205 |
+
#endif /* __CUDA_ARCH__ */
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
template <typename T>
|
| 209 |
+
static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 210 |
+
{
|
| 211 |
+
#ifdef __CUDA_ARCH__
|
| 212 |
+
__nv_tex_surf_handler("__isurfCubemapLayeredread", ptr, obj, x, y, layerface, mode);
|
| 213 |
+
#endif /* __CUDA_ARCH__ */
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
template <class T>
|
| 217 |
+
static __device__ T surfCubemapLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layerface, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
|
| 218 |
+
{
|
| 219 |
+
#ifdef __CUDA_ARCH__
|
| 220 |
+
T ret;
|
| 221 |
+
surfCubemapLayeredread(&ret, surfObject, x, y, layerface, boundaryMode);
|
| 222 |
+
return ret;
|
| 223 |
+
#endif /* __CUDA_ARCH__ */
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
template <typename T>
|
| 227 |
+
static __device__ typename __nv_isurf_trait<T>::type surf1Dwrite(T val, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 228 |
+
{
|
| 229 |
+
#ifdef __CUDA_ARCH__
|
| 230 |
+
__nv_tex_surf_handler("__isurf1Dwrite_v2", &val, obj, x, mode);
|
| 231 |
+
#endif /* __CUDA_ARCH__ */
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
template <typename T>
|
| 235 |
+
static __device__ typename __nv_isurf_trait<T>::type surf2Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 236 |
+
{
|
| 237 |
+
#ifdef __CUDA_ARCH__
|
| 238 |
+
__nv_tex_surf_handler("__isurf2Dwrite_v2", &val, obj, x, y, mode);
|
| 239 |
+
#endif /* __CUDA_ARCH__ */
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
template <typename T>
|
| 243 |
+
static __device__ typename __nv_isurf_trait<T>::type surf3Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 244 |
+
{
|
| 245 |
+
#ifdef __CUDA_ARCH__
|
| 246 |
+
__nv_tex_surf_handler("__isurf3Dwrite_v2", &val, obj, x, y, z, mode);
|
| 247 |
+
#endif /* __CUDA_ARCH__ */
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
template <typename T>
|
| 251 |
+
static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 252 |
+
{
|
| 253 |
+
#ifdef __CUDA_ARCH__
|
| 254 |
+
__nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, obj, x, layer, mode);
|
| 255 |
+
#endif /* __CUDA_ARCH__ */
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
template <typename T>
|
| 259 |
+
static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 260 |
+
{
|
| 261 |
+
#ifdef __CUDA_ARCH__
|
| 262 |
+
__nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, obj, x, y, layer, mode);
|
| 263 |
+
#endif /* __CUDA_ARCH__ */
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
template <typename T>
|
| 267 |
+
static __device__ typename __nv_isurf_trait<T>::type surfCubemapwrite(T val, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 268 |
+
{
|
| 269 |
+
#ifdef __CUDA_ARCH__
|
| 270 |
+
__nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, obj, x, y, face, mode);
|
| 271 |
+
#endif /* __CUDA_ARCH__ */
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
template <typename T>
|
| 275 |
+
static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
|
| 276 |
+
{
|
| 277 |
+
#ifdef __CUDA_ARCH__
|
| 278 |
+
__nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, obj, x, y, layerface, mode);
|
| 279 |
+
#endif /* __CUDA_ARCH__ */
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
#endif // __cplusplus && __CUDACC__
|
| 283 |
+
|
| 284 |
+
#endif // __SURFACE_INDIRECT_FUNCTIONS_H__
|
| 285 |
+
|
| 286 |
+
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_types.h
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
#if !defined(__SURFACE_TYPES_H__)
|
| 51 |
+
#define __SURFACE_TYPES_H__
|
| 52 |
+
|
| 53 |
+
/*******************************************************************************
|
| 54 |
+
* *
|
| 55 |
+
* *
|
| 56 |
+
* *
|
| 57 |
+
*******************************************************************************/
|
| 58 |
+
|
| 59 |
+
#include "driver_types.h"
|
| 60 |
+
|
| 61 |
+
/**
|
| 62 |
+
* \addtogroup CUDART_TYPES
|
| 63 |
+
*
|
| 64 |
+
* @{
|
| 65 |
+
*/
|
| 66 |
+
|
| 67 |
+
/*******************************************************************************
|
| 68 |
+
* *
|
| 69 |
+
* *
|
| 70 |
+
* *
|
| 71 |
+
*******************************************************************************/
|
| 72 |
+
|
| 73 |
+
#define cudaSurfaceType1D 0x01
|
| 74 |
+
#define cudaSurfaceType2D 0x02
|
| 75 |
+
#define cudaSurfaceType3D 0x03
|
| 76 |
+
#define cudaSurfaceTypeCubemap 0x0C
|
| 77 |
+
#define cudaSurfaceType1DLayered 0xF1
|
| 78 |
+
#define cudaSurfaceType2DLayered 0xF2
|
| 79 |
+
#define cudaSurfaceTypeCubemapLayered 0xFC
|
| 80 |
+
|
| 81 |
+
/**
|
| 82 |
+
* CUDA Surface boundary modes
|
| 83 |
+
*/
|
| 84 |
+
enum __device_builtin__ cudaSurfaceBoundaryMode
|
| 85 |
+
{
|
| 86 |
+
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
|
| 87 |
+
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
|
| 88 |
+
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
|
| 89 |
+
};
|
| 90 |
+
|
| 91 |
+
/**
|
| 92 |
+
* CUDA Surface format modes
|
| 93 |
+
*/
|
| 94 |
+
enum __device_builtin__ cudaSurfaceFormatMode
|
| 95 |
+
{
|
| 96 |
+
cudaFormatModeForced = 0, /**< Forced format mode */
|
| 97 |
+
cudaFormatModeAuto = 1 /**< Auto format mode */
|
| 98 |
+
};
|
| 99 |
+
|
| 100 |
+
/**
|
| 101 |
+
* CUDA Surface reference
|
| 102 |
+
*/
|
| 103 |
+
struct __device_builtin__ surfaceReference
|
| 104 |
+
{
|
| 105 |
+
/**
|
| 106 |
+
* Channel descriptor for surface reference
|
| 107 |
+
*/
|
| 108 |
+
struct cudaChannelFormatDesc channelDesc;
|
| 109 |
+
};
|
| 110 |
+
|
| 111 |
+
/**
|
| 112 |
+
* An opaque value that represents a CUDA Surface object
|
| 113 |
+
*/
|
| 114 |
+
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
|
| 115 |
+
|
| 116 |
+
/** @} */
|
| 117 |
+
/** @} */ /* END CUDART_TYPES */
|
| 118 |
+
|
| 119 |
+
#endif /* !__SURFACE_TYPES_H__ */
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_indirect_functions.h
ADDED
|
@@ -0,0 +1,771 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
|
| 3 |
+
*
|
| 4 |
+
* NOTICE TO LICENSEE:
|
| 5 |
+
*
|
| 6 |
+
* This source code and/or documentation ("Licensed Deliverables") are
|
| 7 |
+
* subject to NVIDIA intellectual property rights under U.S. and
|
| 8 |
+
* international Copyright laws.
|
| 9 |
+
*
|
| 10 |
+
* These Licensed Deliverables contained herein is PROPRIETARY and
|
| 11 |
+
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
| 12 |
+
* conditions of a form of NVIDIA software license agreement by and
|
| 13 |
+
* between NVIDIA and Licensee ("License Agreement") or electronically
|
| 14 |
+
* accepted by Licensee. Notwithstanding any terms or conditions to
|
| 15 |
+
* the contrary in the License Agreement, reproduction or disclosure
|
| 16 |
+
* of the Licensed Deliverables to any third party without the express
|
| 17 |
+
* written consent of NVIDIA is prohibited.
|
| 18 |
+
*
|
| 19 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 20 |
+
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
| 21 |
+
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
| 22 |
+
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
| 23 |
+
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
| 24 |
+
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
| 25 |
+
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
| 26 |
+
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
| 27 |
+
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
| 28 |
+
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
| 29 |
+
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
| 30 |
+
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
| 31 |
+
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
| 32 |
+
* OF THESE LICENSED DELIVERABLES.
|
| 33 |
+
*
|
| 34 |
+
* U.S. Government End Users. These Licensed Deliverables are a
|
| 35 |
+
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
| 36 |
+
* 1995), consisting of "commercial computer software" and "commercial
|
| 37 |
+
* computer software documentation" as such terms are used in 48
|
| 38 |
+
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
| 39 |
+
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
| 40 |
+
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
| 41 |
+
* U.S. Government End Users acquire the Licensed Deliverables with
|
| 42 |
+
* only those rights set forth herein.
|
| 43 |
+
*
|
| 44 |
+
* Any use of the Licensed Deliverables in individual and commercial
|
| 45 |
+
* software must include, in the user documentation and internal
|
| 46 |
+
* comments to the code, the above Disclaimer and U.S. Government End
|
| 47 |
+
* Users Notice.
|
| 48 |
+
*/
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
#ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__
|
| 52 |
+
#define __TEXTURE_INDIRECT_FUNCTIONS_H__
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
#if defined(__cplusplus) && defined(__CUDACC__)
|
| 56 |
+
|
| 57 |
+
#include "cuda_runtime_api.h"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
|
| 61 |
+
#define __NV_TEX_SPARSE 1
|
| 62 |
+
#endif /* endif */
|
| 63 |
+
|
| 64 |
+
template <typename T> struct __nv_itex_trait { };
|
| 65 |
+
template<> struct __nv_itex_trait<char> { typedef void type; };
|
| 66 |
+
template<> struct __nv_itex_trait<signed char> { typedef void type; };
|
| 67 |
+
template<> struct __nv_itex_trait<char1> { typedef void type; };
|
| 68 |
+
template<> struct __nv_itex_trait<char2> { typedef void type; };
|
| 69 |
+
template<> struct __nv_itex_trait<char4> { typedef void type; };
|
| 70 |
+
template<> struct __nv_itex_trait<unsigned char> { typedef void type; };
|
| 71 |
+
template<> struct __nv_itex_trait<uchar1> { typedef void type; };
|
| 72 |
+
template<> struct __nv_itex_trait<uchar2> { typedef void type; };
|
| 73 |
+
template<> struct __nv_itex_trait<uchar4> { typedef void type; };
|
| 74 |
+
template<> struct __nv_itex_trait<short> { typedef void type; };
|
| 75 |
+
template<> struct __nv_itex_trait<short1> { typedef void type; };
|
| 76 |
+
template<> struct __nv_itex_trait<short2> { typedef void type; };
|
| 77 |
+
template<> struct __nv_itex_trait<short4> { typedef void type; };
|
| 78 |
+
template<> struct __nv_itex_trait<unsigned short> { typedef void type; };
|
| 79 |
+
template<> struct __nv_itex_trait<ushort1> { typedef void type; };
|
| 80 |
+
template<> struct __nv_itex_trait<ushort2> { typedef void type; };
|
| 81 |
+
template<> struct __nv_itex_trait<ushort4> { typedef void type; };
|
| 82 |
+
template<> struct __nv_itex_trait<int> { typedef void type; };
|
| 83 |
+
template<> struct __nv_itex_trait<int1> { typedef void type; };
|
| 84 |
+
template<> struct __nv_itex_trait<int2> { typedef void type; };
|
| 85 |
+
template<> struct __nv_itex_trait<int4> { typedef void type; };
|
| 86 |
+
template<> struct __nv_itex_trait<unsigned int> { typedef void type; };
|
| 87 |
+
template<> struct __nv_itex_trait<uint1> { typedef void type; };
|
| 88 |
+
template<> struct __nv_itex_trait<uint2> { typedef void type; };
|
| 89 |
+
template<> struct __nv_itex_trait<uint4> { typedef void type; };
|
| 90 |
+
#if !defined(__LP64__)
|
| 91 |
+
template<> struct __nv_itex_trait<long> { typedef void type; };
|
| 92 |
+
template<> struct __nv_itex_trait<long1> { typedef void type; };
|
| 93 |
+
template<> struct __nv_itex_trait<long2> { typedef void type; };
|
| 94 |
+
template<> struct __nv_itex_trait<long4> { typedef void type; };
|
| 95 |
+
template<> struct __nv_itex_trait<unsigned long> { typedef void type; };
|
| 96 |
+
template<> struct __nv_itex_trait<ulong1> { typedef void type; };
|
| 97 |
+
template<> struct __nv_itex_trait<ulong2> { typedef void type; };
|
| 98 |
+
template<> struct __nv_itex_trait<ulong4> { typedef void type; };
|
| 99 |
+
#endif /* !__LP64__ */
|
| 100 |
+
template<> struct __nv_itex_trait<float> { typedef void type; };
|
| 101 |
+
template<> struct __nv_itex_trait<float1> { typedef void type; };
|
| 102 |
+
template<> struct __nv_itex_trait<float2> { typedef void type; };
|
| 103 |
+
template<> struct __nv_itex_trait<float4> { typedef void type; };
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
template <typename T>
|
| 108 |
+
static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x)
|
| 109 |
+
{
|
| 110 |
+
#ifdef __CUDA_ARCH__
|
| 111 |
+
__nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x);
|
| 112 |
+
#endif
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
template <class T>
|
| 116 |
+
static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x)
|
| 117 |
+
{
|
| 118 |
+
#ifdef __CUDA_ARCH__
|
| 119 |
+
T ret;
|
| 120 |
+
tex1Dfetch(&ret, texObject, x);
|
| 121 |
+
return ret;
|
| 122 |
+
#endif
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
template <typename T>
|
| 126 |
+
static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x)
|
| 127 |
+
{
|
| 128 |
+
#ifdef __CUDA_ARCH__
|
| 129 |
+
__nv_tex_surf_handler("__itex1D", ptr, obj, x);
|
| 130 |
+
#endif
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
template <class T>
|
| 135 |
+
static __device__ T tex1D(cudaTextureObject_t texObject, float x)
|
| 136 |
+
{
|
| 137 |
+
#ifdef __CUDA_ARCH__
|
| 138 |
+
T ret;
|
| 139 |
+
tex1D(&ret, texObject, x);
|
| 140 |
+
return ret;
|
| 141 |
+
#endif
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
template <typename T>
|
| 146 |
+
static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y)
|
| 147 |
+
{
|
| 148 |
+
#ifdef __CUDA_ARCH__
|
| 149 |
+
__nv_tex_surf_handler("__itex2D", ptr, obj, x, y);
|
| 150 |
+
#endif
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
template <class T>
|
| 154 |
+
static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
|
| 155 |
+
{
|
| 156 |
+
#ifdef __CUDA_ARCH__
|
| 157 |
+
T ret;
|
| 158 |
+
tex2D(&ret, texObject, x, y);
|
| 159 |
+
return ret;
|
| 160 |
+
#endif
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
#if __NV_TEX_SPARSE
|
| 164 |
+
template <typename T>
|
| 165 |
+
static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y,
|
| 166 |
+
bool* isResident)
|
| 167 |
+
{
|
| 168 |
+
#ifdef __CUDA_ARCH__
|
| 169 |
+
unsigned char res;
|
| 170 |
+
__nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &res);
|
| 171 |
+
*isResident = (res != 0);
|
| 172 |
+
#endif
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
template <class T>
|
| 176 |
+
static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y, bool* isResident)
|
| 177 |
+
{
|
| 178 |
+
#ifdef __CUDA_ARCH__
|
| 179 |
+
T ret;
|
| 180 |
+
tex2D(&ret, texObject, x, y, isResident);
|
| 181 |
+
return ret;
|
| 182 |
+
#endif
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
#endif /* __NV_TEX_SPARSE */
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
template <typename T>
|
| 189 |
+
static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
|
| 190 |
+
{
|
| 191 |
+
#ifdef __CUDA_ARCH__
|
| 192 |
+
__nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z);
|
| 193 |
+
#endif
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
template <class T>
|
| 197 |
+
static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z)
|
| 198 |
+
{
|
| 199 |
+
#ifdef __CUDA_ARCH__
|
| 200 |
+
T ret;
|
| 201 |
+
tex3D(&ret, texObject, x, y, z);
|
| 202 |
+
return ret;
|
| 203 |
+
#endif
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
#if __NV_TEX_SPARSE
|
| 207 |
+
template <typename T>
|
| 208 |
+
static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z,
|
| 209 |
+
bool* isResident)
|
| 210 |
+
{
|
| 211 |
+
#ifdef __CUDA_ARCH__
|
| 212 |
+
unsigned char res;
|
| 213 |
+
__nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &res);
|
| 214 |
+
*isResident = (res != 0);
|
| 215 |
+
#endif
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
template <class T>
|
| 219 |
+
static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool* isResident)
|
| 220 |
+
{
|
| 221 |
+
#ifdef __CUDA_ARCH__
|
| 222 |
+
T ret;
|
| 223 |
+
tex3D(&ret, texObject, x, y, z, isResident);
|
| 224 |
+
return ret;
|
| 225 |
+
#endif
|
| 226 |
+
}
|
| 227 |
+
#endif /* __NV_TEX_SPARSE */
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
template <typename T>
|
| 231 |
+
static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer)
|
| 232 |
+
{
|
| 233 |
+
#ifdef __CUDA_ARCH__
|
| 234 |
+
__nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer);
|
| 235 |
+
#endif
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
template <class T>
|
| 239 |
+
static __device__ T tex1DLayered(cudaTextureObject_t texObject, float x, int layer)
|
| 240 |
+
{
|
| 241 |
+
#ifdef __CUDA_ARCH__
|
| 242 |
+
T ret;
|
| 243 |
+
tex1DLayered(&ret, texObject, x, layer);
|
| 244 |
+
return ret;
|
| 245 |
+
#endif
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
template <typename T>
|
| 249 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer)
|
| 250 |
+
{
|
| 251 |
+
#ifdef __CUDA_ARCH__
|
| 252 |
+
__nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer);
|
| 253 |
+
#endif
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
template <class T>
|
| 257 |
+
static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer)
|
| 258 |
+
{
|
| 259 |
+
#ifdef __CUDA_ARCH__
|
| 260 |
+
T ret;
|
| 261 |
+
tex2DLayered(&ret, texObject, x, y, layer);
|
| 262 |
+
return ret;
|
| 263 |
+
#endif
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
#if __NV_TEX_SPARSE
|
| 267 |
+
template <typename T>
|
| 268 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool* isResident)
|
| 269 |
+
{
|
| 270 |
+
#ifdef __CUDA_ARCH__
|
| 271 |
+
unsigned char res;
|
| 272 |
+
__nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &res);
|
| 273 |
+
*isResident = (res != 0);
|
| 274 |
+
#endif
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
template <class T>
|
| 278 |
+
static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool* isResident)
|
| 279 |
+
{
|
| 280 |
+
#ifdef __CUDA_ARCH__
|
| 281 |
+
T ret;
|
| 282 |
+
tex2DLayered(&ret, texObject, x, y, layer, isResident);
|
| 283 |
+
return ret;
|
| 284 |
+
#endif
|
| 285 |
+
}
|
| 286 |
+
#endif /* __NV_TEX_SPARSE */
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
template <typename T>
|
| 290 |
+
static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
|
| 291 |
+
{
|
| 292 |
+
#ifdef __CUDA_ARCH__
|
| 293 |
+
__nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z);
|
| 294 |
+
#endif
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
template <class T>
|
| 299 |
+
static __device__ T texCubemap(cudaTextureObject_t texObject, float x, float y, float z)
|
| 300 |
+
{
|
| 301 |
+
#ifdef __CUDA_ARCH__
|
| 302 |
+
T ret;
|
| 303 |
+
texCubemap(&ret, texObject, x, y, z);
|
| 304 |
+
return ret;
|
| 305 |
+
#endif
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
template <typename T>
|
| 310 |
+
static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer)
|
| 311 |
+
{
|
| 312 |
+
#ifdef __CUDA_ARCH__
|
| 313 |
+
__nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer);
|
| 314 |
+
#endif
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
template <class T>
|
| 318 |
+
static __device__ T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer)
|
| 319 |
+
{
|
| 320 |
+
#ifdef __CUDA_ARCH__
|
| 321 |
+
T ret;
|
| 322 |
+
texCubemapLayered(&ret, texObject, x, y, z, layer);
|
| 323 |
+
return ret;
|
| 324 |
+
#endif
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
template <typename T>
|
| 328 |
+
static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0)
|
| 329 |
+
{
|
| 330 |
+
#ifdef __CUDA_ARCH__
|
| 331 |
+
__nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp);
|
| 332 |
+
#endif
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
template <class T>
|
| 336 |
+
static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0)
|
| 337 |
+
{
|
| 338 |
+
#ifdef __CUDA_ARCH__
|
| 339 |
+
T ret;
|
| 340 |
+
tex2Dgather(&ret, to, x, y, comp);
|
| 341 |
+
return ret;
|
| 342 |
+
#endif
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
#if __NV_TEX_SPARSE
|
| 346 |
+
template <typename T>
|
| 347 |
+
static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool* isResident, int comp = 0)
|
| 348 |
+
{
|
| 349 |
+
#ifdef __CUDA_ARCH__
|
| 350 |
+
unsigned char res;
|
| 351 |
+
__nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp, &res);
|
| 352 |
+
*isResident = (res != 0);
|
| 353 |
+
#endif
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
template <class T>
|
| 357 |
+
static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, bool* isResident, int comp = 0)
|
| 358 |
+
{
|
| 359 |
+
#ifdef __CUDA_ARCH__
|
| 360 |
+
T ret;
|
| 361 |
+
tex2Dgather(&ret, to, x, y, isResident, comp);
|
| 362 |
+
return ret;
|
| 363 |
+
#endif
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
#endif /* __NV_TEX_SPARSE */
|
| 367 |
+
|
| 368 |
+
template <typename T>
|
| 369 |
+
static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level)
|
| 370 |
+
{
|
| 371 |
+
#ifdef __CUDA_ARCH__
|
| 372 |
+
__nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level);
|
| 373 |
+
#endif
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
template <class T>
|
| 377 |
+
static __device__ T tex1DLod(cudaTextureObject_t texObject, float x, float level)
|
| 378 |
+
{
|
| 379 |
+
#ifdef __CUDA_ARCH__
|
| 380 |
+
T ret;
|
| 381 |
+
tex1DLod(&ret, texObject, x, level);
|
| 382 |
+
return ret;
|
| 383 |
+
#endif
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
template <typename T>
|
| 388 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level)
|
| 389 |
+
{
|
| 390 |
+
#ifdef __CUDA_ARCH__
|
| 391 |
+
__nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level);
|
| 392 |
+
#endif
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
template <class T>
|
| 396 |
+
static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level)
|
| 397 |
+
{
|
| 398 |
+
#ifdef __CUDA_ARCH__
|
| 399 |
+
T ret;
|
| 400 |
+
tex2DLod(&ret, texObject, x, y, level);
|
| 401 |
+
return ret;
|
| 402 |
+
#endif
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
#if __NV_TEX_SPARSE
|
| 406 |
+
|
| 407 |
+
template <typename T>
|
| 408 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool* isResident)
|
| 409 |
+
{
|
| 410 |
+
#ifdef __CUDA_ARCH__
|
| 411 |
+
unsigned char res;
|
| 412 |
+
__nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &res);
|
| 413 |
+
*isResident = (res != 0);
|
| 414 |
+
#endif
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
template <class T>
|
| 418 |
+
static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool* isResident)
|
| 419 |
+
{
|
| 420 |
+
#ifdef __CUDA_ARCH__
|
| 421 |
+
T ret;
|
| 422 |
+
tex2DLod(&ret, texObject, x, y, level, isResident);
|
| 423 |
+
return ret;
|
| 424 |
+
#endif
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
#endif /* __NV_TEX_SPARSE */
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
template <typename T>
|
| 431 |
+
static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
|
| 432 |
+
{
|
| 433 |
+
#ifdef __CUDA_ARCH__
|
| 434 |
+
__nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level);
|
| 435 |
+
#endif
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
template <class T>
|
| 439 |
+
static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
|
| 440 |
+
{
|
| 441 |
+
#ifdef __CUDA_ARCH__
|
| 442 |
+
T ret;
|
| 443 |
+
tex3DLod(&ret, texObject, x, y, z, level);
|
| 444 |
+
return ret;
|
| 445 |
+
#endif
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
#if __NV_TEX_SPARSE
|
| 449 |
+
template <typename T>
|
| 450 |
+
static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool* isResident)
|
| 451 |
+
{
|
| 452 |
+
#ifdef __CUDA_ARCH__
|
| 453 |
+
unsigned char res;
|
| 454 |
+
__nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &res);
|
| 455 |
+
*isResident = (res != 0);
|
| 456 |
+
#endif
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
template <class T>
|
| 460 |
+
static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool* isResident)
|
| 461 |
+
{
|
| 462 |
+
#ifdef __CUDA_ARCH__
|
| 463 |
+
T ret;
|
| 464 |
+
tex3DLod(&ret, texObject, x, y, z, level, isResident);
|
| 465 |
+
return ret;
|
| 466 |
+
#endif
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
#endif /* __NV_TEX_SPARSE */
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
template <typename T>
|
| 473 |
+
static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level)
|
| 474 |
+
{
|
| 475 |
+
#ifdef __CUDA_ARCH__
|
| 476 |
+
__nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level);
|
| 477 |
+
#endif
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
template <class T>
|
| 481 |
+
static __device__ T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level)
|
| 482 |
+
{
|
| 483 |
+
#ifdef __CUDA_ARCH__
|
| 484 |
+
T ret;
|
| 485 |
+
tex1DLayeredLod(&ret, texObject, x, layer, level);
|
| 486 |
+
return ret;
|
| 487 |
+
#endif
|
| 488 |
+
}
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
template <typename T>
|
| 492 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level)
|
| 493 |
+
{
|
| 494 |
+
#ifdef __CUDA_ARCH__
|
| 495 |
+
__nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level);
|
| 496 |
+
#endif
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
template <class T>
|
| 500 |
+
static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level)
|
| 501 |
+
{
|
| 502 |
+
#ifdef __CUDA_ARCH__
|
| 503 |
+
T ret;
|
| 504 |
+
tex2DLayeredLod(&ret, texObject, x, y, layer, level);
|
| 505 |
+
return ret;
|
| 506 |
+
#endif
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
#if __NV_TEX_SPARSE
|
| 510 |
+
template <typename T>
|
| 511 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool* isResident)
|
| 512 |
+
{
|
| 513 |
+
#ifdef __CUDA_ARCH__
|
| 514 |
+
unsigned char res;
|
| 515 |
+
__nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &res);
|
| 516 |
+
*isResident = (res != 0);
|
| 517 |
+
#endif
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
+
template <class T>
|
| 521 |
+
static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool* isResident)
|
| 522 |
+
{
|
| 523 |
+
#ifdef __CUDA_ARCH__
|
| 524 |
+
T ret;
|
| 525 |
+
tex2DLayeredLod(&ret, texObject, x, y, layer, level, isResident);
|
| 526 |
+
return ret;
|
| 527 |
+
#endif
|
| 528 |
+
}
|
| 529 |
+
#endif /* __NV_TEX_SPARSE */
|
| 530 |
+
|
| 531 |
+
template <typename T>
|
| 532 |
+
static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
|
| 533 |
+
{
|
| 534 |
+
#ifdef __CUDA_ARCH__
|
| 535 |
+
__nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level);
|
| 536 |
+
#endif
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
template <class T>
|
| 540 |
+
static __device__ T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
|
| 541 |
+
{
|
| 542 |
+
#ifdef __CUDA_ARCH__
|
| 543 |
+
T ret;
|
| 544 |
+
texCubemapLod(&ret, texObject, x, y, z, level);
|
| 545 |
+
return ret;
|
| 546 |
+
#endif
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
template <typename T>
|
| 551 |
+
static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 552 |
+
{
|
| 553 |
+
#ifdef __CUDA_ARCH__
|
| 554 |
+
__nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
|
| 555 |
+
#endif
|
| 556 |
+
}
|
| 557 |
+
|
| 558 |
+
template <class T>
|
| 559 |
+
static __device__ T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 560 |
+
{
|
| 561 |
+
#ifdef __CUDA_ARCH__
|
| 562 |
+
T ret;
|
| 563 |
+
texCubemapGrad(&ret, texObject, x, y, z, dPdx, dPdy);
|
| 564 |
+
return ret;
|
| 565 |
+
#endif
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
template <typename T>
|
| 569 |
+
static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level)
|
| 570 |
+
{
|
| 571 |
+
#ifdef __CUDA_ARCH__
|
| 572 |
+
__nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level);
|
| 573 |
+
#endif
|
| 574 |
+
}
|
| 575 |
+
|
| 576 |
+
template <class T>
|
| 577 |
+
static __device__ T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level)
|
| 578 |
+
{
|
| 579 |
+
#ifdef __CUDA_ARCH__
|
| 580 |
+
T ret;
|
| 581 |
+
texCubemapLayeredLod(&ret, texObject, x, y, z, layer, level);
|
| 582 |
+
return ret;
|
| 583 |
+
#endif
|
| 584 |
+
}
|
| 585 |
+
|
| 586 |
+
template <typename T>
|
| 587 |
+
static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy)
|
| 588 |
+
{
|
| 589 |
+
#ifdef __CUDA_ARCH__
|
| 590 |
+
__nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy);
|
| 591 |
+
#endif
|
| 592 |
+
}
|
| 593 |
+
|
| 594 |
+
template <class T>
|
| 595 |
+
static __device__ T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy)
|
| 596 |
+
{
|
| 597 |
+
#ifdef __CUDA_ARCH__
|
| 598 |
+
T ret;
|
| 599 |
+
tex1DGrad(&ret, texObject, x, dPdx, dPdy);
|
| 600 |
+
return ret;
|
| 601 |
+
#endif
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
template <typename T>
|
| 606 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy)
|
| 607 |
+
{
|
| 608 |
+
#ifdef __CUDA_ARCH__
|
| 609 |
+
__nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy);
|
| 610 |
+
#endif
|
| 611 |
+
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
template <class T>
|
| 615 |
+
static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy)
|
| 616 |
+
{
|
| 617 |
+
#ifdef __CUDA_ARCH__
|
| 618 |
+
T ret;
|
| 619 |
+
tex2DGrad(&ret, texObject, x, y, dPdx, dPdy);
|
| 620 |
+
return ret;
|
| 621 |
+
#endif
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
+
#if __NV_TEX_SPARSE
|
| 625 |
+
template <typename T>
|
| 626 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
|
| 627 |
+
{
|
| 628 |
+
#ifdef __CUDA_ARCH__
|
| 629 |
+
unsigned char res;
|
| 630 |
+
__nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &res);
|
| 631 |
+
*isResident = (res != 0);
|
| 632 |
+
#endif
|
| 633 |
+
|
| 634 |
+
}
|
| 635 |
+
|
| 636 |
+
template <class T>
|
| 637 |
+
static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
|
| 638 |
+
{
|
| 639 |
+
#ifdef __CUDA_ARCH__
|
| 640 |
+
T ret;
|
| 641 |
+
tex2DGrad(&ret, texObject, x, y, dPdx, dPdy, isResident);
|
| 642 |
+
return ret;
|
| 643 |
+
#endif
|
| 644 |
+
}
|
| 645 |
+
#endif /* __NV_TEX_SPARSE */
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
template <typename T>
|
| 649 |
+
static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 650 |
+
{
|
| 651 |
+
#ifdef __CUDA_ARCH__
|
| 652 |
+
__nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
|
| 653 |
+
#endif
|
| 654 |
+
}
|
| 655 |
+
|
| 656 |
+
template <class T>
|
| 657 |
+
static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
| 658 |
+
{
|
| 659 |
+
#ifdef __CUDA_ARCH__
|
| 660 |
+
T ret;
|
| 661 |
+
tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy);
|
| 662 |
+
return ret;
|
| 663 |
+
#endif
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
#if __NV_TEX_SPARSE
|
| 667 |
+
template <typename T>
|
| 668 |
+
static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
|
| 669 |
+
{
|
| 670 |
+
#ifdef __CUDA_ARCH__
|
| 671 |
+
unsigned char res;
|
| 672 |
+
__nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &res);
|
| 673 |
+
*isResident = (res != 0);
|
| 674 |
+
#endif
|
| 675 |
+
}
|
| 676 |
+
|
| 677 |
+
template <class T>
|
| 678 |
+
static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
|
| 679 |
+
{
|
| 680 |
+
#ifdef __CUDA_ARCH__
|
| 681 |
+
T ret;
|
| 682 |
+
tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy, isResident);
|
| 683 |
+
return ret;
|
| 684 |
+
#endif
|
| 685 |
+
}
|
| 686 |
+
|
| 687 |
+
#endif /* __NV_TEX_SPARSE */
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
template <typename T>
|
| 691 |
+
static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy)
|
| 692 |
+
{
|
| 693 |
+
#ifdef __CUDA_ARCH__
|
| 694 |
+
__nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy);
|
| 695 |
+
#endif
|
| 696 |
+
}
|
| 697 |
+
|
| 698 |
+
template <class T>
|
| 699 |
+
static __device__ T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy)
|
| 700 |
+
{
|
| 701 |
+
#ifdef __CUDA_ARCH__
|
| 702 |
+
T ret;
|
| 703 |
+
tex1DLayeredGrad(&ret, texObject, x, layer, dPdx, dPdy);
|
| 704 |
+
return ret;
|
| 705 |
+
#endif
|
| 706 |
+
}
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
template <typename T>
|
| 710 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
| 711 |
+
{
|
| 712 |
+
#ifdef __CUDA_ARCH__
|
| 713 |
+
__nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy);
|
| 714 |
+
#endif
|
| 715 |
+
}
|
| 716 |
+
|
| 717 |
+
template <class T>
|
| 718 |
+
static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
| 719 |
+
{
|
| 720 |
+
#ifdef __CUDA_ARCH__
|
| 721 |
+
T ret;
|
| 722 |
+
tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy);
|
| 723 |
+
return ret;
|
| 724 |
+
#endif
|
| 725 |
+
}
|
| 726 |
+
|
| 727 |
+
#if __NV_TEX_SPARSE
|
| 728 |
+
template <typename T>
|
| 729 |
+
static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
|
| 730 |
+
{
|
| 731 |
+
#ifdef __CUDA_ARCH__
|
| 732 |
+
unsigned char res;
|
| 733 |
+
__nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &res);
|
| 734 |
+
*isResident = (res != 0);
|
| 735 |
+
#endif
|
| 736 |
+
}
|
| 737 |
+
|
| 738 |
+
template <class T>
|
| 739 |
+
static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
|
| 740 |
+
{
|
| 741 |
+
#ifdef __CUDA_ARCH__
|
| 742 |
+
T ret;
|
| 743 |
+
tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy, isResident);
|
| 744 |
+
return ret;
|
| 745 |
+
#endif
|
| 746 |
+
}
|
| 747 |
+
#endif /* __NV_TEX_SPARSE */
|
| 748 |
+
|
| 749 |
+
|
| 750 |
+
template <typename T>
|
| 751 |
+
static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
| 752 |
+
{
|
| 753 |
+
#ifdef __CUDA_ARCH__
|
| 754 |
+
__nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy);
|
| 755 |
+
#endif
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
+
template <class T>
|
| 759 |
+
static __device__ T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
| 760 |
+
{
|
| 761 |
+
#ifdef __CUDA_ARCH__
|
| 762 |
+
T ret;
|
| 763 |
+
texCubemapLayeredGrad(&ret, texObject, x, y, z, layer, dPdx, dPdy);
|
| 764 |
+
return ret;
|
| 765 |
+
#endif
|
| 766 |
+
}
|
| 767 |
+
|
| 768 |
+
#undef __NV_TEX_SPARSE
|
| 769 |
+
|
| 770 |
+
#endif // __cplusplus && __CUDACC__
|
| 771 |
+
#endif // __TEXTURE_INDIRECT_FUNCTIONS_H__
|