koichi12 commited on
Commit
a249ee4
·
verified ·
1 Parent(s): eda6db7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc +0 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc +0 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h +116 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h +46 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h +55 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h +570 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp +0 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h +123 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h +154 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h +44 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h +1414 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h +40 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc +0 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h +64 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h +595 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h +1828 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h +108 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h +207 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h +707 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h +323 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h +133 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h +430 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h +324 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h +62 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h +63 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h +0 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h +96 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h +123 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h +959 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h +90 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h +0 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp +2614 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h +360 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h +508 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h +0 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h +198 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h +65 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_constants.h +152 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.h +1551 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.hpp +221 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.h +215 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.hpp +604 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_atomic_functions.h +58 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.hpp +161 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_indirect_functions.h +286 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_types.h +119 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_indirect_functions.h +771 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc ADDED
Binary file (37.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc ADDED
Binary file (17.1 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc ADDED
Binary file (7.78 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc ADDED
Binary file (24.8 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // This file is generated. Any changes you make will be lost during the next clean build.
2
+
3
+ // Dependent includes
4
+ #ifdef __APPLE__
5
+ #include <OpenGL/gl.h>
6
+ #else
7
+ #include <GL/gl.h>
8
+ #endif
9
+
10
+ // CUDA public interface, for type definitions and cu* function prototypes
11
+ #include "cudaGL.h"
12
+
13
+
14
+ // *************************************************************************
15
+ // Definitions of structs to hold parameters for each function
16
+ // *************************************************************************
17
+
18
+ typedef struct cuGraphicsGLRegisterBuffer_params_st {
19
+ CUgraphicsResource *pCudaResource;
20
+ GLuint buffer;
21
+ unsigned int Flags;
22
+ } cuGraphicsGLRegisterBuffer_params;
23
+
24
+ typedef struct cuGraphicsGLRegisterImage_params_st {
25
+ CUgraphicsResource *pCudaResource;
26
+ GLuint image;
27
+ GLenum target;
28
+ unsigned int Flags;
29
+ } cuGraphicsGLRegisterImage_params;
30
+
31
+ typedef struct cuGLGetDevices_v2_params_st {
32
+ unsigned int *pCudaDeviceCount;
33
+ CUdevice *pCudaDevices;
34
+ unsigned int cudaDeviceCount;
35
+ CUGLDeviceList deviceList;
36
+ } cuGLGetDevices_v2_params;
37
+
38
+ typedef struct cuGLCtxCreate_v2_params_st {
39
+ CUcontext *pCtx;
40
+ unsigned int Flags;
41
+ CUdevice device;
42
+ } cuGLCtxCreate_v2_params;
43
+
44
+ typedef struct cuGLRegisterBufferObject_params_st {
45
+ GLuint buffer;
46
+ } cuGLRegisterBufferObject_params;
47
+
48
+ typedef struct cuGLMapBufferObject_v2_ptds_params_st {
49
+ CUdeviceptr *dptr;
50
+ size_t *size;
51
+ GLuint buffer;
52
+ } cuGLMapBufferObject_v2_ptds_params;
53
+
54
+ typedef struct cuGLUnmapBufferObject_params_st {
55
+ GLuint buffer;
56
+ } cuGLUnmapBufferObject_params;
57
+
58
+ typedef struct cuGLUnregisterBufferObject_params_st {
59
+ GLuint buffer;
60
+ } cuGLUnregisterBufferObject_params;
61
+
62
+ typedef struct cuGLSetBufferObjectMapFlags_params_st {
63
+ GLuint buffer;
64
+ unsigned int Flags;
65
+ } cuGLSetBufferObjectMapFlags_params;
66
+
67
+ typedef struct cuGLMapBufferObjectAsync_v2_ptsz_params_st {
68
+ CUdeviceptr *dptr;
69
+ size_t *size;
70
+ GLuint buffer;
71
+ CUstream hStream;
72
+ } cuGLMapBufferObjectAsync_v2_ptsz_params;
73
+
74
+ typedef struct cuGLUnmapBufferObjectAsync_params_st {
75
+ GLuint buffer;
76
+ CUstream hStream;
77
+ } cuGLUnmapBufferObjectAsync_params;
78
+
79
+ typedef struct cuGLGetDevices_params_st {
80
+ unsigned int *pCudaDeviceCount;
81
+ CUdevice *pCudaDevices;
82
+ unsigned int cudaDeviceCount;
83
+ CUGLDeviceList deviceList;
84
+ } cuGLGetDevices_params;
85
+
86
+ typedef struct cuGLMapBufferObject_v2_params_st {
87
+ CUdeviceptr *dptr;
88
+ size_t *size;
89
+ GLuint buffer;
90
+ } cuGLMapBufferObject_v2_params;
91
+
92
+ typedef struct cuGLMapBufferObjectAsync_v2_params_st {
93
+ CUdeviceptr *dptr;
94
+ size_t *size;
95
+ GLuint buffer;
96
+ CUstream hStream;
97
+ } cuGLMapBufferObjectAsync_v2_params;
98
+
99
+ typedef struct cuGLCtxCreate_params_st {
100
+ CUcontext *pCtx;
101
+ unsigned int Flags;
102
+ CUdevice device;
103
+ } cuGLCtxCreate_params;
104
+
105
+ typedef struct cuGLMapBufferObject_params_st {
106
+ CUdeviceptr_v1 *dptr;
107
+ unsigned int *size;
108
+ GLuint buffer;
109
+ } cuGLMapBufferObject_params;
110
+
111
+ typedef struct cuGLMapBufferObjectAsync_params_st {
112
+ CUdeviceptr_v1 *dptr;
113
+ unsigned int *size;
114
+ GLuint buffer;
115
+ CUstream hStream;
116
+ } cuGLMapBufferObjectAsync_params;
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // This file is generated. Any changes you make will be lost during the next clean build.
2
+
3
+ // Dependent includes
4
+ #include <vdpau/vdpau.h>
5
+
6
+ // CUDA public interface, for type definitions and cu* function prototypes
7
+ #include "cudaVDPAU.h"
8
+
9
+
10
+ // *************************************************************************
11
+ // Definitions of structs to hold parameters for each function
12
+ // *************************************************************************
13
+
14
+ typedef struct cuVDPAUGetDevice_params_st {
15
+ CUdevice *pDevice;
16
+ VdpDevice vdpDevice;
17
+ VdpGetProcAddress *vdpGetProcAddress;
18
+ } cuVDPAUGetDevice_params;
19
+
20
+ typedef struct cuVDPAUCtxCreate_v2_params_st {
21
+ CUcontext *pCtx;
22
+ unsigned int flags;
23
+ CUdevice device;
24
+ VdpDevice vdpDevice;
25
+ VdpGetProcAddress *vdpGetProcAddress;
26
+ } cuVDPAUCtxCreate_v2_params;
27
+
28
+ typedef struct cuGraphicsVDPAURegisterVideoSurface_params_st {
29
+ CUgraphicsResource *pCudaResource;
30
+ VdpVideoSurface vdpSurface;
31
+ unsigned int flags;
32
+ } cuGraphicsVDPAURegisterVideoSurface_params;
33
+
34
+ typedef struct cuGraphicsVDPAURegisterOutputSurface_params_st {
35
+ CUgraphicsResource *pCudaResource;
36
+ VdpOutputSurface vdpSurface;
37
+ unsigned int flags;
38
+ } cuGraphicsVDPAURegisterOutputSurface_params;
39
+
40
+ typedef struct cuVDPAUCtxCreate_params_st {
41
+ CUcontext *pCtx;
42
+ unsigned int flags;
43
+ CUdevice device;
44
+ VdpDevice vdpDevice;
45
+ VdpGetProcAddress *vdpGetProcAddress;
46
+ } cuVDPAUCtxCreate_params;
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // This file is generated. Any changes you make will be lost during the next clean build.
2
+
3
+ // CUDA public interface, for type definitions and api function prototypes
4
+ #include "cudart_removed.h"
5
+
6
+ // *************************************************************************
7
+ // Definitions of structs to hold parameters for each function
8
+ // *************************************************************************
9
+
10
+ // Currently used parameter trace structures
11
+ typedef struct cudaStreamDestroy_v3020_params_st {
12
+ cudaStream_t stream;
13
+ } cudaStreamDestroy_v3020_params;
14
+
15
+ typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params_st {
16
+ int *numBlocks;
17
+ const void *func;
18
+ size_t numDynamicSmemBytes;
19
+ } cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params;
20
+
21
+ typedef struct cudaConfigureCall_v3020_params_st {
22
+ dim3 gridDim;
23
+ dim3 blockDim;
24
+ size_t sharedMem __dv;
25
+ cudaStream_t stream __dv;
26
+ } cudaConfigureCall_v3020_params;
27
+
28
+ typedef struct cudaSetupArgument_v3020_params_st {
29
+ const void *arg;
30
+ size_t size;
31
+ size_t offset;
32
+ } cudaSetupArgument_v3020_params;
33
+
34
+ typedef struct cudaLaunch_v3020_params_st {
35
+ const void *func;
36
+ } cudaLaunch_v3020_params;
37
+
38
+ typedef struct cudaLaunch_ptsz_v7000_params_st {
39
+ const void *func;
40
+ } cudaLaunch_ptsz_v7000_params;
41
+
42
+ typedef struct cudaStreamSetFlags_v10200_params_st {
43
+ cudaStream_t hStream;
44
+ unsigned int flags;
45
+ } cudaStreamSetFlags_v10200_params;
46
+
47
+ typedef struct cudaStreamSetFlags_ptsz_v10200_params_st {
48
+ cudaStream_t hStream;
49
+ unsigned int flags;
50
+ } cudaStreamSetFlags_ptsz_v10200_params;
51
+
52
+ // Parameter trace structures for removed functions
53
+
54
+
55
+ // End of parameter trace structures
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef NVPERF_TARGET_H
2
+ #define NVPERF_TARGET_H
3
+
4
+ /*
5
+ * Copyright 2014-2022 NVIDIA Corporation. All rights reserved.
6
+ *
7
+ * NOTICE TO USER:
8
+ *
9
+ * This source code is subject to NVIDIA ownership rights under U.S. and
10
+ * international Copyright laws.
11
+ *
12
+ * This software and the information contained herein is PROPRIETARY and
13
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
14
+ * of a form of NVIDIA software license agreement.
15
+ *
16
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
17
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
18
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
19
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
20
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
21
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
22
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25
+ * OR PERFORMANCE OF THIS SOURCE CODE.
26
+ *
27
+ * U.S. Government End Users. This source code is a "commercial item" as
28
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
29
+ * "commercial computer software" and "commercial computer software
30
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
31
+ * and is provided to the U.S. Government only as a commercial end item.
32
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
33
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
34
+ * source code with only those rights set forth herein.
35
+ *
36
+ * Any use of this source code in individual and commercial software must
37
+ * include, in the user documentation and internal comments to the code,
38
+ * the above Disclaimer and U.S. Government End Users Notice.
39
+ */
40
+
41
+ #include <stddef.h>
42
+ #include <stdint.h>
43
+ #include "nvperf_common.h"
44
+
45
+ #if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
46
+ #pragma GCC visibility push(default)
47
+ #if !defined(NVPW_LOCAL)
48
+ #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
49
+ #endif
50
+ #else
51
+ #if !defined(NVPW_LOCAL)
52
+ #define NVPW_LOCAL
53
+ #endif
54
+ #endif
55
+
56
+ #ifdef __cplusplus
57
+ extern "C" {
58
+ #endif
59
+
60
+ /**
61
+ * @file nvperf_target.h
62
+ */
63
+
64
+ #ifndef NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
65
+ #define NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
66
+ /// GPU architecture support level
67
+ typedef enum NVPW_GpuArchitectureSupportLevel
68
+ {
69
+ NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNKNOWN = 0,
70
+ NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNSUPPORTED,
71
+ NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_SUPPORTED
72
+ } NVPW_GpuArchitectureSupportLevel;
73
+ #endif //NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
74
+
75
+ #ifndef NVPW_SLI_SUPPORT_LEVEL_DEFINED
76
+ #define NVPW_SLI_SUPPORT_LEVEL_DEFINED
77
+ /// SLI configuration support level
78
+ typedef enum NVPW_SliSupportLevel
79
+ {
80
+ NVPW_SLI_SUPPORT_LEVEL_UNKNOWN = 0,
81
+ NVPW_SLI_SUPPORT_LEVEL_UNSUPPORTED,
82
+ /// Only Non-SLI configurations are supported.
83
+ NVPW_SLI_SUPPORT_LEVEL_SUPPORTED_NON_SLI_CONFIGURATION
84
+ } NVPW_SliSupportLevel;
85
+ #endif //NVPW_SLI_SUPPORT_LEVEL_DEFINED
86
+
87
+ #ifndef NVPW_VGPU_SUPPORT_LEVEL_DEFINED
88
+ #define NVPW_VGPU_SUPPORT_LEVEL_DEFINED
89
+ /// Virtualized GPU configuration support level
90
+ typedef enum NVPW_VGpuSupportLevel
91
+ {
92
+ NVPW_VGPU_SUPPORT_LEVEL_UNKNOWN = 0,
93
+ NVPW_VGPU_SUPPORT_LEVEL_UNSUPPORTED,
94
+ /// Supported but not allowed by system admin.
95
+ NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_DISALLOWED,
96
+ NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_ALLOWED,
97
+ NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_NON_VGPU_CONFIGURATION
98
+ } NVPW_VGpuSupportLevel;
99
+ #endif //NVPW_VGPU_SUPPORT_LEVEL_DEFINED
100
+
101
+ #ifndef NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
102
+ #define NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
103
+ /// Confidential Compute mode support level
104
+ typedef enum NVPW_ConfidentialComputeSupportLevel
105
+ {
106
+ NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNKNOWN = 0,
107
+ NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNSUPPORTED,
108
+ NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_NON_CONF_COMPUTE_CONFIGURATION
109
+ } NVPW_ConfidentialComputeSupportLevel;
110
+ #endif //NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
111
+
112
+ #ifndef NVPW_CMP_SUPPORT_LEVEL_DEFINED
113
+ #define NVPW_CMP_SUPPORT_LEVEL_DEFINED
114
+ /// CMP support level
115
+ typedef enum NVPW_CmpSupportLevel
116
+ {
117
+ NVPW_CMP_SUPPORT_LEVEL_UNKNOWN = 0,
118
+ NVPW_CMP_SUPPORT_LEVEL_UNSUPPORTED,
119
+ NVPW_CMP_SUPPORT_LEVEL_SUPPORTED_NON_CMP_CONFIGURATON
120
+ } NVPW_CmpSupportLevel;
121
+ #endif //NVPW_CMP_SUPPORT_LEVEL_DEFINED
122
+
123
+ #ifndef NVPW_WSL_SUPPORT_LEVEL_DEFINED
124
+ #define NVPW_WSL_SUPPORT_LEVEL_DEFINED
125
+ /// WSL support level
126
+ typedef enum NVPW_WslSupportLevel
127
+ {
128
+ NVPW_WSL_SUPPORT_LEVEL_UNKNOWN = 0,
129
+ NVPW_WSL_SUPPORT_LEVEL_UNSUPPORTED_INSUFFICIENT_DRIVER_VERSION,
130
+ NVPW_WSL_SUPPORT_LEVEL_SUPPORTED,
131
+ NVPW_WSL_SUPPORT_LEVEL_SUPPORTED_NON_WSL_CONFIGURATION
132
+ } NVPW_WslSupportLevel;
133
+ #endif //NVPW_WSL_SUPPORT_LEVEL_DEFINED
134
+
135
+ typedef struct NVPW_InitializeTarget_Params
136
+ {
137
+ /// [in]
138
+ size_t structSize;
139
+ /// [in] assign to NULL
140
+ void* pPriv;
141
+ } NVPW_InitializeTarget_Params;
142
+ #define NVPW_InitializeTarget_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeTarget_Params, pPriv)
143
+
144
+ /// Load the target library.
145
+ NVPA_Status NVPW_InitializeTarget(NVPW_InitializeTarget_Params* pParams);
146
+
147
+ typedef struct NVPW_GetDeviceCount_Params
148
+ {
149
+ /// [in]
150
+ size_t structSize;
151
+ /// [in] assign to NULL
152
+ void* pPriv;
153
+ size_t numDevices;
154
+ } NVPW_GetDeviceCount_Params;
155
+ #define NVPW_GetDeviceCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetDeviceCount_Params, numDevices)
156
+
157
+ NVPA_Status NVPW_GetDeviceCount(NVPW_GetDeviceCount_Params* pParams);
158
+
159
+ typedef struct NVPW_Device_GetNames_Params
160
+ {
161
+ /// [in]
162
+ size_t structSize;
163
+ /// [in] assign to NULL
164
+ void* pPriv;
165
+ size_t deviceIndex;
166
+ const char* pDeviceName;
167
+ const char* pChipName;
168
+ } NVPW_Device_GetNames_Params;
169
+ #define NVPW_Device_GetNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetNames_Params, pChipName)
170
+
171
+ NVPA_Status NVPW_Device_GetNames(NVPW_Device_GetNames_Params* pParams);
172
+
173
+ typedef struct NVPW_PciBusId
174
+ {
175
+ /// The PCI domain on which the device bus resides.
176
+ uint32_t domain;
177
+ /// The bus on which the device resides.
178
+ uint16_t bus;
179
+ /// device ID.
180
+ uint16_t device;
181
+ } NVPW_PciBusId;
182
+ #define NVPW_PciBusId_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PciBusId, device)
183
+
184
+ typedef struct NVPW_Device_GetPciBusIds_Params
185
+ {
186
+ /// [in]
187
+ size_t structSize;
188
+ /// [in] assign to NULL
189
+ void* pPriv;
190
+ /// [in] caller-allocated array of NVPW_PciBusId, indexed by NVPW deviceIndex
191
+ NVPW_PciBusId* pBusIds;
192
+ /// [in] size of the pBusIDs array; use result from NVPW_GetDeviceCount
193
+ size_t numDevices;
194
+ } NVPW_Device_GetPciBusIds_Params;
195
+ #define NVPW_Device_GetPciBusIds_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetPciBusIds_Params, numDevices)
196
+
197
+ NVPA_Status NVPW_Device_GetPciBusIds(NVPW_Device_GetPciBusIds_Params* pParams);
198
+
199
+
200
+ #define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_INVALID 0xFFFFFFFFu
201
+ #define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_FULLCHIP 0xFFFFFFFEu
202
+
203
+
204
+ typedef struct NVPW_Device_GetMigAttributes_Params
205
+ {
206
+ /// [in]
207
+ size_t structSize;
208
+ /// [in] assign to NULL
209
+ void* pPriv;
210
+ /// [in]
211
+ size_t deviceIndex;
212
+ /// [out]
213
+ NVPA_Bool isMigPartition;
214
+ /// [out]
215
+ uint32_t gpuInstanceId;
216
+ /// [out]
217
+ uint32_t computeInstanceId;
218
+ } NVPW_Device_GetMigAttributes_Params;
219
+ #define NVPW_Device_GetMigAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetMigAttributes_Params, computeInstanceId)
220
+
221
+ NVPA_Status NVPW_Device_GetMigAttributes(NVPW_Device_GetMigAttributes_Params* pParams);
222
+
223
+ typedef struct NVPW_Adapter_GetDeviceIndex_Params
224
+ {
225
+ /// [in]
226
+ size_t structSize;
227
+ /// [in] assign to NULL
228
+ void* pPriv;
229
+ /// [in]
230
+ struct IDXGIAdapter* pAdapter;
231
+ /// [in]
232
+ size_t sliIndex;
233
+ /// [out]
234
+ size_t deviceIndex;
235
+ } NVPW_Adapter_GetDeviceIndex_Params;
236
+ #define NVPW_Adapter_GetDeviceIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Adapter_GetDeviceIndex_Params, deviceIndex)
237
+
238
+ NVPA_Status NVPW_Adapter_GetDeviceIndex(NVPW_Adapter_GetDeviceIndex_Params* pParams);
239
+
240
+ typedef struct NVPW_CounterData_GetNumRanges_Params
241
+ {
242
+ /// [in]
243
+ size_t structSize;
244
+ /// [in] assign to NULL
245
+ void* pPriv;
246
+ const uint8_t* pCounterDataImage;
247
+ size_t numRanges;
248
+ } NVPW_CounterData_GetNumRanges_Params;
249
+ #define NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetNumRanges_Params, numRanges)
250
+
251
+ NVPA_Status NVPW_CounterData_GetNumRanges(NVPW_CounterData_GetNumRanges_Params* pParams);
252
+
253
+ typedef struct NVPW_CounterData_GetChipName_Params
254
+ {
255
+ /// [in]
256
+ size_t structSize;
257
+ /// [in] assign to NULL
258
+ void* pPriv;
259
+ /// [in]
260
+ const uint8_t* pCounterDataImage;
261
+ /// [in]
262
+ size_t counterDataImageSize;
263
+ /// [out]
264
+ const char* pChipName;
265
+ } NVPW_CounterData_GetChipName_Params;
266
+ #define NVPW_CounterData_GetChipName_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetChipName_Params, pChipName)
267
+
268
+ NVPA_Status NVPW_CounterData_GetChipName(NVPW_CounterData_GetChipName_Params* pParams);
269
+
270
+ typedef struct NVPW_Config_GetNumPasses_Params
271
+ {
272
+ /// [in]
273
+ size_t structSize;
274
+ /// [in] assign to NULL
275
+ void* pPriv;
276
+ /// [in]
277
+ const uint8_t* pConfig;
278
+ /// [out]
279
+ size_t numPipelinedPasses;
280
+ /// [out]
281
+ size_t numIsolatedPasses;
282
+ } NVPW_Config_GetNumPasses_Params;
283
+ #define NVPW_Config_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_Params, numIsolatedPasses)
284
+
285
+ /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
286
+ NVPA_Status NVPW_Config_GetNumPasses(NVPW_Config_GetNumPasses_Params* pParams);
287
+
288
+ typedef struct NVPW_Config_GetNumPasses_V2_Params
289
+ {
290
+ /// [in]
291
+ size_t structSize;
292
+ /// [in] assign to NULL
293
+ void* pPriv;
294
+ /// [in]
295
+ const uint8_t* pConfig;
296
+ /// [out]
297
+ size_t numPasses;
298
+ } NVPW_Config_GetNumPasses_V2_Params;
299
+ #define NVPW_Config_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_V2_Params, numPasses)
300
+
301
+ /// Total num passes = numPasses * numNestingLevels
302
+ NVPA_Status NVPW_Config_GetNumPasses_V2(NVPW_Config_GetNumPasses_V2_Params* pParams);
303
+
304
+ #define NVPW_API_SET_CUDA_PROFILER 0x18209d0775b2f89dULL
305
+
306
+ #define NVPW_API_SET_D3D11_PROFILER 0xca55c6738445db2bULL
307
+
308
+ #define NVPW_API_SET_D3D12_PROFILER 0xc0c2d46dd7c7ad78ULL
309
+
310
+ #define NVPW_API_SET_EGL_PROFILER 0x3c3747dae1f9565cULL
311
+
312
+ #define NVPW_API_SET_GPU_PERIODICSAMPLER 0x9f4c2571fc0b2e8aULL
313
+
314
+ #define NVPW_API_SET_METRICSCONTEXT 0x7c8579f6f2144beaULL
315
+
316
+ #define NVPW_API_SET_METRICSEVALUATOR 0x0368a8768d811af9ULL
317
+
318
+ #define NVPW_API_SET_METRICS_GA100_COMP 0x16b7d8c20d8b4915ULL
319
+
320
+ #define NVPW_API_SET_METRICS_GA100_GRFX 0xc94eaabec04a94faULL
321
+
322
+ #define NVPW_API_SET_METRICS_GA10X_COMP 0xb5d6391c2e299ab5ULL
323
+
324
+ #define NVPW_API_SET_METRICS_GA10X_GRFX 0x6ebc121178b5ce0bULL
325
+
326
+ #define NVPW_API_SET_METRICS_GV100_COMP 0x863705cc57919f72ULL
327
+
328
+ #define NVPW_API_SET_METRICS_GV100_GRFX 0x9900da75d164fecfULL
329
+
330
+ #define NVPW_API_SET_METRICS_GV11B_COMP 0xd3f79a859235848fULL
331
+
332
+ #define NVPW_API_SET_METRICS_GV11B_GRFX 0xeb8e26220106e227ULL
333
+
334
+ #define NVPW_API_SET_METRICS_TU10X_COMP 0x70f40be0afd35da8ULL
335
+
336
+ #define NVPW_API_SET_METRICS_TU10X_GRFX 0xdf219cb838db6968ULL
337
+
338
+ #define NVPW_API_SET_METRICS_TU11X_COMP 0xeb0069d7d0956678ULL
339
+
340
+ #define NVPW_API_SET_METRICS_TU11X_GRFX 0x0977d9342bd62743ULL
341
+
342
+ #define NVPW_API_SET_OPENGL_PROFILER 0xe4cd9ea40f2ee777ULL
343
+
344
+ #define NVPW_API_SET_VULKAN_PROFILER 0x8c56b6a03d779689ULL
345
+
346
+ typedef struct NVPW_QueryVersionNumber_Params
347
+ {
348
+ /// [in]
349
+ size_t structSize;
350
+ /// [in] assign to NULL
351
+ void* pPriv;
352
+ /// [in]
353
+ uint64_t apiSet;
354
+ /// [out]
355
+ uint32_t major;
356
+ /// [out]
357
+ uint32_t minor;
358
+ /// [out]
359
+ uint32_t patch;
360
+ /// [out]
361
+ uint32_t relMajor;
362
+ /// [out]
363
+ uint32_t relMinor;
364
+ /// [out]
365
+ uint32_t relPatch;
366
+ } NVPW_QueryVersionNumber_Params;
367
+ #define NVPW_QueryVersionNumber_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_QueryVersionNumber_Params, relPatch)
368
+
369
+ /// Query version number of an API set
370
+ NVPA_Status NVPW_QueryVersionNumber(NVPW_QueryVersionNumber_Params* pParams);
371
+
372
+ typedef enum NVPW_Device_ClockStatus
373
+ {
374
+ /// clock status is unknown
375
+ NVPW_DEVICE_CLOCK_STATUS_UNKNOWN,
376
+ /// clocks are locked to rated tdp values
377
+ NVPW_DEVICE_CLOCK_STATUS_LOCKED_TO_RATED_TDP,
378
+ /// clocks are not locked and can boost above rated tdp
379
+ NVPW_DEVICE_CLOCK_STATUS_BOOST_ENABLED,
380
+ /// clocks are not locked and will not go above rated tdp
381
+ NVPW_DEVICE_CLOCK_STATUS_BOOST_DISABLED,
382
+ NVPW_DEVICE_CLOCK_STATUS__COUNT
383
+ } NVPW_Device_ClockStatus;
384
+
385
+ typedef struct NVPW_Device_GetClockStatus_Params
386
+ {
387
+ /// [in]
388
+ size_t structSize;
389
+ /// [in] assign to NULL
390
+ void* pPriv;
391
+ size_t deviceIndex;
392
+ /// [in]
393
+ NVPW_Device_ClockStatus clockStatus;
394
+ } NVPW_Device_GetClockStatus_Params;
395
+ #define NVPW_Device_GetClockStatus_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetClockStatus_Params, clockStatus)
396
+
397
+ NVPA_Status NVPW_Device_GetClockStatus(NVPW_Device_GetClockStatus_Params* pParams);
398
+
399
+ typedef enum NVPW_Device_ClockSetting
400
+ {
401
+ /// invalid op, specify valid clocks operation during profiling
402
+ NVPW_DEVICE_CLOCK_SETTING_INVALID,
403
+ /// default to driver/application config (normally unlocked and not boosted, but could be unlocked boosted, or
404
+ /// locked to rated TDP)
405
+ NVPW_DEVICE_CLOCK_SETTING_DEFAULT,
406
+ /// lock clocks at rated tdp base values
407
+ NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_RATED_TDP,
408
+ NVPW_DEVICE_CLOCK_SETTING__COUNT
409
+ } NVPW_Device_ClockSetting;
410
+
411
+ typedef struct NVPW_Device_SetClockSetting_Params
412
+ {
413
+ /// [in]
414
+ size_t structSize;
415
+ /// [in] assign to NULL
416
+ void* pPriv;
417
+ size_t deviceIndex;
418
+ /// [in]
419
+ NVPW_Device_ClockSetting clockSetting;
420
+ } NVPW_Device_SetClockSetting_Params;
421
+ #define NVPW_Device_SetClockSetting_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_SetClockSetting_Params, clockSetting)
422
+
423
+ NVPA_Status NVPW_Device_SetClockSetting(NVPW_Device_SetClockSetting_Params* pParams);
424
+
425
+ typedef struct NVPW_CounterData_GetRangeDescriptions_Params
426
+ {
427
+ /// [in]
428
+ size_t structSize;
429
+ /// [in] assign to NULL
430
+ void* pPriv;
431
+ const uint8_t* pCounterDataImage;
432
+ size_t rangeIndex;
433
+ /// [inout] Number of descriptions allocated in ppDescriptions
434
+ size_t numDescriptions;
435
+ const char** ppDescriptions;
436
+ } NVPW_CounterData_GetRangeDescriptions_Params;
437
+ #define NVPW_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetRangeDescriptions_Params, ppDescriptions)
438
+
439
+ NVPA_Status NVPW_CounterData_GetRangeDescriptions(NVPW_CounterData_GetRangeDescriptions_Params* pParams);
440
+
441
+ typedef struct NVPW_Profiler_CounterData_GetRangeDescriptions_Params
442
+ {
443
+ /// [in]
444
+ size_t structSize;
445
+ /// [in] assign to NULL
446
+ void* pPriv;
447
+ const uint8_t* pCounterDataImage;
448
+ size_t rangeIndex;
449
+ /// [inout] Number of descriptions allocated in ppDescriptions
450
+ size_t numDescriptions;
451
+ const char** ppDescriptions;
452
+ } NVPW_Profiler_CounterData_GetRangeDescriptions_Params;
453
+ #define NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Profiler_CounterData_GetRangeDescriptions_Params, ppDescriptions)
454
+
455
+ NVPA_Status NVPW_Profiler_CounterData_GetRangeDescriptions(NVPW_Profiler_CounterData_GetRangeDescriptions_Params* pParams);
456
+
457
+ #ifndef NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
458
+ #define NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
459
+ typedef enum NVPW_PeriodicSampler_CounterData_AppendMode
460
+ {
461
+ NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_LINEAR = 0,
462
+ NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_CIRCULAR = 1,
463
+ NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE__COUNT
464
+ } NVPW_PeriodicSampler_CounterData_AppendMode;
465
+ #endif //NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
466
+
467
+ typedef struct NVPW_PeriodicSampler_CounterData_GetSampleTime_Params
468
+ {
469
+ /// [in]
470
+ size_t structSize;
471
+ /// [in] assign to NULL
472
+ void* pPriv;
473
+ /// [in]
474
+ const uint8_t* pCounterDataImage;
475
+ /// [in]
476
+ size_t rangeIndex;
477
+ /// [out]
478
+ uint64_t timestampStart;
479
+ /// [out]
480
+ uint64_t timestampEnd;
481
+ } NVPW_PeriodicSampler_CounterData_GetSampleTime_Params;
482
+ #define NVPW_PeriodicSampler_CounterData_GetSampleTime_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params, timestampEnd)
483
+
484
+ NVPA_Status NVPW_PeriodicSampler_CounterData_GetSampleTime(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params* pParams);
485
+
486
+ typedef struct NVPW_PeriodicSampler_CounterData_TrimInPlace_Params
487
+ {
488
+ /// [in]
489
+ size_t structSize;
490
+ /// [in] assign to NULL
491
+ void* pPriv;
492
+ /// [in]
493
+ uint8_t* pCounterDataImage;
494
+ /// [in]
495
+ size_t counterDataImageSize;
496
+ /// [out]
497
+ size_t counterDataImageTrimmedSize;
498
+ } NVPW_PeriodicSampler_CounterData_TrimInPlace_Params;
499
+ #define NVPW_PeriodicSampler_CounterData_TrimInPlace_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params, counterDataImageTrimmedSize)
500
+
501
+ NVPA_Status NVPW_PeriodicSampler_CounterData_TrimInPlace(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params* pParams);
502
+
503
+ typedef struct NVPW_PeriodicSampler_CounterData_GetInfo_Params
504
+ {
505
+ /// [in]
506
+ size_t structSize;
507
+ /// [in] assign to NULL
508
+ void* pPriv;
509
+ /// [in]
510
+ const uint8_t* pCounterDataImage;
511
+ /// [in]
512
+ size_t counterDataImageSize;
513
+ /// [out] total number of ranges in the counter data
514
+ size_t numTotalRanges;
515
+ /// [out] if in "linear" mode, this API returns the number of "populated" ranges; if it's in "circular" mode,
516
+ /// then it returns the last "populated" range index + 1, when there is no such range, it returns 0.
517
+ size_t numPopulatedRanges;
518
+ /// [out] if in "linear" mode, this API returns the number of "completed" ranges; if it's in "circular" mode,
519
+ /// then it returns the last "completed" range index + 1, when there is no such range, it returns 0.
520
+ size_t numCompletedRanges;
521
+ } NVPW_PeriodicSampler_CounterData_GetInfo_Params;
522
+ #define NVPW_PeriodicSampler_CounterData_GetInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetInfo_Params, numCompletedRanges)
523
+
524
+ /// In periodic sampler, a range in counter data stores exactly one sample's data. For better performance, periodic
525
+ /// sampler may operate in an out-of-order fashion when populating sample data, i.e. it may not fully populate all
526
+ /// counters of a sample/range before starting to populate the next sample/range. As a result, we have two concepts
527
+ /// here, "populated" & "completed": a range is considered "populated" even if only partial counters have been
528
+ /// written; on the other hand, a range is only considered "completed" if all the collecting counters have been
529
+ /// written.
530
+ NVPA_Status NVPW_PeriodicSampler_CounterData_GetInfo(NVPW_PeriodicSampler_CounterData_GetInfo_Params* pParams);
531
+
532
+ typedef struct NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params
533
+ {
534
+ /// [in]
535
+ size_t structSize;
536
+ /// [in] assign to NULL
537
+ void* pPriv;
538
+ /// [in]
539
+ const uint8_t* pCounterDataImage;
540
+ /// [in]
541
+ size_t counterDataImageSize;
542
+ /// [in]
543
+ size_t rangeIndex;
544
+ /// [out]
545
+ uint32_t triggerCount;
546
+ } NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params;
547
+ #define NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params, triggerCount)
548
+
549
+ NVPA_Status NVPW_PeriodicSampler_CounterData_GetTriggerCount(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params* pParams);
550
+
551
+
552
+ typedef struct NVPW_TimestampReport
553
+ {
554
+ uint32_t payload;
555
+ uint8_t reserved0004[4];
556
+ uint64_t timestamp;
557
+ } NVPW_TimestampReport;
558
+
559
+
560
+
561
+
562
+ #ifdef __cplusplus
563
+ } // extern "C"
564
+ #endif
565
+
566
+ #if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
567
+ #pragma GCC visibility pop
568
+ #endif
569
+
570
+ #endif // NVPERF_TARGET_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_CL_EGL_H
18
+ #define __OPENCL_CL_EGL_H
19
+
20
+ #ifdef __APPLE__
21
+ #else
22
+ #include <CL/cl.h>
23
+ #endif
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+
30
+ /* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
31
+ #define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
32
+ #define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
33
+ #define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
34
+
35
+ /* Error type for clCreateFromEGLImageKHR */
36
+ #define CL_INVALID_EGL_OBJECT_KHR -1093
37
+ #define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
38
+
39
+ /* CLeglImageKHR is an opaque handle to an EGLImage */
40
+ typedef void* CLeglImageKHR;
41
+
42
+ /* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
43
+ typedef void* CLeglDisplayKHR;
44
+
45
+ /* CLeglSyncKHR is an opaque handle to an EGLSync object */
46
+ typedef void* CLeglSyncKHR;
47
+
48
+ /* properties passed to clCreateFromEGLImageKHR */
49
+ typedef intptr_t cl_egl_image_properties_khr;
50
+
51
+
52
+ #define cl_khr_egl_image 1
53
+
54
+ extern CL_API_ENTRY cl_mem CL_API_CALL
55
+ clCreateFromEGLImageKHR(cl_context context,
56
+ CLeglDisplayKHR egldisplay,
57
+ CLeglImageKHR eglimage,
58
+ cl_mem_flags flags,
59
+ const cl_egl_image_properties_khr * properties,
60
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
61
+
62
+ typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
63
+ cl_context context,
64
+ CLeglDisplayKHR egldisplay,
65
+ CLeglImageKHR eglimage,
66
+ cl_mem_flags flags,
67
+ const cl_egl_image_properties_khr * properties,
68
+ cl_int * errcode_ret);
69
+
70
+
71
+ extern CL_API_ENTRY cl_int CL_API_CALL
72
+ clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
73
+ cl_uint num_objects,
74
+ const cl_mem * mem_objects,
75
+ cl_uint num_events_in_wait_list,
76
+ const cl_event * event_wait_list,
77
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
78
+
79
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
80
+ cl_command_queue command_queue,
81
+ cl_uint num_objects,
82
+ const cl_mem * mem_objects,
83
+ cl_uint num_events_in_wait_list,
84
+ const cl_event * event_wait_list,
85
+ cl_event * event);
86
+
87
+
88
+ extern CL_API_ENTRY cl_int CL_API_CALL
89
+ clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
90
+ cl_uint num_objects,
91
+ const cl_mem * mem_objects,
92
+ cl_uint num_events_in_wait_list,
93
+ const cl_event * event_wait_list,
94
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
95
+
96
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
97
+ cl_command_queue command_queue,
98
+ cl_uint num_objects,
99
+ const cl_mem * mem_objects,
100
+ cl_uint num_events_in_wait_list,
101
+ const cl_event * event_wait_list,
102
+ cl_event * event);
103
+
104
+
105
+ #define cl_khr_egl_event 1
106
+
107
+ extern CL_API_ENTRY cl_event CL_API_CALL
108
+ clCreateEventFromEGLSyncKHR(cl_context context,
109
+ CLeglSyncKHR sync,
110
+ CLeglDisplayKHR display,
111
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
112
+
113
+ typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
114
+ cl_context context,
115
+ CLeglSyncKHR sync,
116
+ CLeglDisplayKHR display,
117
+ cl_int * errcode_ret);
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif
122
+
123
+ #endif /* __OPENCL_CL_EGL_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_CL_GL_H
18
+ #define __OPENCL_CL_GL_H
19
+
20
+ #ifdef __APPLE__
21
+ #include <OpenCL/cl.h>
22
+ #else
23
+ #include <CL/cl.h>
24
+ #endif
25
+
26
+ #ifdef __cplusplus
27
+ extern "C" {
28
+ #endif
29
+
30
+ typedef cl_uint cl_gl_object_type;
31
+ typedef cl_uint cl_gl_texture_info;
32
+ typedef cl_uint cl_gl_platform_info;
33
+ typedef struct __GLsync *cl_GLsync;
34
+
35
+ /* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
36
+ #define CL_GL_OBJECT_BUFFER 0x2000
37
+ #define CL_GL_OBJECT_TEXTURE2D 0x2001
38
+ #define CL_GL_OBJECT_TEXTURE3D 0x2002
39
+ #define CL_GL_OBJECT_RENDERBUFFER 0x2003
40
+ #define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
41
+ #define CL_GL_OBJECT_TEXTURE1D 0x200F
42
+ #define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
43
+ #define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
44
+
45
+ /* cl_gl_texture_info */
46
+ #define CL_GL_TEXTURE_TARGET 0x2004
47
+ #define CL_GL_MIPMAP_LEVEL 0x2005
48
+ #define CL_GL_NUM_SAMPLES 0x2012
49
+
50
+ extern CL_API_ENTRY cl_mem CL_API_CALL
51
+ clCreateFromGLBuffer(cl_context context,
52
+ cl_mem_flags flags,
53
+ cl_GLuint bufobj,
54
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
55
+
56
+ extern CL_API_ENTRY cl_mem CL_API_CALL
57
+ clCreateFromGLTexture(cl_context context,
58
+ cl_mem_flags flags,
59
+ cl_GLenum target,
60
+ cl_GLint miplevel,
61
+ cl_GLuint texture,
62
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
63
+
64
+ extern CL_API_ENTRY cl_mem CL_API_CALL
65
+ clCreateFromGLRenderbuffer(cl_context context,
66
+ cl_mem_flags flags,
67
+ cl_GLuint renderbuffer,
68
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
69
+
70
+ extern CL_API_ENTRY cl_int CL_API_CALL
71
+ clGetGLObjectInfo(cl_mem memobj,
72
+ cl_gl_object_type * gl_object_type,
73
+ cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
74
+
75
+ extern CL_API_ENTRY cl_int CL_API_CALL
76
+ clGetGLTextureInfo(cl_mem memobj,
77
+ cl_gl_texture_info param_name,
78
+ size_t param_value_size,
79
+ void * param_value,
80
+ size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
81
+
82
+ extern CL_API_ENTRY cl_int CL_API_CALL
83
+ clEnqueueAcquireGLObjects(cl_command_queue command_queue,
84
+ cl_uint num_objects,
85
+ const cl_mem * mem_objects,
86
+ cl_uint num_events_in_wait_list,
87
+ const cl_event * event_wait_list,
88
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
89
+
90
+ extern CL_API_ENTRY cl_int CL_API_CALL
91
+ clEnqueueReleaseGLObjects(cl_command_queue command_queue,
92
+ cl_uint num_objects,
93
+ const cl_mem * mem_objects,
94
+ cl_uint num_events_in_wait_list,
95
+ const cl_event * event_wait_list,
96
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
97
+
98
+
99
+ /* Deprecated OpenCL 1.1 APIs */
100
+ extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
101
+ clCreateFromGLTexture2D(cl_context context,
102
+ cl_mem_flags flags,
103
+ cl_GLenum target,
104
+ cl_GLint miplevel,
105
+ cl_GLuint texture,
106
+ cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
107
+
108
+ extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
109
+ clCreateFromGLTexture3D(cl_context context,
110
+ cl_mem_flags flags,
111
+ cl_GLenum target,
112
+ cl_GLint miplevel,
113
+ cl_GLuint texture,
114
+ cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
115
+
116
+ /* cl_khr_gl_sharing extension */
117
+
118
+ #define cl_khr_gl_sharing 1
119
+
120
+ typedef cl_uint cl_gl_context_info;
121
+
122
+ /* Additional Error Codes */
123
+ #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
124
+
125
+ /* cl_gl_context_info */
126
+ #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
127
+ #define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
128
+
129
+ /* Additional cl_context_properties */
130
+ #define CL_GL_CONTEXT_KHR 0x2008
131
+ #define CL_EGL_DISPLAY_KHR 0x2009
132
+ #define CL_GLX_DISPLAY_KHR 0x200A
133
+ #define CL_WGL_HDC_KHR 0x200B
134
+ #define CL_CGL_SHAREGROUP_KHR 0x200C
135
+
136
+ extern CL_API_ENTRY cl_int CL_API_CALL
137
+ clGetGLContextInfoKHR(const cl_context_properties * properties,
138
+ cl_gl_context_info param_name,
139
+ size_t param_value_size,
140
+ void * param_value,
141
+ size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
142
+
143
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
144
+ const cl_context_properties * properties,
145
+ cl_gl_context_info param_name,
146
+ size_t param_value_size,
147
+ void * param_value,
148
+ size_t * param_value_size_ret);
149
+
150
+ #ifdef __cplusplus
151
+ }
152
+ #endif
153
+
154
+ #endif /* __OPENCL_CL_GL_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_CL_GL_EXT_H
18
+ #define __OPENCL_CL_GL_EXT_H
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
+ #ifdef __APPLE__
25
+ #include <OpenCL/cl_gl.h>
26
+ #else
27
+ #include <CL/cl_gl.h>
28
+ #endif
29
+
30
+ /*
31
+ * cl_khr_gl_event extension
32
+ */
33
+ #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
34
+
35
+ extern CL_API_ENTRY cl_event CL_API_CALL
36
+ clCreateEventFromGLsyncKHR(cl_context context,
37
+ cl_GLsync sync,
38
+ cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
39
+
40
+ #ifdef __cplusplus
41
+ }
42
+ #endif
43
+
44
+ #endif /* __OPENCL_CL_GL_EXT_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h ADDED
@@ -0,0 +1,1414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __CL_PLATFORM_H
18
+ #define __CL_PLATFORM_H
19
+
20
+ #ifdef __APPLE__
21
+ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
22
+ #include <AvailabilityMacros.h>
23
+ #endif
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ #if defined(_WIN32)
30
+ #define CL_API_ENTRY
31
+ #define CL_API_CALL __stdcall
32
+ #define CL_CALLBACK __stdcall
33
+ #else
34
+ #define CL_API_ENTRY
35
+ #define CL_API_CALL
36
+ #define CL_CALLBACK
37
+ #endif
38
+
39
+ /*
40
+ * Deprecation flags refer to the last version of the header in which the
41
+ * feature was not deprecated.
42
+ *
43
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
44
+ * deprecation but is deprecated in versions later than 1.1.
45
+ */
46
+ #ifdef __APPLE__
47
+ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
48
+ #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
49
+ #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
50
+ #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
51
+ #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
52
+ #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
53
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
54
+
55
+ #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
56
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
57
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
58
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
59
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
60
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
61
+ #else
62
+ #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
63
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
64
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
65
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
66
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
67
+ #endif
68
+
69
+
70
+
71
+ #else
72
+ #define CL_EXTENSION_WEAK_LINK
73
+ #define CL_API_SUFFIX__VERSION_1_0
74
+ #define CL_EXT_SUFFIX__VERSION_1_0
75
+ #define CL_API_SUFFIX__VERSION_1_1
76
+ #define CL_EXT_SUFFIX__VERSION_1_1
77
+ #define CL_API_SUFFIX__VERSION_1_2
78
+ #define CL_EXT_SUFFIX__VERSION_1_2
79
+ #define CL_API_SUFFIX__VERSION_2_0
80
+ #define CL_EXT_SUFFIX__VERSION_2_0
81
+ #define CL_API_SUFFIX__VERSION_2_1
82
+ #define CL_EXT_SUFFIX__VERSION_2_1
83
+ #define CL_API_SUFFIX__VERSION_2_2
84
+ #define CL_EXT_SUFFIX__VERSION_2_2
85
+ #define CL_API_SUFFIX__VERSION_3_0
86
+ #define CL_EXT_SUFFIX__VERSION_3_0
87
+ #define CL_API_SUFFIX__EXPERIMENTAL
88
+ #define CL_EXT_SUFFIX__EXPERIMENTAL
89
+
90
+ #ifdef __GNUC__
91
+ #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
92
+ #define CL_EXT_PREFIX_DEPRECATED
93
+ #elif defined(_WIN32)
94
+ #define CL_EXT_SUFFIX_DEPRECATED
95
+ #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
96
+ #else
97
+ #define CL_EXT_SUFFIX_DEPRECATED
98
+ #define CL_EXT_PREFIX_DEPRECATED
99
+ #endif
100
+
101
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
102
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
103
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
104
+ #else
105
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
106
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
107
+ #endif
108
+
109
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
110
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
111
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
112
+ #else
113
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
114
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
115
+ #endif
116
+
117
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
118
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
119
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
120
+ #else
121
+ #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
122
+ #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
123
+ #endif
124
+
125
+ #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
126
+ #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
127
+ #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
128
+ #else
129
+ #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
130
+ #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
131
+ #endif
132
+
133
+ #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
134
+ #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
135
+ #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
136
+ #else
137
+ #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
138
+ #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
139
+ #endif
140
+
141
+ #ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
142
+ #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
143
+ #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
144
+ #else
145
+ #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
146
+ #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
147
+ #endif
148
+ #endif
149
+
150
+ #if (defined (_WIN32) && defined(_MSC_VER))
151
+
152
+ /* scalar types */
153
+ typedef signed __int8 cl_char;
154
+ typedef unsigned __int8 cl_uchar;
155
+ typedef signed __int16 cl_short;
156
+ typedef unsigned __int16 cl_ushort;
157
+ typedef signed __int32 cl_int;
158
+ typedef unsigned __int32 cl_uint;
159
+ typedef signed __int64 cl_long;
160
+ typedef unsigned __int64 cl_ulong;
161
+
162
+ typedef unsigned __int16 cl_half;
163
+ typedef float cl_float;
164
+ typedef double cl_double;
165
+
166
+ /* Macro names and corresponding values defined by OpenCL */
167
+ #define CL_CHAR_BIT 8
168
+ #define CL_SCHAR_MAX 127
169
+ #define CL_SCHAR_MIN (-127-1)
170
+ #define CL_CHAR_MAX CL_SCHAR_MAX
171
+ #define CL_CHAR_MIN CL_SCHAR_MIN
172
+ #define CL_UCHAR_MAX 255
173
+ #define CL_SHRT_MAX 32767
174
+ #define CL_SHRT_MIN (-32767-1)
175
+ #define CL_USHRT_MAX 65535
176
+ #define CL_INT_MAX 2147483647
177
+ #define CL_INT_MIN (-2147483647-1)
178
+ #define CL_UINT_MAX 0xffffffffU
179
+ #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
180
+ #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
181
+ #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
182
+
183
+ #define CL_FLT_DIG 6
184
+ #define CL_FLT_MANT_DIG 24
185
+ #define CL_FLT_MAX_10_EXP +38
186
+ #define CL_FLT_MAX_EXP +128
187
+ #define CL_FLT_MIN_10_EXP -37
188
+ #define CL_FLT_MIN_EXP -125
189
+ #define CL_FLT_RADIX 2
190
+ #define CL_FLT_MAX 340282346638528859811704183484516925440.0f
191
+ #define CL_FLT_MIN 1.175494350822287507969e-38f
192
+ #define CL_FLT_EPSILON 1.1920928955078125e-7f
193
+
194
+ #define CL_HALF_DIG 3
195
+ #define CL_HALF_MANT_DIG 11
196
+ #define CL_HALF_MAX_10_EXP +4
197
+ #define CL_HALF_MAX_EXP +16
198
+ #define CL_HALF_MIN_10_EXP -4
199
+ #define CL_HALF_MIN_EXP -13
200
+ #define CL_HALF_RADIX 2
201
+ #define CL_HALF_MAX 65504.0f
202
+ #define CL_HALF_MIN 6.103515625e-05f
203
+ #define CL_HALF_EPSILON 9.765625e-04f
204
+
205
+ #define CL_DBL_DIG 15
206
+ #define CL_DBL_MANT_DIG 53
207
+ #define CL_DBL_MAX_10_EXP +308
208
+ #define CL_DBL_MAX_EXP +1024
209
+ #define CL_DBL_MIN_10_EXP -307
210
+ #define CL_DBL_MIN_EXP -1021
211
+ #define CL_DBL_RADIX 2
212
+ #define CL_DBL_MAX 1.7976931348623158e+308
213
+ #define CL_DBL_MIN 2.225073858507201383090e-308
214
+ #define CL_DBL_EPSILON 2.220446049250313080847e-16
215
+
216
+ #define CL_M_E 2.7182818284590452354
217
+ #define CL_M_LOG2E 1.4426950408889634074
218
+ #define CL_M_LOG10E 0.43429448190325182765
219
+ #define CL_M_LN2 0.69314718055994530942
220
+ #define CL_M_LN10 2.30258509299404568402
221
+ #define CL_M_PI 3.14159265358979323846
222
+ #define CL_M_PI_2 1.57079632679489661923
223
+ #define CL_M_PI_4 0.78539816339744830962
224
+ #define CL_M_1_PI 0.31830988618379067154
225
+ #define CL_M_2_PI 0.63661977236758134308
226
+ #define CL_M_2_SQRTPI 1.12837916709551257390
227
+ #define CL_M_SQRT2 1.41421356237309504880
228
+ #define CL_M_SQRT1_2 0.70710678118654752440
229
+
230
+ #define CL_M_E_F 2.718281828f
231
+ #define CL_M_LOG2E_F 1.442695041f
232
+ #define CL_M_LOG10E_F 0.434294482f
233
+ #define CL_M_LN2_F 0.693147181f
234
+ #define CL_M_LN10_F 2.302585093f
235
+ #define CL_M_PI_F 3.141592654f
236
+ #define CL_M_PI_2_F 1.570796327f
237
+ #define CL_M_PI_4_F 0.785398163f
238
+ #define CL_M_1_PI_F 0.318309886f
239
+ #define CL_M_2_PI_F 0.636619772f
240
+ #define CL_M_2_SQRTPI_F 1.128379167f
241
+ #define CL_M_SQRT2_F 1.414213562f
242
+ #define CL_M_SQRT1_2_F 0.707106781f
243
+
244
+ #define CL_NAN (CL_INFINITY - CL_INFINITY)
245
+ #define CL_HUGE_VALF ((cl_float) 1e50)
246
+ #define CL_HUGE_VAL ((cl_double) 1e500)
247
+ #define CL_MAXFLOAT CL_FLT_MAX
248
+ #define CL_INFINITY CL_HUGE_VALF
249
+
250
+ #else
251
+
252
+ #include <stdint.h>
253
+
254
+ /* scalar types */
255
+ typedef int8_t cl_char;
256
+ typedef uint8_t cl_uchar;
257
+ typedef int16_t cl_short;
258
+ typedef uint16_t cl_ushort;
259
+ typedef int32_t cl_int;
260
+ typedef uint32_t cl_uint;
261
+ typedef int64_t cl_long;
262
+ typedef uint64_t cl_ulong;
263
+
264
+ typedef uint16_t cl_half;
265
+ typedef float cl_float;
266
+ typedef double cl_double;
267
+
268
+ /* Macro names and corresponding values defined by OpenCL */
269
+ #define CL_CHAR_BIT 8
270
+ #define CL_SCHAR_MAX 127
271
+ #define CL_SCHAR_MIN (-127-1)
272
+ #define CL_CHAR_MAX CL_SCHAR_MAX
273
+ #define CL_CHAR_MIN CL_SCHAR_MIN
274
+ #define CL_UCHAR_MAX 255
275
+ #define CL_SHRT_MAX 32767
276
+ #define CL_SHRT_MIN (-32767-1)
277
+ #define CL_USHRT_MAX 65535
278
+ #define CL_INT_MAX 2147483647
279
+ #define CL_INT_MIN (-2147483647-1)
280
+ #define CL_UINT_MAX 0xffffffffU
281
+ #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
282
+ #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
283
+ #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
284
+
285
+ #define CL_FLT_DIG 6
286
+ #define CL_FLT_MANT_DIG 24
287
+ #define CL_FLT_MAX_10_EXP +38
288
+ #define CL_FLT_MAX_EXP +128
289
+ #define CL_FLT_MIN_10_EXP -37
290
+ #define CL_FLT_MIN_EXP -125
291
+ #define CL_FLT_RADIX 2
292
+ #define CL_FLT_MAX 340282346638528859811704183484516925440.0f
293
+ #define CL_FLT_MIN 1.175494350822287507969e-38f
294
+ #define CL_FLT_EPSILON 1.1920928955078125e-7f
295
+
296
+ #define CL_HALF_DIG 3
297
+ #define CL_HALF_MANT_DIG 11
298
+ #define CL_HALF_MAX_10_EXP +4
299
+ #define CL_HALF_MAX_EXP +16
300
+ #define CL_HALF_MIN_10_EXP -4
301
+ #define CL_HALF_MIN_EXP -13
302
+ #define CL_HALF_RADIX 2
303
+ #define CL_HALF_MAX 65504.0f
304
+ #define CL_HALF_MIN 6.103515625e-05f
305
+ #define CL_HALF_EPSILON 9.765625e-04f
306
+
307
+ #define CL_DBL_DIG 15
308
+ #define CL_DBL_MANT_DIG 53
309
+ #define CL_DBL_MAX_10_EXP +308
310
+ #define CL_DBL_MAX_EXP +1024
311
+ #define CL_DBL_MIN_10_EXP -307
312
+ #define CL_DBL_MIN_EXP -1021
313
+ #define CL_DBL_RADIX 2
314
+ #define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
315
+ #define CL_DBL_MIN 2.225073858507201383090e-308
316
+ #define CL_DBL_EPSILON 2.220446049250313080847e-16
317
+
318
+ #define CL_M_E 2.7182818284590452354
319
+ #define CL_M_LOG2E 1.4426950408889634074
320
+ #define CL_M_LOG10E 0.43429448190325182765
321
+ #define CL_M_LN2 0.69314718055994530942
322
+ #define CL_M_LN10 2.30258509299404568402
323
+ #define CL_M_PI 3.14159265358979323846
324
+ #define CL_M_PI_2 1.57079632679489661923
325
+ #define CL_M_PI_4 0.78539816339744830962
326
+ #define CL_M_1_PI 0.31830988618379067154
327
+ #define CL_M_2_PI 0.63661977236758134308
328
+ #define CL_M_2_SQRTPI 1.12837916709551257390
329
+ #define CL_M_SQRT2 1.41421356237309504880
330
+ #define CL_M_SQRT1_2 0.70710678118654752440
331
+
332
+ #define CL_M_E_F 2.718281828f
333
+ #define CL_M_LOG2E_F 1.442695041f
334
+ #define CL_M_LOG10E_F 0.434294482f
335
+ #define CL_M_LN2_F 0.693147181f
336
+ #define CL_M_LN10_F 2.302585093f
337
+ #define CL_M_PI_F 3.141592654f
338
+ #define CL_M_PI_2_F 1.570796327f
339
+ #define CL_M_PI_4_F 0.785398163f
340
+ #define CL_M_1_PI_F 0.318309886f
341
+ #define CL_M_2_PI_F 0.636619772f
342
+ #define CL_M_2_SQRTPI_F 1.128379167f
343
+ #define CL_M_SQRT2_F 1.414213562f
344
+ #define CL_M_SQRT1_2_F 0.707106781f
345
+
346
+ #if defined( __GNUC__ )
347
+ #define CL_HUGE_VALF __builtin_huge_valf()
348
+ #define CL_HUGE_VAL __builtin_huge_val()
349
+ #define CL_NAN __builtin_nanf( "" )
350
+ #else
351
+ #define CL_HUGE_VALF ((cl_float) 1e50)
352
+ #define CL_HUGE_VAL ((cl_double) 1e500)
353
+ float nanf( const char * );
354
+ #define CL_NAN nanf( "" )
355
+ #endif
356
+ #define CL_MAXFLOAT CL_FLT_MAX
357
+ #define CL_INFINITY CL_HUGE_VALF
358
+
359
+ #endif
360
+
361
+ #include <stddef.h>
362
+
363
+ /* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */
364
+ typedef unsigned int cl_GLuint;
365
+ typedef int cl_GLint;
366
+ typedef unsigned int cl_GLenum;
367
+
368
+ /*
369
+ * Vector types
370
+ *
371
+ * Note: OpenCL requires that all types be naturally aligned.
372
+ * This means that vector types must be naturally aligned.
373
+ * For example, a vector of four floats must be aligned to
374
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
375
+ * alignment of the float). The alignment qualifiers here
376
+ * will only function properly if your compiler supports them
377
+ * and if you don't actively work to defeat them. For example,
378
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
379
+ * the start of the struct must itself be 16-byte aligned.
380
+ *
381
+ * Maintaining proper alignment is the user's responsibility.
382
+ */
383
+
384
+ /* Define basic vector types */
385
+ #if defined( __VEC__ )
386
+ #if !defined(__clang__)
387
+ #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
388
+ #endif
389
+ typedef __vector unsigned char __cl_uchar16;
390
+ typedef __vector signed char __cl_char16;
391
+ typedef __vector unsigned short __cl_ushort8;
392
+ typedef __vector signed short __cl_short8;
393
+ typedef __vector unsigned int __cl_uint4;
394
+ typedef __vector signed int __cl_int4;
395
+ typedef __vector float __cl_float4;
396
+ #define __CL_UCHAR16__ 1
397
+ #define __CL_CHAR16__ 1
398
+ #define __CL_USHORT8__ 1
399
+ #define __CL_SHORT8__ 1
400
+ #define __CL_UINT4__ 1
401
+ #define __CL_INT4__ 1
402
+ #define __CL_FLOAT4__ 1
403
+ #endif
404
+
405
+ #if defined( __SSE__ )
406
+ #if defined( __MINGW64__ )
407
+ #include <intrin.h>
408
+ #else
409
+ #include <xmmintrin.h>
410
+ #endif
411
+ #if defined( __GNUC__ )
412
+ typedef float __cl_float4 __attribute__((vector_size(16)));
413
+ #else
414
+ typedef __m128 __cl_float4;
415
+ #endif
416
+ #define __CL_FLOAT4__ 1
417
+ #endif
418
+
419
+ #if defined( __SSE2__ )
420
+ #if defined( __MINGW64__ )
421
+ #include <intrin.h>
422
+ #else
423
+ #include <emmintrin.h>
424
+ #endif
425
+ #if defined( __GNUC__ )
426
+ typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
427
+ typedef cl_char __cl_char16 __attribute__((vector_size(16)));
428
+ typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
429
+ typedef cl_short __cl_short8 __attribute__((vector_size(16)));
430
+ typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
431
+ typedef cl_int __cl_int4 __attribute__((vector_size(16)));
432
+ typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
433
+ typedef cl_long __cl_long2 __attribute__((vector_size(16)));
434
+ typedef cl_double __cl_double2 __attribute__((vector_size(16)));
435
+ #else
436
+ typedef __m128i __cl_uchar16;
437
+ typedef __m128i __cl_char16;
438
+ typedef __m128i __cl_ushort8;
439
+ typedef __m128i __cl_short8;
440
+ typedef __m128i __cl_uint4;
441
+ typedef __m128i __cl_int4;
442
+ typedef __m128i __cl_ulong2;
443
+ typedef __m128i __cl_long2;
444
+ typedef __m128d __cl_double2;
445
+ #endif
446
+ #define __CL_UCHAR16__ 1
447
+ #define __CL_CHAR16__ 1
448
+ #define __CL_USHORT8__ 1
449
+ #define __CL_SHORT8__ 1
450
+ #define __CL_INT4__ 1
451
+ #define __CL_UINT4__ 1
452
+ #define __CL_ULONG2__ 1
453
+ #define __CL_LONG2__ 1
454
+ #define __CL_DOUBLE2__ 1
455
+ #endif
456
+
457
+ #if defined( __MMX__ )
458
+ #include <mmintrin.h>
459
+ #if defined( __GNUC__ )
460
+ typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
461
+ typedef cl_char __cl_char8 __attribute__((vector_size(8)));
462
+ typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
463
+ typedef cl_short __cl_short4 __attribute__((vector_size(8)));
464
+ typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
465
+ typedef cl_int __cl_int2 __attribute__((vector_size(8)));
466
+ typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
467
+ typedef cl_long __cl_long1 __attribute__((vector_size(8)));
468
+ typedef cl_float __cl_float2 __attribute__((vector_size(8)));
469
+ #else
470
+ typedef __m64 __cl_uchar8;
471
+ typedef __m64 __cl_char8;
472
+ typedef __m64 __cl_ushort4;
473
+ typedef __m64 __cl_short4;
474
+ typedef __m64 __cl_uint2;
475
+ typedef __m64 __cl_int2;
476
+ typedef __m64 __cl_ulong1;
477
+ typedef __m64 __cl_long1;
478
+ typedef __m64 __cl_float2;
479
+ #endif
480
+ #define __CL_UCHAR8__ 1
481
+ #define __CL_CHAR8__ 1
482
+ #define __CL_USHORT4__ 1
483
+ #define __CL_SHORT4__ 1
484
+ #define __CL_INT2__ 1
485
+ #define __CL_UINT2__ 1
486
+ #define __CL_ULONG1__ 1
487
+ #define __CL_LONG1__ 1
488
+ #define __CL_FLOAT2__ 1
489
+ #endif
490
+
491
+ #if defined( __AVX__ )
492
+ #if defined( __MINGW64__ )
493
+ #include <intrin.h>
494
+ #else
495
+ #include <immintrin.h>
496
+ #endif
497
+ #if defined( __GNUC__ )
498
+ typedef cl_float __cl_float8 __attribute__((vector_size(32)));
499
+ typedef cl_double __cl_double4 __attribute__((vector_size(32)));
500
+ #else
501
+ typedef __m256 __cl_float8;
502
+ typedef __m256d __cl_double4;
503
+ #endif
504
+ #define __CL_FLOAT8__ 1
505
+ #define __CL_DOUBLE4__ 1
506
+ #endif
507
+
508
+ /* Define capabilities for anonymous struct members. */
509
+ #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
510
+ #define __CL_HAS_ANON_STRUCT__ 1
511
+ #define __CL_ANON_STRUCT__
512
+ #elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
513
+ #define __CL_HAS_ANON_STRUCT__ 1
514
+ #define __CL_ANON_STRUCT__ __extension__
515
+ #elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
516
+ #if _MSC_VER >= 1500
517
+ /* Microsoft Developer Studio 2008 supports anonymous structs, but
518
+ * complains by default. */
519
+ #define __CL_HAS_ANON_STRUCT__ 1
520
+ #define __CL_ANON_STRUCT__
521
+ /* Disable warning C4201: nonstandard extension used : nameless
522
+ * struct/union */
523
+ #pragma warning( push )
524
+ #pragma warning( disable : 4201 )
525
+ #endif
526
+ #else
527
+ #define __CL_HAS_ANON_STRUCT__ 0
528
+ #define __CL_ANON_STRUCT__
529
+ #endif
530
+
531
+ /* Define alignment keys */
532
+ #if defined( __GNUC__ )
533
+ #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
534
+ #elif defined( _WIN32) && (_MSC_VER)
535
+ /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
536
+ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
537
+ /* #include <crtdefs.h> */
538
+ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
539
+ #define CL_ALIGNED(_x)
540
+ #else
541
+ #warning Need to implement some method to align data here
542
+ #define CL_ALIGNED(_x)
543
+ #endif
544
+
545
+ /* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
546
+ #if __CL_HAS_ANON_STRUCT__
547
+ /* .xyzw and .s0123...{f|F} are supported */
548
+ #define CL_HAS_NAMED_VECTOR_FIELDS 1
549
+ /* .hi and .lo are supported */
550
+ #define CL_HAS_HI_LO_VECTOR_FIELDS 1
551
+ #endif
552
+
553
+ /* Define cl_vector types */
554
+
555
+ /* ---- cl_charn ---- */
556
+ typedef union
557
+ {
558
+ cl_char CL_ALIGNED(2) s[2];
559
+ #if __CL_HAS_ANON_STRUCT__
560
+ __CL_ANON_STRUCT__ struct{ cl_char x, y; };
561
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
562
+ __CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
563
+ #endif
564
+ #if defined( __CL_CHAR2__)
565
+ __cl_char2 v2;
566
+ #endif
567
+ }cl_char2;
568
+
569
+ typedef union
570
+ {
571
+ cl_char CL_ALIGNED(4) s[4];
572
+ #if __CL_HAS_ANON_STRUCT__
573
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
574
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
575
+ __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
576
+ #endif
577
+ #if defined( __CL_CHAR2__)
578
+ __cl_char2 v2[2];
579
+ #endif
580
+ #if defined( __CL_CHAR4__)
581
+ __cl_char4 v4;
582
+ #endif
583
+ }cl_char4;
584
+
585
+ /* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
586
+ typedef cl_char4 cl_char3;
587
+
588
+ typedef union
589
+ {
590
+ cl_char CL_ALIGNED(8) s[8];
591
+ #if __CL_HAS_ANON_STRUCT__
592
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
593
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
594
+ __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
595
+ #endif
596
+ #if defined( __CL_CHAR2__)
597
+ __cl_char2 v2[4];
598
+ #endif
599
+ #if defined( __CL_CHAR4__)
600
+ __cl_char4 v4[2];
601
+ #endif
602
+ #if defined( __CL_CHAR8__ )
603
+ __cl_char8 v8;
604
+ #endif
605
+ }cl_char8;
606
+
607
+ typedef union
608
+ {
609
+ cl_char CL_ALIGNED(16) s[16];
610
+ #if __CL_HAS_ANON_STRUCT__
611
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
612
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
613
+ __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
614
+ #endif
615
+ #if defined( __CL_CHAR2__)
616
+ __cl_char2 v2[8];
617
+ #endif
618
+ #if defined( __CL_CHAR4__)
619
+ __cl_char4 v4[4];
620
+ #endif
621
+ #if defined( __CL_CHAR8__ )
622
+ __cl_char8 v8[2];
623
+ #endif
624
+ #if defined( __CL_CHAR16__ )
625
+ __cl_char16 v16;
626
+ #endif
627
+ }cl_char16;
628
+
629
+
630
+ /* ---- cl_ucharn ---- */
631
+ typedef union
632
+ {
633
+ cl_uchar CL_ALIGNED(2) s[2];
634
+ #if __CL_HAS_ANON_STRUCT__
635
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
636
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
637
+ __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
638
+ #endif
639
+ #if defined( __cl_uchar2__)
640
+ __cl_uchar2 v2;
641
+ #endif
642
+ }cl_uchar2;
643
+
644
+ typedef union
645
+ {
646
+ cl_uchar CL_ALIGNED(4) s[4];
647
+ #if __CL_HAS_ANON_STRUCT__
648
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
649
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
650
+ __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
651
+ #endif
652
+ #if defined( __CL_UCHAR2__)
653
+ __cl_uchar2 v2[2];
654
+ #endif
655
+ #if defined( __CL_UCHAR4__)
656
+ __cl_uchar4 v4;
657
+ #endif
658
+ }cl_uchar4;
659
+
660
+ /* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
661
+ typedef cl_uchar4 cl_uchar3;
662
+
663
+ typedef union
664
+ {
665
+ cl_uchar CL_ALIGNED(8) s[8];
666
+ #if __CL_HAS_ANON_STRUCT__
667
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
668
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
669
+ __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
670
+ #endif
671
+ #if defined( __CL_UCHAR2__)
672
+ __cl_uchar2 v2[4];
673
+ #endif
674
+ #if defined( __CL_UCHAR4__)
675
+ __cl_uchar4 v4[2];
676
+ #endif
677
+ #if defined( __CL_UCHAR8__ )
678
+ __cl_uchar8 v8;
679
+ #endif
680
+ }cl_uchar8;
681
+
682
+ typedef union
683
+ {
684
+ cl_uchar CL_ALIGNED(16) s[16];
685
+ #if __CL_HAS_ANON_STRUCT__
686
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
687
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
688
+ __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
689
+ #endif
690
+ #if defined( __CL_UCHAR2__)
691
+ __cl_uchar2 v2[8];
692
+ #endif
693
+ #if defined( __CL_UCHAR4__)
694
+ __cl_uchar4 v4[4];
695
+ #endif
696
+ #if defined( __CL_UCHAR8__ )
697
+ __cl_uchar8 v8[2];
698
+ #endif
699
+ #if defined( __CL_UCHAR16__ )
700
+ __cl_uchar16 v16;
701
+ #endif
702
+ }cl_uchar16;
703
+
704
+
705
+ /* ---- cl_shortn ---- */
706
+ typedef union
707
+ {
708
+ cl_short CL_ALIGNED(4) s[2];
709
+ #if __CL_HAS_ANON_STRUCT__
710
+ __CL_ANON_STRUCT__ struct{ cl_short x, y; };
711
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
712
+ __CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
713
+ #endif
714
+ #if defined( __CL_SHORT2__)
715
+ __cl_short2 v2;
716
+ #endif
717
+ }cl_short2;
718
+
719
+ typedef union
720
+ {
721
+ cl_short CL_ALIGNED(8) s[4];
722
+ #if __CL_HAS_ANON_STRUCT__
723
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
724
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
725
+ __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
726
+ #endif
727
+ #if defined( __CL_SHORT2__)
728
+ __cl_short2 v2[2];
729
+ #endif
730
+ #if defined( __CL_SHORT4__)
731
+ __cl_short4 v4;
732
+ #endif
733
+ }cl_short4;
734
+
735
+ /* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
736
+ typedef cl_short4 cl_short3;
737
+
738
+ typedef union
739
+ {
740
+ cl_short CL_ALIGNED(16) s[8];
741
+ #if __CL_HAS_ANON_STRUCT__
742
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
743
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
744
+ __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
745
+ #endif
746
+ #if defined( __CL_SHORT2__)
747
+ __cl_short2 v2[4];
748
+ #endif
749
+ #if defined( __CL_SHORT4__)
750
+ __cl_short4 v4[2];
751
+ #endif
752
+ #if defined( __CL_SHORT8__ )
753
+ __cl_short8 v8;
754
+ #endif
755
+ }cl_short8;
756
+
757
+ typedef union
758
+ {
759
+ cl_short CL_ALIGNED(32) s[16];
760
+ #if __CL_HAS_ANON_STRUCT__
761
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
762
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
763
+ __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
764
+ #endif
765
+ #if defined( __CL_SHORT2__)
766
+ __cl_short2 v2[8];
767
+ #endif
768
+ #if defined( __CL_SHORT4__)
769
+ __cl_short4 v4[4];
770
+ #endif
771
+ #if defined( __CL_SHORT8__ )
772
+ __cl_short8 v8[2];
773
+ #endif
774
+ #if defined( __CL_SHORT16__ )
775
+ __cl_short16 v16;
776
+ #endif
777
+ }cl_short16;
778
+
779
+
780
+ /* ---- cl_ushortn ---- */
781
+ typedef union
782
+ {
783
+ cl_ushort CL_ALIGNED(4) s[2];
784
+ #if __CL_HAS_ANON_STRUCT__
785
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
786
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
787
+ __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
788
+ #endif
789
+ #if defined( __CL_USHORT2__)
790
+ __cl_ushort2 v2;
791
+ #endif
792
+ }cl_ushort2;
793
+
794
+ typedef union
795
+ {
796
+ cl_ushort CL_ALIGNED(8) s[4];
797
+ #if __CL_HAS_ANON_STRUCT__
798
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
799
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
800
+ __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
801
+ #endif
802
+ #if defined( __CL_USHORT2__)
803
+ __cl_ushort2 v2[2];
804
+ #endif
805
+ #if defined( __CL_USHORT4__)
806
+ __cl_ushort4 v4;
807
+ #endif
808
+ }cl_ushort4;
809
+
810
+ /* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
811
+ typedef cl_ushort4 cl_ushort3;
812
+
813
+ typedef union
814
+ {
815
+ cl_ushort CL_ALIGNED(16) s[8];
816
+ #if __CL_HAS_ANON_STRUCT__
817
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
818
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
819
+ __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
820
+ #endif
821
+ #if defined( __CL_USHORT2__)
822
+ __cl_ushort2 v2[4];
823
+ #endif
824
+ #if defined( __CL_USHORT4__)
825
+ __cl_ushort4 v4[2];
826
+ #endif
827
+ #if defined( __CL_USHORT8__ )
828
+ __cl_ushort8 v8;
829
+ #endif
830
+ }cl_ushort8;
831
+
832
+ typedef union
833
+ {
834
+ cl_ushort CL_ALIGNED(32) s[16];
835
+ #if __CL_HAS_ANON_STRUCT__
836
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
837
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
838
+ __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
839
+ #endif
840
+ #if defined( __CL_USHORT2__)
841
+ __cl_ushort2 v2[8];
842
+ #endif
843
+ #if defined( __CL_USHORT4__)
844
+ __cl_ushort4 v4[4];
845
+ #endif
846
+ #if defined( __CL_USHORT8__ )
847
+ __cl_ushort8 v8[2];
848
+ #endif
849
+ #if defined( __CL_USHORT16__ )
850
+ __cl_ushort16 v16;
851
+ #endif
852
+ }cl_ushort16;
853
+
854
+
855
+ /* ---- cl_halfn ---- */
856
+ typedef union
857
+ {
858
+ cl_half CL_ALIGNED(4) s[2];
859
+ #if __CL_HAS_ANON_STRUCT__
860
+ __CL_ANON_STRUCT__ struct{ cl_half x, y; };
861
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1; };
862
+ __CL_ANON_STRUCT__ struct{ cl_half lo, hi; };
863
+ #endif
864
+ #if defined( __CL_HALF2__)
865
+ __cl_half2 v2;
866
+ #endif
867
+ }cl_half2;
868
+
869
+ typedef union
870
+ {
871
+ cl_half CL_ALIGNED(8) s[4];
872
+ #if __CL_HAS_ANON_STRUCT__
873
+ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
874
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; };
875
+ __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
876
+ #endif
877
+ #if defined( __CL_HALF2__)
878
+ __cl_half2 v2[2];
879
+ #endif
880
+ #if defined( __CL_HALF4__)
881
+ __cl_half4 v4;
882
+ #endif
883
+ }cl_half4;
884
+
885
+ /* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
886
+ typedef cl_half4 cl_half3;
887
+
888
+ typedef union
889
+ {
890
+ cl_half CL_ALIGNED(16) s[8];
891
+ #if __CL_HAS_ANON_STRUCT__
892
+ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
893
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; };
894
+ __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
895
+ #endif
896
+ #if defined( __CL_HALF2__)
897
+ __cl_half2 v2[4];
898
+ #endif
899
+ #if defined( __CL_HALF4__)
900
+ __cl_half4 v4[2];
901
+ #endif
902
+ #if defined( __CL_HALF8__ )
903
+ __cl_half8 v8;
904
+ #endif
905
+ }cl_half8;
906
+
907
+ typedef union
908
+ {
909
+ cl_half CL_ALIGNED(32) s[16];
910
+ #if __CL_HAS_ANON_STRUCT__
911
+ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
912
+ __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
913
+ __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
914
+ #endif
915
+ #if defined( __CL_HALF2__)
916
+ __cl_half2 v2[8];
917
+ #endif
918
+ #if defined( __CL_HALF4__)
919
+ __cl_half4 v4[4];
920
+ #endif
921
+ #if defined( __CL_HALF8__ )
922
+ __cl_half8 v8[2];
923
+ #endif
924
+ #if defined( __CL_HALF16__ )
925
+ __cl_half16 v16;
926
+ #endif
927
+ }cl_half16;
928
+
929
+ /* ---- cl_intn ---- */
930
+ typedef union
931
+ {
932
+ cl_int CL_ALIGNED(8) s[2];
933
+ #if __CL_HAS_ANON_STRUCT__
934
+ __CL_ANON_STRUCT__ struct{ cl_int x, y; };
935
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
936
+ __CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
937
+ #endif
938
+ #if defined( __CL_INT2__)
939
+ __cl_int2 v2;
940
+ #endif
941
+ }cl_int2;
942
+
943
+ typedef union
944
+ {
945
+ cl_int CL_ALIGNED(16) s[4];
946
+ #if __CL_HAS_ANON_STRUCT__
947
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
948
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
949
+ __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
950
+ #endif
951
+ #if defined( __CL_INT2__)
952
+ __cl_int2 v2[2];
953
+ #endif
954
+ #if defined( __CL_INT4__)
955
+ __cl_int4 v4;
956
+ #endif
957
+ }cl_int4;
958
+
959
+ /* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
960
+ typedef cl_int4 cl_int3;
961
+
962
+ typedef union
963
+ {
964
+ cl_int CL_ALIGNED(32) s[8];
965
+ #if __CL_HAS_ANON_STRUCT__
966
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
967
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
968
+ __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
969
+ #endif
970
+ #if defined( __CL_INT2__)
971
+ __cl_int2 v2[4];
972
+ #endif
973
+ #if defined( __CL_INT4__)
974
+ __cl_int4 v4[2];
975
+ #endif
976
+ #if defined( __CL_INT8__ )
977
+ __cl_int8 v8;
978
+ #endif
979
+ }cl_int8;
980
+
981
+ typedef union
982
+ {
983
+ cl_int CL_ALIGNED(64) s[16];
984
+ #if __CL_HAS_ANON_STRUCT__
985
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
986
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
987
+ __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
988
+ #endif
989
+ #if defined( __CL_INT2__)
990
+ __cl_int2 v2[8];
991
+ #endif
992
+ #if defined( __CL_INT4__)
993
+ __cl_int4 v4[4];
994
+ #endif
995
+ #if defined( __CL_INT8__ )
996
+ __cl_int8 v8[2];
997
+ #endif
998
+ #if defined( __CL_INT16__ )
999
+ __cl_int16 v16;
1000
+ #endif
1001
+ }cl_int16;
1002
+
1003
+
1004
+ /* ---- cl_uintn ---- */
1005
+ typedef union
1006
+ {
1007
+ cl_uint CL_ALIGNED(8) s[2];
1008
+ #if __CL_HAS_ANON_STRUCT__
1009
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y; };
1010
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
1011
+ __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
1012
+ #endif
1013
+ #if defined( __CL_UINT2__)
1014
+ __cl_uint2 v2;
1015
+ #endif
1016
+ }cl_uint2;
1017
+
1018
+ typedef union
1019
+ {
1020
+ cl_uint CL_ALIGNED(16) s[4];
1021
+ #if __CL_HAS_ANON_STRUCT__
1022
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
1023
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
1024
+ __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
1025
+ #endif
1026
+ #if defined( __CL_UINT2__)
1027
+ __cl_uint2 v2[2];
1028
+ #endif
1029
+ #if defined( __CL_UINT4__)
1030
+ __cl_uint4 v4;
1031
+ #endif
1032
+ }cl_uint4;
1033
+
1034
+ /* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
1035
+ typedef cl_uint4 cl_uint3;
1036
+
1037
+ typedef union
1038
+ {
1039
+ cl_uint CL_ALIGNED(32) s[8];
1040
+ #if __CL_HAS_ANON_STRUCT__
1041
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
1042
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
1043
+ __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
1044
+ #endif
1045
+ #if defined( __CL_UINT2__)
1046
+ __cl_uint2 v2[4];
1047
+ #endif
1048
+ #if defined( __CL_UINT4__)
1049
+ __cl_uint4 v4[2];
1050
+ #endif
1051
+ #if defined( __CL_UINT8__ )
1052
+ __cl_uint8 v8;
1053
+ #endif
1054
+ }cl_uint8;
1055
+
1056
+ typedef union
1057
+ {
1058
+ cl_uint CL_ALIGNED(64) s[16];
1059
+ #if __CL_HAS_ANON_STRUCT__
1060
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1061
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1062
+ __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
1063
+ #endif
1064
+ #if defined( __CL_UINT2__)
1065
+ __cl_uint2 v2[8];
1066
+ #endif
1067
+ #if defined( __CL_UINT4__)
1068
+ __cl_uint4 v4[4];
1069
+ #endif
1070
+ #if defined( __CL_UINT8__ )
1071
+ __cl_uint8 v8[2];
1072
+ #endif
1073
+ #if defined( __CL_UINT16__ )
1074
+ __cl_uint16 v16;
1075
+ #endif
1076
+ }cl_uint16;
1077
+
1078
+ /* ---- cl_longn ---- */
1079
+ typedef union
1080
+ {
1081
+ cl_long CL_ALIGNED(16) s[2];
1082
+ #if __CL_HAS_ANON_STRUCT__
1083
+ __CL_ANON_STRUCT__ struct{ cl_long x, y; };
1084
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
1085
+ __CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
1086
+ #endif
1087
+ #if defined( __CL_LONG2__)
1088
+ __cl_long2 v2;
1089
+ #endif
1090
+ }cl_long2;
1091
+
1092
+ typedef union
1093
+ {
1094
+ cl_long CL_ALIGNED(32) s[4];
1095
+ #if __CL_HAS_ANON_STRUCT__
1096
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
1097
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
1098
+ __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
1099
+ #endif
1100
+ #if defined( __CL_LONG2__)
1101
+ __cl_long2 v2[2];
1102
+ #endif
1103
+ #if defined( __CL_LONG4__)
1104
+ __cl_long4 v4;
1105
+ #endif
1106
+ }cl_long4;
1107
+
1108
+ /* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
1109
+ typedef cl_long4 cl_long3;
1110
+
1111
+ typedef union
1112
+ {
1113
+ cl_long CL_ALIGNED(64) s[8];
1114
+ #if __CL_HAS_ANON_STRUCT__
1115
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
1116
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
1117
+ __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
1118
+ #endif
1119
+ #if defined( __CL_LONG2__)
1120
+ __cl_long2 v2[4];
1121
+ #endif
1122
+ #if defined( __CL_LONG4__)
1123
+ __cl_long4 v4[2];
1124
+ #endif
1125
+ #if defined( __CL_LONG8__ )
1126
+ __cl_long8 v8;
1127
+ #endif
1128
+ }cl_long8;
1129
+
1130
+ typedef union
1131
+ {
1132
+ cl_long CL_ALIGNED(128) s[16];
1133
+ #if __CL_HAS_ANON_STRUCT__
1134
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1135
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1136
+ __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
1137
+ #endif
1138
+ #if defined( __CL_LONG2__)
1139
+ __cl_long2 v2[8];
1140
+ #endif
1141
+ #if defined( __CL_LONG4__)
1142
+ __cl_long4 v4[4];
1143
+ #endif
1144
+ #if defined( __CL_LONG8__ )
1145
+ __cl_long8 v8[2];
1146
+ #endif
1147
+ #if defined( __CL_LONG16__ )
1148
+ __cl_long16 v16;
1149
+ #endif
1150
+ }cl_long16;
1151
+
1152
+
1153
+ /* ---- cl_ulongn ---- */
1154
+ typedef union
1155
+ {
1156
+ cl_ulong CL_ALIGNED(16) s[2];
1157
+ #if __CL_HAS_ANON_STRUCT__
1158
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
1159
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
1160
+ __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
1161
+ #endif
1162
+ #if defined( __CL_ULONG2__)
1163
+ __cl_ulong2 v2;
1164
+ #endif
1165
+ }cl_ulong2;
1166
+
1167
+ typedef union
1168
+ {
1169
+ cl_ulong CL_ALIGNED(32) s[4];
1170
+ #if __CL_HAS_ANON_STRUCT__
1171
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
1172
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
1173
+ __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
1174
+ #endif
1175
+ #if defined( __CL_ULONG2__)
1176
+ __cl_ulong2 v2[2];
1177
+ #endif
1178
+ #if defined( __CL_ULONG4__)
1179
+ __cl_ulong4 v4;
1180
+ #endif
1181
+ }cl_ulong4;
1182
+
1183
+ /* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
1184
+ typedef cl_ulong4 cl_ulong3;
1185
+
1186
+ typedef union
1187
+ {
1188
+ cl_ulong CL_ALIGNED(64) s[8];
1189
+ #if __CL_HAS_ANON_STRUCT__
1190
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
1191
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
1192
+ __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
1193
+ #endif
1194
+ #if defined( __CL_ULONG2__)
1195
+ __cl_ulong2 v2[4];
1196
+ #endif
1197
+ #if defined( __CL_ULONG4__)
1198
+ __cl_ulong4 v4[2];
1199
+ #endif
1200
+ #if defined( __CL_ULONG8__ )
1201
+ __cl_ulong8 v8;
1202
+ #endif
1203
+ }cl_ulong8;
1204
+
1205
+ typedef union
1206
+ {
1207
+ cl_ulong CL_ALIGNED(128) s[16];
1208
+ #if __CL_HAS_ANON_STRUCT__
1209
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1210
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1211
+ __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
1212
+ #endif
1213
+ #if defined( __CL_ULONG2__)
1214
+ __cl_ulong2 v2[8];
1215
+ #endif
1216
+ #if defined( __CL_ULONG4__)
1217
+ __cl_ulong4 v4[4];
1218
+ #endif
1219
+ #if defined( __CL_ULONG8__ )
1220
+ __cl_ulong8 v8[2];
1221
+ #endif
1222
+ #if defined( __CL_ULONG16__ )
1223
+ __cl_ulong16 v16;
1224
+ #endif
1225
+ }cl_ulong16;
1226
+
1227
+
1228
+ /* --- cl_floatn ---- */
1229
+
1230
+ typedef union
1231
+ {
1232
+ cl_float CL_ALIGNED(8) s[2];
1233
+ #if __CL_HAS_ANON_STRUCT__
1234
+ __CL_ANON_STRUCT__ struct{ cl_float x, y; };
1235
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
1236
+ __CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
1237
+ #endif
1238
+ #if defined( __CL_FLOAT2__)
1239
+ __cl_float2 v2;
1240
+ #endif
1241
+ }cl_float2;
1242
+
1243
+ typedef union
1244
+ {
1245
+ cl_float CL_ALIGNED(16) s[4];
1246
+ #if __CL_HAS_ANON_STRUCT__
1247
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
1248
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
1249
+ __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
1250
+ #endif
1251
+ #if defined( __CL_FLOAT2__)
1252
+ __cl_float2 v2[2];
1253
+ #endif
1254
+ #if defined( __CL_FLOAT4__)
1255
+ __cl_float4 v4;
1256
+ #endif
1257
+ }cl_float4;
1258
+
1259
+ /* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
1260
+ typedef cl_float4 cl_float3;
1261
+
1262
+ typedef union
1263
+ {
1264
+ cl_float CL_ALIGNED(32) s[8];
1265
+ #if __CL_HAS_ANON_STRUCT__
1266
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
1267
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
1268
+ __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
1269
+ #endif
1270
+ #if defined( __CL_FLOAT2__)
1271
+ __cl_float2 v2[4];
1272
+ #endif
1273
+ #if defined( __CL_FLOAT4__)
1274
+ __cl_float4 v4[2];
1275
+ #endif
1276
+ #if defined( __CL_FLOAT8__ )
1277
+ __cl_float8 v8;
1278
+ #endif
1279
+ }cl_float8;
1280
+
1281
+ typedef union
1282
+ {
1283
+ cl_float CL_ALIGNED(64) s[16];
1284
+ #if __CL_HAS_ANON_STRUCT__
1285
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1286
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1287
+ __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
1288
+ #endif
1289
+ #if defined( __CL_FLOAT2__)
1290
+ __cl_float2 v2[8];
1291
+ #endif
1292
+ #if defined( __CL_FLOAT4__)
1293
+ __cl_float4 v4[4];
1294
+ #endif
1295
+ #if defined( __CL_FLOAT8__ )
1296
+ __cl_float8 v8[2];
1297
+ #endif
1298
+ #if defined( __CL_FLOAT16__ )
1299
+ __cl_float16 v16;
1300
+ #endif
1301
+ }cl_float16;
1302
+
1303
+ /* --- cl_doublen ---- */
1304
+
1305
+ typedef union
1306
+ {
1307
+ cl_double CL_ALIGNED(16) s[2];
1308
+ #if __CL_HAS_ANON_STRUCT__
1309
+ __CL_ANON_STRUCT__ struct{ cl_double x, y; };
1310
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
1311
+ __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
1312
+ #endif
1313
+ #if defined( __CL_DOUBLE2__)
1314
+ __cl_double2 v2;
1315
+ #endif
1316
+ }cl_double2;
1317
+
1318
+ typedef union
1319
+ {
1320
+ cl_double CL_ALIGNED(32) s[4];
1321
+ #if __CL_HAS_ANON_STRUCT__
1322
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
1323
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
1324
+ __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
1325
+ #endif
1326
+ #if defined( __CL_DOUBLE2__)
1327
+ __cl_double2 v2[2];
1328
+ #endif
1329
+ #if defined( __CL_DOUBLE4__)
1330
+ __cl_double4 v4;
1331
+ #endif
1332
+ }cl_double4;
1333
+
1334
+ /* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
1335
+ typedef cl_double4 cl_double3;
1336
+
1337
+ typedef union
1338
+ {
1339
+ cl_double CL_ALIGNED(64) s[8];
1340
+ #if __CL_HAS_ANON_STRUCT__
1341
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
1342
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
1343
+ __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
1344
+ #endif
1345
+ #if defined( __CL_DOUBLE2__)
1346
+ __cl_double2 v2[4];
1347
+ #endif
1348
+ #if defined( __CL_DOUBLE4__)
1349
+ __cl_double4 v4[2];
1350
+ #endif
1351
+ #if defined( __CL_DOUBLE8__ )
1352
+ __cl_double8 v8;
1353
+ #endif
1354
+ }cl_double8;
1355
+
1356
+ typedef union
1357
+ {
1358
+ cl_double CL_ALIGNED(128) s[16];
1359
+ #if __CL_HAS_ANON_STRUCT__
1360
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
1361
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
1362
+ __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
1363
+ #endif
1364
+ #if defined( __CL_DOUBLE2__)
1365
+ __cl_double2 v2[8];
1366
+ #endif
1367
+ #if defined( __CL_DOUBLE4__)
1368
+ __cl_double4 v4[4];
1369
+ #endif
1370
+ #if defined( __CL_DOUBLE8__ )
1371
+ __cl_double8 v8[2];
1372
+ #endif
1373
+ #if defined( __CL_DOUBLE16__ )
1374
+ __cl_double16 v16;
1375
+ #endif
1376
+ }cl_double16;
1377
+
1378
+ /* Macro to facilitate debugging
1379
+ * Usage:
1380
+ * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
1381
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
1382
+ * Each line thereafter of OpenCL C source must end with: \n\
1383
+ * The last line ends in ";
1384
+ *
1385
+ * Example:
1386
+ *
1387
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
1388
+ * kernel void foo( int a, float * b ) \n\
1389
+ * { \n\
1390
+ * // my comment \n\
1391
+ * *b[ get_global_id(0)] = a; \n\
1392
+ * } \n\
1393
+ * ";
1394
+ *
1395
+ * This should correctly set up the line, (column) and file information for your source
1396
+ * string so you can do source level debugging.
1397
+ */
1398
+ #define __CL_STRINGIFY( _x ) # _x
1399
+ #define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
1400
+ #define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
1401
+
1402
+ #ifdef __cplusplus
1403
+ }
1404
+ #endif
1405
+
1406
+ #undef __CL_HAS_ANON_STRUCT__
1407
+ #undef __CL_ANON_STRUCT__
1408
+ #if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
1409
+ #if _MSC_VER >=1500
1410
+ #pragma warning( pop )
1411
+ #endif
1412
+ #endif
1413
+
1414
+ #endif /* __CL_PLATFORM_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ ******************************************************************************/
16
+
17
+ #ifndef __OPENCL_H
18
+ #define __OPENCL_H
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
+ #ifdef __APPLE__
25
+ #include <OpenCL/cl.h>
26
+ #include <OpenCL/cl_gl.h>
27
+ #include <OpenCL/cl_gl_ext.h>
28
+ #include <OpenCL/cl_ext.h>
29
+ #else
30
+ #include <CL/cl.h>
31
+ #include <CL/cl_gl.h>
32
+ #include <CL/cl_gl_ext.h>
33
+ #include <CL/cl_ext.h>
34
+ #endif
35
+
36
+ #ifdef __cplusplus
37
+ }
38
+ #endif
39
+
40
+ #endif /* __OPENCL_H */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (228 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*******************************************************************************
51
+ * *
52
+ * *
53
+ * *
54
+ *******************************************************************************/
55
+
56
+ #include "device_types.h"
57
+ #if !defined(__CUDACC_RTC__)
58
+ #define EXCLUDE_FROM_RTC
59
+ #include "driver_types.h"
60
+ #undef EXCLUDE_FROM_RTC
61
+ #endif /* !__CUDACC_RTC__ */
62
+ #include "surface_types.h"
63
+ #include "texture_types.h"
64
+ #include "vector_types.h"
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CHANNEL_DESCRIPTOR_H__)
51
+ #define __CHANNEL_DESCRIPTOR_H__
52
+
53
+ #if defined(__cplusplus)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #include "cuda_runtime_api.h"
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ /**
70
+ * \addtogroup CUDART_HIGHLEVEL
71
+ *
72
+ * @{
73
+ */
74
+
75
+ /**
76
+ * \brief \hl Returns a channel descriptor using the specified format
77
+ *
78
+ * Returns a channel descriptor with format \p f and number of bits of each
79
+ * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
80
+ * defined as:
81
+ * \code
82
+ struct cudaChannelFormatDesc {
83
+ int x, y, z, w;
84
+ enum cudaChannelFormatKind f;
85
+ };
86
+ * \endcode
87
+ *
88
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
89
+ * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
90
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
91
+ * ::cudaChannelFormatKindSignedNormalized8X4,
92
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
93
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
94
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
95
+ * ::cudaChannelFormatKindSignedNormalized16X4,
96
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
97
+ * ::cudaChannelFormatKindUnsignedNormalized16X4
98
+ * or ::cudaChannelFormatKindNV12.
99
+ *
100
+ * The format is specified by the template specialization.
101
+ *
102
+ * The template function specializes for the following scalar types:
103
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
104
+ * The template function specializes for the following vector types:
105
+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
106
+ * The template function specializes for following cudaChannelFormatKind enum values:
107
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
108
+ *
109
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
110
+ *
111
+ * \return
112
+ * Channel descriptor with format \p f
113
+ *
114
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
115
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
116
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
117
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
118
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
119
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
120
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
121
+ * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
122
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
123
+ */
124
+ template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
125
+ {
126
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
127
+ }
128
+
129
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
130
+ {
131
+ int e = (int)sizeof(unsigned short) * 8;
132
+
133
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
134
+ }
135
+
136
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
137
+ {
138
+ int e = (int)sizeof(unsigned short) * 8;
139
+
140
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
141
+ }
142
+
143
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
144
+ {
145
+ int e = (int)sizeof(unsigned short) * 8;
146
+
147
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
148
+ }
149
+
150
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
151
+ {
152
+ int e = (int)sizeof(unsigned short) * 8;
153
+
154
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
155
+ }
156
+
157
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
158
+ {
159
+ int e = (int)sizeof(char) * 8;
160
+
161
+ #if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
162
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
163
+ #else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
164
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
165
+ #endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
166
+ }
167
+
168
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
169
+ {
170
+ int e = (int)sizeof(signed char) * 8;
171
+
172
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
173
+ }
174
+
175
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
176
+ {
177
+ int e = (int)sizeof(unsigned char) * 8;
178
+
179
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
180
+ }
181
+
182
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
183
+ {
184
+ int e = (int)sizeof(signed char) * 8;
185
+
186
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
187
+ }
188
+
189
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
190
+ {
191
+ int e = (int)sizeof(unsigned char) * 8;
192
+
193
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
194
+ }
195
+
196
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
197
+ {
198
+ int e = (int)sizeof(signed char) * 8;
199
+
200
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
201
+ }
202
+
203
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
204
+ {
205
+ int e = (int)sizeof(unsigned char) * 8;
206
+
207
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
208
+ }
209
+
210
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
211
+ {
212
+ int e = (int)sizeof(signed char) * 8;
213
+
214
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
215
+ }
216
+
217
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
218
+ {
219
+ int e = (int)sizeof(unsigned char) * 8;
220
+
221
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
222
+ }
223
+
224
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
225
+ {
226
+ int e = (int)sizeof(short) * 8;
227
+
228
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
229
+ }
230
+
231
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
232
+ {
233
+ int e = (int)sizeof(unsigned short) * 8;
234
+
235
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
236
+ }
237
+
238
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
239
+ {
240
+ int e = (int)sizeof(short) * 8;
241
+
242
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
243
+ }
244
+
245
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
246
+ {
247
+ int e = (int)sizeof(unsigned short) * 8;
248
+
249
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
250
+ }
251
+
252
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
253
+ {
254
+ int e = (int)sizeof(short) * 8;
255
+
256
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
257
+ }
258
+
259
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
260
+ {
261
+ int e = (int)sizeof(unsigned short) * 8;
262
+
263
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
264
+ }
265
+
266
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
267
+ {
268
+ int e = (int)sizeof(short) * 8;
269
+
270
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
271
+ }
272
+
273
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
274
+ {
275
+ int e = (int)sizeof(unsigned short) * 8;
276
+
277
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
278
+ }
279
+
280
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
281
+ {
282
+ int e = (int)sizeof(int) * 8;
283
+
284
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
285
+ }
286
+
287
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
288
+ {
289
+ int e = (int)sizeof(unsigned int) * 8;
290
+
291
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
292
+ }
293
+
294
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
295
+ {
296
+ int e = (int)sizeof(int) * 8;
297
+
298
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
299
+ }
300
+
301
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
302
+ {
303
+ int e = (int)sizeof(unsigned int) * 8;
304
+
305
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
306
+ }
307
+
308
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
309
+ {
310
+ int e = (int)sizeof(int) * 8;
311
+
312
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
313
+ }
314
+
315
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
316
+ {
317
+ int e = (int)sizeof(unsigned int) * 8;
318
+
319
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
320
+ }
321
+
322
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
323
+ {
324
+ int e = (int)sizeof(int) * 8;
325
+
326
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
327
+ }
328
+
329
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
330
+ {
331
+ int e = (int)sizeof(unsigned int) * 8;
332
+
333
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
334
+ }
335
+
336
+ #if !defined(__LP64__)
337
+
338
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
339
+ {
340
+ int e = (int)sizeof(long) * 8;
341
+
342
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
343
+ }
344
+
345
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
346
+ {
347
+ int e = (int)sizeof(unsigned long) * 8;
348
+
349
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
350
+ }
351
+
352
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
353
+ {
354
+ int e = (int)sizeof(long) * 8;
355
+
356
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
357
+ }
358
+
359
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
360
+ {
361
+ int e = (int)sizeof(unsigned long) * 8;
362
+
363
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
364
+ }
365
+
366
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
367
+ {
368
+ int e = (int)sizeof(long) * 8;
369
+
370
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
371
+ }
372
+
373
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
374
+ {
375
+ int e = (int)sizeof(unsigned long) * 8;
376
+
377
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
378
+ }
379
+
380
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
381
+ {
382
+ int e = (int)sizeof(long) * 8;
383
+
384
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
385
+ }
386
+
387
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
388
+ {
389
+ int e = (int)sizeof(unsigned long) * 8;
390
+
391
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
392
+ }
393
+
394
+ #endif /* !__LP64__ */
395
+
396
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
397
+ {
398
+ int e = (int)sizeof(float) * 8;
399
+
400
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
401
+ }
402
+
403
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
404
+ {
405
+ int e = (int)sizeof(float) * 8;
406
+
407
+ return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
408
+ }
409
+
410
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
411
+ {
412
+ int e = (int)sizeof(float) * 8;
413
+
414
+ return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
415
+ }
416
+
417
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
418
+ {
419
+ int e = (int)sizeof(float) * 8;
420
+
421
+ return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
422
+ }
423
+
424
+ static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
425
+ {
426
+ int e = (int)sizeof(char) * 8;
427
+
428
+ return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
429
+ }
430
+
431
+ template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
432
+ {
433
+ return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
434
+ }
435
+
436
+ /* Signed 8-bit normalized integer formats */
437
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
438
+ {
439
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
440
+ }
441
+
442
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
443
+ {
444
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
445
+ }
446
+
447
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
448
+ {
449
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
450
+ }
451
+
452
+ /* Unsigned 8-bit normalized integer formats */
453
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
454
+ {
455
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
456
+ }
457
+
458
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
459
+ {
460
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
461
+ }
462
+
463
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
464
+ {
465
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
466
+ }
467
+
468
+ /* Signed 16-bit normalized integer formats */
469
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
470
+ {
471
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
472
+ }
473
+
474
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
475
+ {
476
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
477
+ }
478
+
479
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
480
+ {
481
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
482
+ }
483
+
484
+ /* Unsigned 16-bit normalized integer formats */
485
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
486
+ {
487
+ return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
488
+ }
489
+
490
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
491
+ {
492
+ return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
493
+ }
494
+
495
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
496
+ {
497
+ return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
498
+ }
499
+
500
+ /* NV12 format */
501
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
502
+ {
503
+ return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
504
+ }
505
+
506
+ /* BC1 format */
507
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
508
+ {
509
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
510
+ }
511
+
512
+ /* BC1sRGB format */
513
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
514
+ {
515
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
516
+ }
517
+
518
+ /* BC2 format */
519
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
520
+ {
521
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
522
+ }
523
+
524
+ /* BC2sRGB format */
525
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
526
+ {
527
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
528
+ }
529
+
530
+ /* BC3 format */
531
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
532
+ {
533
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
534
+ }
535
+
536
+ /* BC3sRGB format */
537
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
538
+ {
539
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
540
+ }
541
+
542
+ /* BC4 unsigned format */
543
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
544
+ {
545
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
546
+ }
547
+
548
+ /* BC4 signed format */
549
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
550
+ {
551
+ return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
552
+ }
553
+
554
+ /* BC5 unsigned format */
555
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
556
+ {
557
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
558
+ }
559
+
560
+ /* BC5 signed format */
561
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
562
+ {
563
+ return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
564
+ }
565
+
566
+ /* BC6H unsigned format */
567
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
568
+ {
569
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
570
+ }
571
+
572
+ /* BC6H signed format */
573
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
574
+ {
575
+ return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
576
+ }
577
+
578
+ /* BC7 format */
579
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
580
+ {
581
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
582
+ }
583
+
584
+ /* BC7sRGB format */
585
+ template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
586
+ {
587
+ return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
588
+ }
589
+
590
+ #endif /* __cplusplus */
591
+
592
+ /** @} */
593
+ /** @} */ /* END CUDART_TEXTURE_HL */
594
+
595
+ #endif /* !__CHANNEL_DESCRIPTOR_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h ADDED
@@ -0,0 +1,1828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _COOPERATIVE_GROUPS_H_
51
+ #define _COOPERATIVE_GROUPS_H_
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ #include "cooperative_groups/details/info.h"
56
+ #include "cooperative_groups/details/driver_abi.h"
57
+ #include "cooperative_groups/details/helpers.h"
58
+
59
+ #if defined(_CG_HAS_STL_ATOMICS)
60
+ #include <cuda/atomic>
61
+ #define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
62
+ #else
63
+ #define _CG_THREAD_SCOPE(scope)
64
+ #endif
65
+
66
+ _CG_BEGIN_NAMESPACE
67
+
68
+ namespace details {
69
+ _CG_CONST_DECL unsigned int coalesced_group_id = 1;
70
+ _CG_CONST_DECL unsigned int multi_grid_group_id = 2;
71
+ _CG_CONST_DECL unsigned int grid_group_id = 3;
72
+ _CG_CONST_DECL unsigned int thread_block_id = 4;
73
+ _CG_CONST_DECL unsigned int multi_tile_group_id = 5;
74
+ _CG_CONST_DECL unsigned int cluster_group_id = 6;
75
+ }
76
+
77
+ /**
78
+ * class thread_group;
79
+ *
80
+ * Generic thread group type, into which all groups are convertible.
81
+ * It acts as a container for all storage necessary for the derived groups,
82
+ * and will dispatch the API calls to the correct derived group. This means
83
+ * that all derived groups must implement the same interface as thread_group.
84
+ */
85
+ class thread_group
86
+ {
87
+ protected:
88
+ struct group_data {
89
+ unsigned int _unused : 1;
90
+ unsigned int type : 7, : 0;
91
+ };
92
+
93
+ struct gg_data {
94
+ details::grid_workspace *gridWs;
95
+ };
96
+
97
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
98
+ struct mg_data {
99
+ unsigned long long _unused : 1;
100
+ unsigned long long type : 7;
101
+ unsigned long long handle : 56;
102
+ const details::multi_grid::multi_grid_functions *functions;
103
+ };
104
+ #endif
105
+
106
+ struct tg_data {
107
+ unsigned int is_tiled : 1;
108
+ unsigned int type : 7;
109
+ unsigned int size : 24;
110
+ // packed to 4b
111
+ unsigned int metaGroupSize : 16;
112
+ unsigned int metaGroupRank : 16;
113
+ // packed to 8b
114
+ unsigned int mask;
115
+ // packed to 12b
116
+ unsigned int _res;
117
+ };
118
+
119
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
120
+ friend class thread_block;
121
+
122
+ union __align__(8) {
123
+ group_data group;
124
+ tg_data coalesced;
125
+ gg_data grid;
126
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
127
+ mg_data multi_grid;
128
+ #endif
129
+ } _data;
130
+
131
+ _CG_QUALIFIER thread_group operator=(const thread_group& src);
132
+
133
+ _CG_QUALIFIER thread_group(unsigned int type) {
134
+ _data.group.type = type;
135
+ _data.group._unused = false;
136
+ }
137
+
138
+ #ifdef _CG_CPP11_FEATURES
139
+ static_assert(sizeof(tg_data) <= 16, "Failed size check");
140
+ static_assert(sizeof(gg_data) <= 16, "Failed size check");
141
+ # ifdef _CG_ABI_EXPERIMENTAL
142
+ static_assert(sizeof(mg_data) <= 16, "Failed size check");
143
+ # endif
144
+ #endif
145
+
146
+ public:
147
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
148
+
149
+ _CG_QUALIFIER unsigned long long size() const;
150
+ _CG_QUALIFIER unsigned long long num_threads() const;
151
+ _CG_QUALIFIER unsigned long long thread_rank() const;
152
+ _CG_QUALIFIER void sync() const;
153
+ _CG_QUALIFIER unsigned int get_type() const {
154
+ return _data.group.type;
155
+ }
156
+
157
+ };
158
+
159
+ template <unsigned int TyId>
160
+ struct thread_group_base : public thread_group {
161
+ _CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
162
+ _CG_STATIC_CONST_DECL unsigned int id = TyId;
163
+ };
164
+
165
+ #if defined(_CG_HAS_MULTI_GRID_GROUP)
166
+
167
+ /**
168
+ * class multi_grid_group;
169
+ *
170
+ * Threads within this this group are guaranteed to be co-resident on the
171
+ * same system, on multiple devices within the same launched kernels.
172
+ * To use this group, the kernel must have been launched with
173
+ * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
174
+ * and the device must support it (queryable device attribute).
175
+ *
176
+ * Constructed via this_multi_grid();
177
+ */
178
+
179
+
180
+ # if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
181
+ class multi_grid_group;
182
+
183
+ // Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
184
+ template <typename = void>
185
+ __device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
186
+
187
+ class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
188
+ {
189
+ private:
190
+ template <typename = void>
191
+ _CG_QUALIFIER multi_grid_group() {
192
+ _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
193
+ _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
194
+ }
195
+
196
+ friend multi_grid_group this_multi_grid<void>();
197
+
198
+ public:
199
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
200
+
201
+ _CG_QUALIFIER bool is_valid() const {
202
+ return (_data.multi_grid.handle != 0);
203
+ }
204
+
205
+ _CG_QUALIFIER void sync() const {
206
+ if (!is_valid()) {
207
+ _CG_ABORT();
208
+ }
209
+ _data.multi_grid.functions->sync(_data.multi_grid.handle);
210
+ }
211
+
212
+ _CG_QUALIFIER unsigned long long num_threads() const {
213
+ _CG_ASSERT(is_valid());
214
+ return _data.multi_grid.functions->size(_data.multi_grid.handle);
215
+ }
216
+
217
+ _CG_QUALIFIER unsigned long long size() const {
218
+ return num_threads();
219
+ }
220
+
221
+ _CG_QUALIFIER unsigned long long thread_rank() const {
222
+ _CG_ASSERT(is_valid());
223
+ return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
224
+ }
225
+
226
+ _CG_QUALIFIER unsigned int grid_rank() const {
227
+ _CG_ASSERT(is_valid());
228
+ return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
229
+ }
230
+
231
+ _CG_QUALIFIER unsigned int num_grids() const {
232
+ _CG_ASSERT(is_valid());
233
+ return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
234
+ }
235
+ };
236
+ # else
237
+ class multi_grid_group
238
+ {
239
+ private:
240
+ unsigned long long _handle;
241
+ unsigned int _size;
242
+ unsigned int _rank;
243
+
244
+ friend _CG_QUALIFIER multi_grid_group this_multi_grid();
245
+
246
+ _CG_QUALIFIER multi_grid_group() {
247
+ _handle = details::multi_grid::get_intrinsic_handle();
248
+ _size = details::multi_grid::size(_handle);
249
+ _rank = details::multi_grid::thread_rank(_handle);
250
+ }
251
+
252
+ public:
253
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
254
+
255
+ _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
256
+ return (_handle != 0);
257
+ }
258
+
259
+ _CG_QUALIFIER _CG_DEPRECATED void sync() const {
260
+ if (!is_valid()) {
261
+ _CG_ABORT();
262
+ }
263
+ details::multi_grid::sync(_handle);
264
+ }
265
+
266
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
267
+ _CG_ASSERT(is_valid());
268
+ return _size;
269
+ }
270
+
271
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
272
+ return num_threads();
273
+ }
274
+
275
+ _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
276
+ _CG_ASSERT(is_valid());
277
+ return _rank;
278
+ }
279
+
280
+ _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
281
+ _CG_ASSERT(is_valid());
282
+ return (details::multi_grid::grid_rank(_handle));
283
+ }
284
+
285
+ _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
286
+ _CG_ASSERT(is_valid());
287
+ return (details::multi_grid::num_grids(_handle));
288
+ }
289
+ };
290
+ # endif
291
+
292
+ /**
293
+ * multi_grid_group this_multi_grid()
294
+ *
295
+ * Constructs a multi_grid_group
296
+ */
297
+ # if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
298
+ template <typename>
299
+ __device__
300
+ #else
301
+ _CG_QUALIFIER
302
+ # endif
303
+ _CG_DEPRECATED
304
+ multi_grid_group this_multi_grid()
305
+ {
306
+ return multi_grid_group();
307
+ }
308
+ #endif
309
+
310
+ /**
311
+ * class grid_group;
312
+ *
313
+ * Threads within this this group are guaranteed to be co-resident on the
314
+ * same device within the same launched kernel. To use this group, the kernel
315
+ * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
316
+ * and the device must support it (queryable device attribute).
317
+ *
318
+ * Constructed via this_grid();
319
+ */
320
+ class grid_group : public thread_group_base<details::grid_group_id>
321
+ {
322
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
323
+ friend _CG_QUALIFIER grid_group this_grid();
324
+
325
+ private:
326
+ _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
327
+ _data.grid.gridWs = gridWs;
328
+ }
329
+
330
+ public:
331
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
332
+
333
+ _CG_QUALIFIER bool is_valid() const {
334
+ return (_data.grid.gridWs != NULL);
335
+ }
336
+
337
+ _CG_QUALIFIER void sync() const {
338
+ if (!is_valid()) {
339
+ _CG_ABORT();
340
+ }
341
+ details::grid::sync(&_data.grid.gridWs->barrier);
342
+ }
343
+
344
+ _CG_STATIC_QUALIFIER unsigned long long size() {
345
+ return details::grid::size();
346
+ }
347
+
348
+ _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
349
+ return details::grid::thread_rank();
350
+ }
351
+
352
+ _CG_STATIC_QUALIFIER dim3 group_dim() {
353
+ return details::grid::grid_dim();
354
+ }
355
+
356
+ _CG_STATIC_QUALIFIER unsigned long long num_threads() {
357
+ return details::grid::num_threads();
358
+ }
359
+
360
+ _CG_STATIC_QUALIFIER dim3 dim_blocks() {
361
+ return details::grid::dim_blocks();
362
+ }
363
+
364
+ _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
365
+ return details::grid::num_blocks();
366
+ }
367
+
368
+ _CG_STATIC_QUALIFIER dim3 block_index() {
369
+ return details::grid::block_index();
370
+ }
371
+
372
+ _CG_STATIC_QUALIFIER unsigned long long block_rank() {
373
+ return details::grid::block_rank();
374
+ }
375
+
376
+ # if defined(_CG_HAS_CLUSTER_GROUP)
377
+ _CG_STATIC_QUALIFIER dim3 dim_clusters() {
378
+ return details::grid::dim_clusters();
379
+ }
380
+
381
+ _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
382
+ return details::grid::num_clusters();
383
+ }
384
+
385
+ _CG_STATIC_QUALIFIER dim3 cluster_index() {
386
+ return details::grid::cluster_index();
387
+ }
388
+
389
+ _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
390
+ return details::grid::cluster_rank();
391
+ }
392
+ # endif
393
+ };
394
+
395
+ _CG_QUALIFIER grid_group this_grid() {
396
+ // Load a workspace from the driver
397
+ grid_group gg(details::get_grid_workspace());
398
+ #ifdef _CG_DEBUG
399
+ // *all* threads must be available to synchronize
400
+ gg.sync();
401
+ #endif // _CG_DEBUG
402
+ return gg;
403
+ }
404
+
405
+ #if defined(_CG_HAS_CLUSTER_GROUP)
406
+ /**
407
+ * class cluster_group
408
+ *
409
+ * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
410
+ * divided along all dimensions to form groups of blocks, each group of which is
411
+ * a block cluster. Clustered grids are subject to various restrictions and
412
+ * limitations. Primarily, a cluster consists of at most 8 blocks by default
413
+ * (although the user is allowed to opt-in to non-standard sizes,) and clustered
414
+ * grids are subject to additional occupancy limitations due to per-cluster
415
+ * hardware resource consumption. In exchange, a block cluster is guaranteed to
416
+ * be a cooperative group, with access to all cooperative group capabilities, as
417
+ * well as cluster specific capabilities and accelerations. A cluster_group
418
+ * represents a block cluster.
419
+ *
420
+ * Constructed via this_cluster_group();
421
+ */
422
+ class cluster_group : public thread_group_base<details::cluster_group_id>
423
+ {
424
+ // Friends
425
+ friend _CG_QUALIFIER cluster_group this_cluster();
426
+
427
+ // Disable constructor
428
+ _CG_QUALIFIER cluster_group()
429
+ {
430
+ }
431
+
432
+ public:
433
+ //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
434
+
435
+ // Functionality exposed by the group
436
+ _CG_STATIC_QUALIFIER void sync()
437
+ {
438
+ return details::cluster::sync();
439
+ }
440
+
441
+ _CG_STATIC_QUALIFIER void barrier_arrive()
442
+ {
443
+ return details::cluster::barrier_arrive();
444
+ }
445
+
446
+ _CG_STATIC_QUALIFIER void barrier_wait()
447
+ {
448
+ return details::cluster::barrier_wait();
449
+ }
450
+
451
+ _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
452
+ {
453
+ return details::cluster::query_shared_rank(addr);
454
+ }
455
+
456
+ template <typename T>
457
+ _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
458
+ {
459
+ return details::cluster::map_shared_rank(addr, rank);
460
+ }
461
+
462
+ _CG_STATIC_QUALIFIER dim3 block_index()
463
+ {
464
+ return details::cluster::block_index();
465
+ }
466
+
467
+ _CG_STATIC_QUALIFIER unsigned int block_rank()
468
+ {
469
+ return details::cluster::block_rank();
470
+ }
471
+
472
+ _CG_STATIC_QUALIFIER unsigned int thread_rank()
473
+ {
474
+ return details::cluster::thread_rank();
475
+ }
476
+
477
+ _CG_STATIC_QUALIFIER dim3 dim_blocks()
478
+ {
479
+ return details::cluster::dim_blocks();
480
+ }
481
+
482
+ _CG_STATIC_QUALIFIER unsigned int num_blocks()
483
+ {
484
+ return details::cluster::num_blocks();
485
+ }
486
+
487
+ _CG_STATIC_QUALIFIER dim3 dim_threads()
488
+ {
489
+ return details::cluster::dim_threads();
490
+ }
491
+
492
+ _CG_STATIC_QUALIFIER unsigned int num_threads()
493
+ {
494
+ return details::cluster::num_threads();
495
+ }
496
+
497
+ // Legacy aliases
498
+ _CG_STATIC_QUALIFIER unsigned int size()
499
+ {
500
+ return num_threads();
501
+ }
502
+ };
503
+
504
+ /*
505
+ * cluster_group this_cluster()
506
+ *
507
+ * Constructs a cluster_group
508
+ */
509
+ _CG_QUALIFIER cluster_group this_cluster()
510
+ {
511
+ cluster_group cg;
512
+ #ifdef _CG_DEBUG
513
+ cg.sync();
514
+ #endif
515
+ return cg;
516
+ }
517
+ #endif
518
+
519
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
520
+ namespace details {
521
+
522
+ _CG_CONSTEXPR_QUALIFIER unsigned int scratch_sync_memory_size(unsigned int max_block_size) {
523
+ // One barrier per possible size of the group rounded up to multiple of 4.
524
+ return 8 * sizeof(details::barrier_t);
525
+ }
526
+
527
+ _CG_CONSTEXPR_QUALIFIER unsigned int scratch_collectives_memory_size(unsigned int communication_size, unsigned int max_block_size) {
528
+ // One slot of collectives memory per warp.
529
+ return max_block_size / 32 * communication_size;
530
+ }
531
+
532
+ _CG_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int communication_size, unsigned int max_block_size) {
533
+ return scratch_sync_memory_size(max_block_size) + scratch_collectives_memory_size(communication_size, max_block_size);
534
+ }
535
+
536
+ _CG_CONSTEXPR_QUALIFIER size_t scratch_alignment(unsigned int communication_size) {
537
+ return ((communication_size & (communication_size - 1) == 0) && communication_size > 8) ?
538
+ communication_size : 8;
539
+ }
540
+
541
+ _CG_CONST_DECL unsigned int default_tile_communication_size = 8;
542
+ _CG_CONST_DECL unsigned int default_max_block_size = 1024;
543
+
544
+ struct multi_warp_scratch {
545
+ char memory[1];
546
+ };
547
+ }
548
+
549
+ class thread_block;
550
+ namespace experimental {
551
+ template <unsigned int TileCommunicationSize = details::default_tile_communication_size,
552
+ unsigned int MaxBlockSize = details::default_max_block_size>
553
+ struct __align__(details::scratch_alignment(TileCommunicationSize)) block_tile_memory {
554
+ private:
555
+ char scratch[details::scratch_size_needed(TileCommunicationSize, MaxBlockSize)];
556
+
557
+ public:
558
+ _CG_QUALIFIER void* get_memory() {
559
+ return static_cast<void*>(scratch);
560
+ }
561
+
562
+ _CG_STATIC_QUALIFIER unsigned int get_size() {
563
+ return details::scratch_size_needed(TileCommunicationSize, MaxBlockSize);
564
+ }
565
+ };
566
+
567
+ template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
568
+ _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
569
+ }
570
+ #endif
571
+
572
+ /**
573
+ * class thread_block
574
+ *
575
+ * Every GPU kernel is executed by a grid of thread blocks, and threads within
576
+ * each block are guaranteed to reside on the same streaming multiprocessor.
577
+ * A thread_block represents a thread block whose dimensions are not known until runtime.
578
+ *
579
+ * Constructed via this_thread_block();
580
+ */
581
+ class thread_block : public thread_group_base<details::thread_block_id>
582
+ {
583
+ // Friends
584
+ friend _CG_QUALIFIER thread_block this_thread_block();
585
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
586
+ friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
587
+
588
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
589
+ template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
590
+ friend _CG_QUALIFIER thread_block experimental::this_thread_block(
591
+ experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
592
+
593
+ const unsigned short communication_size;
594
+ const unsigned short max_block_size;
595
+ details::multi_warp_scratch* const tile_memory;
596
+
597
+ template <unsigned int Size>
598
+ friend class __static_size_multi_warp_tile_base;
599
+
600
+ template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
601
+ _CG_QUALIFIER thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) :
602
+ tile_memory(reinterpret_cast<details::multi_warp_scratch*>(&scratch)),
603
+ communication_size(TileCommunicationSize), max_block_size(MaxBlockSize) {
604
+ if (thread_rank() < details::scratch_sync_memory_size(MaxBlockSize) / sizeof(details::barrier_t)) {
605
+ details::barrier_t* barriers = reinterpret_cast<details::barrier_t*>(&tile_memory->memory);
606
+ barriers[thread_rank()] = 0;
607
+ }
608
+ sync();
609
+ }
610
+ #endif
611
+
612
+ // Disable constructor
613
+ _CG_QUALIFIER thread_block()
614
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
615
+ : tile_memory(NULL), communication_size(0), max_block_size(0)
616
+ #endif
617
+ { }
618
+
619
+ // Internal Use
620
+ _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
621
+ const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
622
+
623
+ // Invalid, immediately fail
624
+ if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
625
+ details::abort();
626
+ return (thread_block());
627
+ }
628
+
629
+ unsigned int mask;
630
+ unsigned int base_offset = thread_rank() & (~(tilesz - 1));
631
+ unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
632
+
633
+ mask = (unsigned int)(-1) >> (32 - masklength);
634
+ mask <<= (details::laneid() & ~(tilesz - 1));
635
+ thread_group tile = thread_group(details::coalesced_group_id);
636
+ tile._data.coalesced.mask = mask;
637
+ tile._data.coalesced.size = __popc(mask);
638
+ tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
639
+ tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
640
+ tile._data.coalesced.is_tiled = true;
641
+ return (tile);
642
+ }
643
+
644
+ public:
645
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
646
+ _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
647
+
648
+ _CG_STATIC_QUALIFIER void sync() {
649
+ details::cta::sync();
650
+ }
651
+
652
+ _CG_STATIC_QUALIFIER unsigned int size() {
653
+ return details::cta::size();
654
+ }
655
+
656
+ _CG_STATIC_QUALIFIER unsigned int thread_rank() {
657
+ return details::cta::thread_rank();
658
+ }
659
+
660
+ // Additional functionality exposed by the group
661
+ _CG_STATIC_QUALIFIER dim3 group_index() {
662
+ return details::cta::group_index();
663
+ }
664
+
665
+ _CG_STATIC_QUALIFIER dim3 thread_index() {
666
+ return details::cta::thread_index();
667
+ }
668
+
669
+ _CG_STATIC_QUALIFIER dim3 group_dim() {
670
+ return details::cta::block_dim();
671
+ }
672
+
673
+ _CG_STATIC_QUALIFIER dim3 dim_threads() {
674
+ return details::cta::dim_threads();
675
+ }
676
+
677
+ _CG_STATIC_QUALIFIER unsigned int num_threads() {
678
+ return details::cta::num_threads();
679
+ }
680
+
681
+ };
682
+
683
+ /**
684
+ * thread_block this_thread_block()
685
+ *
686
+ * Constructs a thread_block group
687
+ */
688
// Factory for the calling thread's thread_block handle.
_CG_QUALIFIER thread_block this_thread_block()
{
    // The group carries no per-call state; default-construct and return it.
    thread_block block;
    return block;
}
692
+
693
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
namespace experimental {
    // Overload taking caller-provided block_tile_memory scratch storage;
    // the scratch reference is forwarded to the thread_block constructor.
    template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
    _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) {
        return (thread_block(scratch));
    }
}
#endif
701
+
702
+ /**
703
+ * class coalesced_group
704
+ *
705
+ * A group representing the current set of converged threads in a warp.
706
+ * The size of the group is not guaranteed and it may return a group of
707
+ * only one thread (itself).
708
+ *
709
+ * This group exposes warp-synchronous builtins.
710
+ * Constructed via coalesced_threads();
711
+ */
712
class coalesced_group : public thread_group_base<details::coalesced_group_id>
{
private:
    friend _CG_QUALIFIER coalesced_group coalesced_threads();
    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
    friend class details::_coalesced_group_data_access;

    // Compress a 32-bit lane mask into group-rank space: bit i of the result
    // corresponds to the i-th member of this group (i.e. the i-th set bit of
    // _data.coalesced.mask), set iff that member's lane bit is set in laneMask.
    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
        unsigned int member_pack = 0;
        unsigned int member_rank = 0;
        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
            if (lane_bit) {
                if (laneMask & lane_bit)
                    member_pack |= 1 << member_rank;
                member_rank++;
            }
        }
        return (member_pack);
    }

    // Internal Use
    // Partition this group into tiles of tilesz threads and return the tile
    // containing the calling thread. tilesz must be a power of two in [1, 32].
    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);

        // Invalid, immediately fail
        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
            details::abort();
            return (coalesced_group(0));
        }
        // Group already fits in one tile: the tile is the whole group.
        if (size() <= tilesz) {
            return (*this);
        }

        if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
            // Already-tiled parent: members occupy contiguous lanes, so the
            // tile mask is a contiguous run aligned to the tile boundary.
            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
            unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);

            mask <<= (details::laneid() & ~(tilesz - 1));
            coalesced_group coalesced_tile = coalesced_group(mask);
            coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
            coalesced_tile._data.coalesced.is_tiled = true;
            return (coalesced_tile);
        }
        else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
            // Non-tiled parent: members may occupy arbitrary lanes. Walk the
            // parent's mask and collect the tilesz member bits that belong to
            // the calling thread's tile (skipping the members of earlier tiles).
            unsigned int mask = 0;
            unsigned int member_rank = 0;
            int seen_lanes = (thread_rank() / tilesz) * tilesz;
            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
                if (lane_bit) {
                    if (seen_lanes <= 0 && member_rank < tilesz) {
                        mask |= lane_bit;
                        member_rank++;
                    }
                    seen_lanes--;
                }
            }
            coalesced_group coalesced_tile = coalesced_group(mask);
            // Override parent with the size of this group
            coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
            return coalesced_tile;
        }
        else {
            // None in _CG_VERSION 1000
            details::abort();
        }

        return (coalesced_group(0));
    }

protected:
    // Construct a group over the lanes set in mask; size is derived from the
    // mask population count, meta-group fields default to a single partition.
    _CG_QUALIFIER coalesced_group(unsigned int mask) {
        _data.coalesced.mask = mask;
        _data.coalesced.size = __popc(mask);
        _data.coalesced.metaGroupRank = 0;
        _data.coalesced.metaGroupSize = 1;
        _data.coalesced.is_tiled = false;
    }

    // Lane mask of the member threads of this group.
    _CG_QUALIFIER unsigned int get_mask() const {
        return (_data.coalesced.mask);
    }

public:
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)

    // Number of member threads (popcount of the group mask).
    _CG_QUALIFIER unsigned int num_threads() const {
        return _data.coalesced.size;
    }

    _CG_QUALIFIER unsigned int size() const {
        return num_threads();
    }

    // Rank of the calling thread among the group members: the number of
    // member lanes strictly below the calling lane.
    _CG_QUALIFIER unsigned int thread_rank() const {
        return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
    }

    // Rank of this group in the upper level of the hierarchy
    _CG_QUALIFIER unsigned int meta_group_rank() const {
        return _data.coalesced.metaGroupRank;
    }

    // Total num partitions created out of all CTAs when the group was created
    _CG_QUALIFIER unsigned int meta_group_size() const {
        return _data.coalesced.metaGroupSize;
    }

    // Warp-level barrier over exactly the member lanes.
    _CG_QUALIFIER void sync() const {
        __syncwarp(_data.coalesced.mask);
    }

#ifdef _CG_CPP11_FEATURES
    // Broadcast elem from the member with rank srcRank. The group rank is
    // translated to a physical lane id before dispatching the shuffle.
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
        unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
            (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }

    // Read elem from the member delta ranks above the caller; out-of-range
    // reads return the caller's own value (lane falls back to laneid()).
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
        if (size() == 32) {
            return details::tile::shuffle_dispatch<TyElem>::shfl_down(
                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
        }

        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);

        if (lane >= 32)
            lane = details::laneid();

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }

    // Read elem from the member delta ranks below the caller; out-of-range
    // reads return the caller's own value.
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
        if (size() == 32) {
            return details::tile::shuffle_dispatch<TyElem>::shfl_up(
                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
        }

        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
        if (lane >= 32)
            lane = details::laneid();

        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
    }
#else
    // Pre-C++11 variants: restricted to arithmetic types, using the raw
    // __shfl_sync intrinsics with the same rank-to-lane translation.
    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
        }
        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
        if (lane >= 32) lane = details::laneid();
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
        }
        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
        if (lane >= 32) lane = details::laneid();
        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
    }
#endif

    // True if the predicate is non-zero on any member.
    _CG_QUALIFIER int any(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
    }
    // True if the predicate is non-zero on every member.
    _CG_QUALIFIER int all(int predicate) const {
        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
    }
    // Ballot expressed in group-rank bit positions (full-warp groups use the
    // raw warp ballot; otherwise the lane ballot is repacked via _packLanes).
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        if (size() == 32) {
            return (__ballot_sync(0xFFFFFFFF, predicate));
        }
        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
        return (_packLanes(lane_ballot));
    }

#ifdef _CG_HAS_MATCH_COLLECTIVE

    // Mask (in group-rank bit positions) of members holding the same val.
    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__match_any_sync(0xFFFFFFFF, val));
        }
        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
        return (_packLanes(lane_match));
    }

    // Match-all over the members; pred is set by the underlying intrinsic.
    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        if (size() == 32) {
            return (__match_all_sync(0xFFFFFFFF, val, &pred));
        }
        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
        return (_packLanes(lane_match));
    }

#endif /* !_CG_HAS_MATCH_COLLECTIVE */

};
941
+
942
// Build a coalesced_group over the currently converged threads of the warp,
// as reported by __activemask() at the call site.
_CG_QUALIFIER coalesced_group coalesced_threads()
{
    return (coalesced_group(__activemask()));
}
946
+
947
namespace details {
    // Compile-time check that a tile size is natively supported: only the
    // listed specializations (32/16/8/4/2/1) provide the nested OK typedef,
    // so any other Size fails to instantiate.
    template <unsigned int Size> struct verify_thread_block_tile_size;
    template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
    template <> struct verify_thread_block_tile_size<1> { typedef void OK; };

#ifdef _CG_CPP11_FEATURES
    // C++11 trait forms of the same size constraints.
    template <unsigned int Size>
    using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;

    template <unsigned int Size>
    using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
    template <unsigned int Size>
    using _is_multi_warp =
        _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;

    template <unsigned int Size>
    using _is_valid_single_warp_tile =
        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
    template <unsigned int Size>
    using _is_valid_multi_warp_tile =
        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
#else
    // Without C++11, multi-warp tiles are never available.
    template <unsigned int Size>
    struct _is_multi_warp {
        static const bool value = false;
    };
#endif
}
979
+
980
// Common base for statically-sized tiles: stores the compile-time size and
// derives the thread's rank from its CTA rank (valid because Size is a
// power of two, so the rank is a bit-mask of the CTA rank).
template <unsigned int Size>
class __static_size_tile_base
{
protected:
    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;

public:
    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)

    // Rank of thread within tile
    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
        return (details::cta::thread_rank() & (numThreads - 1));
    }

    // Number of threads within tile
    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
        return numThreads;
    }

    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
        return num_threads();
    }
};
1003
+
1004
// Single-warp statically-sized tile: all collectives operate on the lane
// mask computed by build_mask(), which covers exactly this tile's lanes.
template <unsigned int Size>
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
{
    friend class details::_coalesced_group_data_access;
    typedef details::tile::tile_helpers<Size> th;

#ifdef _CG_CPP11_FEATURES
    static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
#else
    typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
#endif
    using __static_size_tile_base<Size>::numThreads;
    _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;

protected:
    // Lane mask of the calling thread's tile: the full warp for Size == 32,
    // otherwise the tile's mask shifted to the partition the caller is in.
    _CG_STATIC_QUALIFIER unsigned int build_mask() {
        unsigned int mask = fullMask;
        if (numThreads != 32) {
            // [0,31] representing the current active thread in the warp
            unsigned int laneId = details::laneid();
            // shift mask according to the partition it belongs to
            mask = th::tileMask << (laneId & ~(th::laneMask));
        }
        return (mask);
    }

public:
    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;

    // Warp-level barrier over the tile's lanes.
    _CG_STATIC_QUALIFIER void sync() {
        __syncwarp(build_mask());
    }

#ifdef _CG_CPP11_FEATURES
    // PTX supported collectives
    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl_down(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl_up(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
    }

    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
    _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
        return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
    }
#else
    // Pre-C++11 variants restricted to arithmetic types, using the raw
    // *_sync shuffle intrinsics directly.
    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
    }
#endif //_CG_CPP11_FEATURES

    // True if the predicate is non-zero on any tile member.
    _CG_QUALIFIER int any(int predicate) const {
        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
        return (lane_ballot != 0);
    }
    // True if the predicate is non-zero on every tile member.
    _CG_QUALIFIER int all(int predicate) const {
        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
        return (lane_ballot == build_mask());
    }
    // Ballot shifted so bit 0 corresponds to the tile's first lane.
    _CG_QUALIFIER unsigned int ballot(int predicate) const {
        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
        return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
    }

#ifdef _CG_HAS_MATCH_COLLECTIVE
    // Match results are shifted into tile-relative bit positions like ballot().
    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        unsigned int lane_match = __match_any_sync(build_mask(), val);
        return (lane_match >> (details::laneid() & (~(th::laneMask))));
    }

    template <typename TyIntegral>
    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
        details::assert_if_not_arithmetic<TyIntegral>();
        unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
        return (lane_match >> (details::laneid() & (~(th::laneMask))));
    }
#endif

};
1118
+
1119
// Meta-group information derived statically from the parent group type:
// rank and count of Size-sized tiles within ParentT.
template <unsigned int Size, typename ParentT>
class __static_parent_thread_block_tile_base
{
public:
    // Rank of this group in the upper level of the hierarchy
    _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
        return ParentT::thread_rank() / Size;
    }

    // Total num partitions created out of all CTAs when the group was created
    // (ceiling division, so a partial trailing tile is counted).
    _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
        return (ParentT::size() + Size - 1) / Size;
    }
};
1133
+
1134
+ /**
1135
+ * class thread_block_tile<unsigned int Size, ParentT = void>
1136
+ *
1137
+ * Statically-sized group type, representing one tile of a thread block.
1138
+ * The only specializations currently supported are those with native
1139
+ * hardware support (1/2/4/8/16/32)
1140
+ *
1141
+ * This group exposes warp-synchronous builtins.
1142
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
1143
+ */
1144
+
1145
// Single-warp tile whose parent type is known statically: meta-group info
// comes from the static parent base, the mask from the static size base.
template <unsigned int Size, typename ParentT = void>
class __single_warp_thread_block_tile :
    public __static_size_thread_block_tile_base<Size>,
    public __static_parent_thread_block_tile_base<Size, ParentT>
{
    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
    friend class details::_coalesced_group_data_access;

protected:
    // No per-instance state; the (rank, size) constructor arguments are
    // ignored because meta-group info is computed statically from ParentT.
    _CG_QUALIFIER __single_warp_thread_block_tile() { };
    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };

    _CG_STATIC_QUALIFIER unsigned int get_mask() {
        return __static_size_thread_block_tile_base<Size>::build_mask();
    }
};
1161
+
1162
// Type-erased-parent specialization: meta-group rank/size are not known
// statically, so they are stored in the coalesced group data at construction.
template <unsigned int Size>
class __single_warp_thread_block_tile<Size, void> :
    public __static_size_thread_block_tile_base<Size>,
    public thread_group_base<details::coalesced_group_id>
{
    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;

    template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
    friend class details::_coalesced_group_data_access;

    typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;

protected:
    // Record the tile's mask and the caller-supplied meta-group position.
    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) {
        _data.coalesced.mask = staticSizeBaseT::build_mask();
        _data.coalesced.size = numThreads;
        _data.coalesced.metaGroupRank = meta_group_rank;
        _data.coalesced.metaGroupSize = meta_group_size;
        _data.coalesced.is_tiled = true;
    }

    _CG_QUALIFIER unsigned int get_mask() const {
        return (_data.coalesced.mask);
    }

public:
    using staticSizeBaseT::sync;
    using staticSizeBaseT::size;
    using staticSizeBaseT::num_threads;
    using staticSizeBaseT::thread_rank;

    // Rank of this tile among the partitions created from the parent.
    _CG_QUALIFIER unsigned int meta_group_rank() const {
        return _data.coalesced.metaGroupRank;
    }

    // Number of partitions created from the parent.
    _CG_QUALIFIER unsigned int meta_group_size() const {
        return _data.coalesced.metaGroupSize;
    }
};
1201
+
1202
+ /**
1203
+ * Outer level API calls
1204
+ * void sync(GroupT) - see <group_type>.sync()
1205
+ * void thread_rank(GroupT) - see <group_type>.thread_rank()
1206
+ * void group_size(GroupT) - see <group_type>.size()
1207
+ */
1208
// Free-function form of <group>.sync().
template <class GroupT>
_CG_QUALIFIER void sync(GroupT const &g)
{
    g.sync();
}
1213
+
1214
+ // TODO: Use a static dispatch to determine appropriate return type
1215
+ // C++03 is stuck with unsigned long long for now
1216
#ifdef _CG_CPP11_FEATURES
// Free-function form of <group>.thread_rank(); return type follows the group.
template <class GroupT>
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
    return g.thread_rank();
}


// Free-function form of <group>.num_threads(); return type follows the group.
template <class GroupT>
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
    return g.num_threads();
}
#else
// Pre-C++11 fallbacks: no decltype, so both widen to unsigned long long.
template <class GroupT>
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
    return static_cast<unsigned long long>(g.thread_rank());
}


template <class GroupT>
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
    return static_cast<unsigned long long>(g.num_threads());
}
#endif
1239
+
1240
+
1241
+ /**
1242
+ * tiled_partition
1243
+ *
1244
+ * The tiled_partition(parent, tilesz) method is a collective operation that
1245
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1246
+ *
1247
+ * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
1248
+ * be created where threads having identical k = (thread_rank(parent)/tilesz)
1249
+ * will be members of the same subgroup.
1250
+ *
1251
+ * The implementation may cause the calling thread to wait until all the members
1252
+ * of the parent group have invoked the operation before resuming execution.
1253
+ *
1254
+ * Functionality is limited to power-of-two sized subgroup instances of at most
1255
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
1256
+ * tiled_partition() in _CG_VERSION 1000.
1257
+ */
1258
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
    // Dispatch on the dynamic group id: coalesced groups tile through the
    // coalesced path; any other group is treated as a thread_block.
    if (parent.get_type() == details::coalesced_group_id) {
        const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
        return _cg->_get_tiled_threads(tilesz);
    }
    else {
        const thread_block *_tb = static_cast<const thread_block*>(&parent);
        return _tb->_get_tiled_threads(tilesz);
    }
}
1269
+
1270
+ // Thread block type overload: returns a basic thread_group for now (may be specialized later)
1271
// thread_block overload: delegates to the block's tiling helper.
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}
1275
+
1276
+ // Coalesced group type overload: retains its ability to stay coalesced
1277
// coalesced_group overload: the result stays a coalesced_group.
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
{
    return (parent._get_tiled_threads(tilesz));
}
1281
+
1282
namespace details {
    // Internal tile type with a protected-constructor bypass, built by
    // tiled_partition_internal() without going through the public API.
    template <unsigned int Size, typename ParentT>
    class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};

    template <unsigned int Size, typename ParentT>
    _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
        return internal_thread_block_tile<Size, ParentT>();
    }

    // Friend-access shims: these forward to private members of the multi-warp
    // group types so that non-member code can invoke them.
    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
    _CG_QUALIFIER TyVal multi_warp_collectives_helper(
            const GroupT& group,
            WarpLambda warp_lambda,
            InterWarpLambda inter_warp_lambda) {
        return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
    }

    template <typename T, typename GroupT>
    _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
        return group.template get_scratch_location<T>(warp_id);
    }

    template <typename GroupT>
    _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
        return group.get_sync_location();
    }

}
1310
+ /**
1311
+ * tiled_partition<tilesz>
1312
+ *
1313
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
1314
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
1315
+ *
1316
+ * A total of (size(parent)/tilesz) subgroups will be created,
1317
+ * therefore the parent group size must be evenly divisible by the tilesz.
1318
+ * The allowed parent groups are thread_block or thread_block_tile<size>.
1319
+ *
1320
+ * The implementation may cause the calling thread to wait until all the members
1321
+ * of the parent group have invoked the operation before resuming execution.
1322
+ *
1323
+ * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
1324
+ * The size(parent) must be greater than the template Size parameter
1325
+ * otherwise the results are undefined.
1326
+ */
1327
+
1328
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1329
+ template <unsigned int Size>
1330
+ class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
1331
+ {
1332
+ static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
1333
+
1334
+ template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
1335
+ friend TyVal details::multi_warp_collectives_helper(
1336
+ const GroupT& group,
1337
+ WarpLambda warp_lambda,
1338
+ InterWarpLambda inter_warp_lambda);
1339
+ template <typename T, typename GroupT>
1340
+ friend T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
1341
+ template <typename GroupT>
1342
+ friend details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
1343
+ template <unsigned int OtherSize>
1344
+ friend class __static_size_multi_warp_tile_base;
1345
+ using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
1346
+ using ThisType = __static_size_multi_warp_tile_base<Size>;
1347
+ _CG_STATIC_CONST_DECL int numWarps = Size / 32;
1348
+ const unsigned short communication_size;
1349
+ const unsigned short max_block_size;
1350
+
1351
+ protected:
1352
+ details::multi_warp_scratch* const tile_memory;
1353
+
1354
+ template <typename GroupT>
1355
+ _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) :
1356
+ tile_memory(g.tile_memory), communication_size(g.communication_size), max_block_size(g.max_block_size) {}
1357
+
1358
+
1359
+ private:
1360
+ _CG_QUALIFIER details::barrier_t* get_sync_location() const {
1361
+ // Different group sizes use different barriers, all groups of a given size share one barrier.
1362
+ unsigned int sync_id = details::log2(Size / 64);
1363
+ return &(reinterpret_cast<details::barrier_t*>(tile_memory->memory)[sync_id]);
1364
+ }
1365
+
1366
+ template <typename T>
1367
+ _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
1368
+ unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
1369
+ unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
1370
+ return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
1371
+ }
1372
+
1373
+ template <typename T>
1374
+ _CG_QUALIFIER T* get_scratch_location() const {
1375
+ unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
1376
+ unsigned int scratch_id = details::cta::thread_rank() / 32;
1377
+ return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
1378
+ }
1379
+
1380
+ template <typename TyVal>
1381
+ _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
1382
+ unsigned int src_warp = src / 32;
1383
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1384
+ details::barrier_t* sync_location = get_sync_location();
1385
+
1386
+ // Get warp slot of the source threads warp.
1387
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
1388
+
1389
+ if (warp.meta_group_rank() == src_warp) {
1390
+ // Put shuffled value into my warp slot and let my warp arrive at the barrier.
1391
+ if (thread_rank() == src) {
1392
+ *warp_scratch_location = val;
1393
+ }
1394
+ details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
1395
+ TyVal result = *warp_scratch_location;
1396
+ details::sync_warps_wait(sync_location, details::cta::thread_rank());
1397
+ return result;
1398
+ }
1399
+ else {
1400
+ // Wait for the source warp to arrive on the barrier.
1401
+ details::sync_warps_wait_for_warps<details::wait_for_specific_warp>(
1402
+ (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp),
1403
+ sync_location, details::cta::thread_rank(),
1404
+ numWarps);
1405
+ TyVal result = *warp_scratch_location;
1406
+ details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
1407
+ return result;
1408
+ }
1409
+ }
1410
+
1411
+ template <typename TyVal>
1412
+ _CG_QUALIFIER TyVal shfl_iterative_impl(TyVal val, unsigned int src) const {
1413
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1414
+
1415
+ details::copy_channel<numWarps> broadcast_channel{
1416
+ get_scratch_location<char>(0),
1417
+ get_sync_location(),
1418
+ (size_t) communication_size * numWarps};
1419
+
1420
+ if (warp.meta_group_rank() == src / 32) {
1421
+ val = warp.shfl(val, src);
1422
+ broadcast_channel.template send_value<
1423
+ TyVal, 32, decltype(broadcast_channel)::send_many_to_many>(
1424
+ val, warp.thread_rank(), details::cta::thread_rank() / 32);
1425
+ }
1426
+ else {
1427
+ broadcast_channel.template receive_value<TyVal>(val, warp.thread_rank() == 0);
1428
+ }
1429
+ sync();
1430
+ return val;
1431
+ }
1432
+
1433
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1434
+ _CG_QUALIFIER TyVal collectives_scheme_impl(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
1435
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1436
+ details::barrier_t* sync_location = get_sync_location();
1437
+ TyVal* warp_scratch_location = get_scratch_location<TyVal>();
1438
+
1439
+ warp_lambda(warp, warp_scratch_location);
1440
+
1441
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
1442
+ auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
1443
+ if (subwarp.meta_group_rank() == 0) {
1444
+ TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
1445
+ inter_warp_lambda(subwarp, thread_scratch_location);
1446
+ }
1447
+ warp.sync();
1448
+ details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
1449
+ }
1450
+ TyVal result = *warp_scratch_location;
1451
+ warp.sync(); // Added warpsync, if all collectives do sync before writing to reduce_location (they does right now),
1452
+ // we could delete it.
1453
+ return result;
1454
+ }
1455
+
1456
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1457
+ _CG_QUALIFIER TyVal collectives_scheme_iterative_impl(
1458
+ const WarpLambda& warp_lambda,
1459
+ const InterWarpLambda& inter_warp_lambda) const {
1460
+ auto warp = details::tiled_partition_internal<32, ThisType>();
1461
+ details::barrier_t* sync_location = get_sync_location();
1462
+ details::copy_channel<numWarps> final_result_channel{
1463
+ get_scratch_location<char>(0),
1464
+ sync_location,
1465
+ (size_t) communication_size * numWarps};
1466
+
1467
+ TyVal warp_result;
1468
+ warp_lambda(warp, &warp_result);
1469
+
1470
+ if (warp.meta_group_rank() == 0) {
1471
+ auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
1472
+ details::copy_channel<numWarps> partial_results_channel{
1473
+ get_scratch_location<char>(subwarp.thread_rank()),
1474
+ sync_location,
1475
+ (size_t) communication_size};
1476
+
1477
+ // Thread 0 in subwarp set as inactive to not overwrite warp 0 warp_result.
1478
+ partial_results_channel.template receive_value<TyVal>(
1479
+ warp_result,
1480
+ warp.thread_rank() == 0,
1481
+ subwarp.thread_rank() != 0 && subwarp.meta_group_rank() == 0);
1482
+ if (subwarp.meta_group_rank() == 0) {
1483
+ inter_warp_lambda(subwarp, &warp_result);
1484
+ }
1485
+ warp_result = warp.shfl(warp_result, 0);
1486
+ final_result_channel.template send_value<TyVal, 32, decltype(final_result_channel)::send_many_to_many>(
1487
+ warp_result,
1488
+ warp.thread_rank(),
1489
+ details::cta::thread_rank() / 32);
1490
+ }
1491
+ else {
1492
+ details::copy_channel<numWarps> partial_results_channel{get_scratch_location<char>(), sync_location, (size_t) communication_size};
1493
+ partial_results_channel.template send_value<TyVal, 32, decltype(partial_results_channel)::send_many_to_one>(
1494
+ warp_result,
1495
+ warp.thread_rank(),
1496
+ (details::cta::thread_rank() - thread_rank()) / 32);
1497
+ final_result_channel.template receive_value<TyVal>(warp_result, warp.thread_rank() == 0);
1498
+ }
1499
+ sync();
1500
+ return warp_result;
1501
+ }
1502
+
1503
+ template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
1504
+ _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
1505
+ if (sizeof(TyVal) > communication_size) {
1506
+ return collectives_scheme_iterative_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
1507
+ }
1508
+ else {
1509
+ return collectives_scheme_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
1510
+ }
1511
+ }
1512
+
1513
+ public:
1514
+ _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
1515
+
1516
+ using __static_size_tile_base<Size>::thread_rank;
1517
+
1518
+ template <typename TyVal>
1519
+ _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
1520
+ if (sizeof(TyVal) > communication_size) {
1521
+ return shfl_iterative_impl(val, src);
1522
+ }
1523
+ else {
1524
+ return shfl_impl(val, src);
1525
+ }
1526
+ }
1527
+
1528
+ _CG_QUALIFIER void sync() const {
1529
+ details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
1530
+ }
1531
+
1532
+ _CG_QUALIFIER int any(int predicate) const {
1533
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1534
+ *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
1535
+ };
1536
+ auto inter_warp_lambda =
1537
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1538
+ *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
1539
+ };
1540
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1541
+ }
1542
+
1543
+ _CG_QUALIFIER int all(int predicate) const {
1544
+ auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
1545
+ *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
1546
+ };
1547
+ auto inter_warp_lambda =
1548
+ [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
1549
+ *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
1550
+ };
1551
+ return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
1552
+ }
1553
+ };
1554
+
1555
+
1556
+ template <unsigned int Size, typename ParentT = void>
1557
+ class __multi_warp_thread_block_tile :
1558
+ public __static_size_multi_warp_tile_base<Size>,
1559
+ public __static_parent_thread_block_tile_base<Size, ParentT>
1560
+ {
1561
+ typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
1562
+ typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
1563
+ protected:
1564
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
1565
+ __static_size_multi_warp_tile_base<Size>(g) {}
1566
+ };
1567
+
1568
+ template <unsigned int Size>
1569
+ class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
1570
+ {
1571
+ const unsigned int metaGroupRank;
1572
+ const unsigned int metaGroupSize;
1573
+
1574
+ protected:
1575
+ template <unsigned int OtherSize, typename ParentT>
1576
+ _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
1577
+ __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
1578
+
1579
+ public:
1580
+ _CG_QUALIFIER unsigned int meta_group_rank() const {
1581
+ return metaGroupRank;
1582
+ }
1583
+
1584
+ _CG_QUALIFIER unsigned int meta_group_size() const {
1585
+ return metaGroupSize;
1586
+ }
1587
+ };
1588
+ #endif
1589
+
1590
+ template <unsigned int Size, typename ParentT = void>
1591
+ class thread_block_tile;
1592
+
1593
+ namespace details {
1594
+ template <unsigned int Size, typename ParentT, bool IsMultiWarp>
1595
+ class thread_block_tile_impl;
1596
+
1597
+ template <unsigned int Size, typename ParentT>
1598
+ class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
1599
+ {
1600
+ protected:
1601
+ template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
1602
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
1603
+ __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
1604
+
1605
+ _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
1606
+ __single_warp_thread_block_tile<Size, ParentT>() {}
1607
+ };
1608
+
1609
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1610
+ template <unsigned int Size, typename ParentT>
1611
+ class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
1612
+ {
1613
+ protected:
1614
+ template <typename GroupT>
1615
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
1616
+ __multi_warp_thread_block_tile<Size, ParentT>(g) {}
1617
+ };
1618
+ #else
1619
+ template <unsigned int Size, typename ParentT>
1620
+ class thread_block_tile_impl<Size, ParentT, true>
1621
+ {
1622
+ protected:
1623
+ template <typename GroupT>
1624
+ _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
1625
+ };
1626
+ #endif
1627
+ }
1628
+
1629
+ template <unsigned int Size, typename ParentT>
1630
+ class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
1631
+ {
1632
+ friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
1633
+
1634
+ protected:
1635
+ _CG_QUALIFIER thread_block_tile(const ParentT& g) :
1636
+ details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
1637
+
1638
+ public:
1639
+ _CG_QUALIFIER operator thread_block_tile<Size, void>() const {
1640
+ return thread_block_tile<Size, void>(*this);
1641
+ }
1642
+ };
1643
+
1644
+ template <unsigned int Size>
1645
+ class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
1646
+ {
1647
+ template <unsigned int, typename ParentT>
1648
+ friend class thread_block_tile;
1649
+
1650
+ protected:
1651
+ template <unsigned int OtherSize, typename OtherParentT>
1652
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
1653
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1654
+
1655
+ public:
1656
+ template <typename ParentT>
1657
+ _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
1658
+ details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
1659
+ };
1660
+
1661
+ namespace details {
1662
+ template <unsigned int Size, typename ParentT>
1663
+ struct tiled_partition_impl;
1664
+
1665
+ template <unsigned int Size>
1666
+ struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
1667
+ _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
1668
+ thread_block_tile<Size, thread_block>(g) {}
1669
+ };
1670
+
1671
+ // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
1672
+ template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
1673
+ struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
1674
+ public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
1675
+ #ifdef _CG_CPP11_FEATURES
1676
+ static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
1677
+ #endif
1678
+ _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
1679
+ thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
1680
+ };
1681
+
1682
+ }
1683
+
1684
+ namespace experimental {
1685
+ template <unsigned int Size, typename ParentT>
1686
+ _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
1687
+ {
1688
+ #if defined(_CG_CPP11_FEATURES) && !defined(_CG_ABI_EXPERIMENTAL)
1689
+ static_assert(details::_is_single_warp<Size>::value, "_CG_ABI_EXPERIMENTAL needs to be defined"
1690
+ " before cooperative_groups header is included to enable experimental features");
1691
+ #endif
1692
+ return details::tiled_partition_impl<Size, ParentT>(g);
1693
+ }
1694
+
1695
+ }
1696
+
1697
+ template <unsigned int Size, typename ParentT>
1698
+ _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
1699
+ {
1700
+ #ifdef _CG_CPP11_FEATURES
1701
+ static_assert(details::_is_single_warp<Size>::value, "Tiled partition with Size > 32 is supported only by"
1702
+ " cooperative_groups::experimental::tiled_partition available with experimental features enabled");
1703
+ #endif
1704
+ return details::tiled_partition_impl<Size, ParentT>(g);
1705
+ }
1706
+
1707
+ /**
1708
+ * thread_group this_thread()
1709
+ *
1710
+ * Constructs a generic thread_group containing only the calling thread
1711
+ */
1712
+ _CG_QUALIFIER thread_block_tile<1, void> this_thread()
1713
+ {
1714
+ // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
1715
+ // meta group rank and size set to 0 and 1 respectively.
1716
+ return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
1717
+ }
1718
+
1719
+ /**
1720
+ * <group_type>.sync()
1721
+ *
1722
+ * Executes a barrier across the group
1723
+ *
1724
+ * Implements both a compiler fence and an architectural fence to prevent,
1725
+ * memory reordering around the barrier.
1726
+ */
1727
+ _CG_QUALIFIER void thread_group::sync() const
1728
+ {
1729
+ switch (_data.group.type) {
1730
+ case details::coalesced_group_id:
1731
+ cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
1732
+ break;
1733
+ case details::thread_block_id:
1734
+ cooperative_groups::sync(*static_cast<const thread_block*>(this));
1735
+ break;
1736
+ case details::grid_group_id:
1737
+ cooperative_groups::sync(*static_cast<const grid_group*>(this));
1738
+ break;
1739
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1740
+ case details::multi_grid_group_id:
1741
+ cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
1742
+ break;
1743
+ #endif
1744
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1745
+ case details::cluster_group_id:
1746
+ cooperative_groups::sync(*static_cast<const cluster_group*>(this));
1747
+ break;
1748
+ #endif
1749
+ default:
1750
+ break;
1751
+ }
1752
+ }
1753
+
1754
+ /**
1755
+ * <group_type>.size()
1756
+ *
1757
+ * Returns the total number of threads in the group.
1758
+ */
1759
+ _CG_QUALIFIER unsigned long long thread_group::size() const
1760
+ {
1761
+ unsigned long long size = 0;
1762
+ switch (_data.group.type) {
1763
+ case details::coalesced_group_id:
1764
+ size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
1765
+ break;
1766
+ case details::thread_block_id:
1767
+ size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
1768
+ break;
1769
+ case details::grid_group_id:
1770
+ size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
1771
+ break;
1772
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1773
+ case details::multi_grid_group_id:
1774
+ size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
1775
+ break;
1776
+ #endif
1777
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1778
+ case details::cluster_group_id:
1779
+ size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
1780
+ break;
1781
+ #endif
1782
+ default:
1783
+ break;
1784
+ }
1785
+ return size;
1786
+ }
1787
+
1788
+ /**
1789
+ * <group_type>.thread_rank()
1790
+ *
1791
+ * Returns the linearized rank of the calling thread along the interval [0, size()).
1792
+ */
1793
+ _CG_QUALIFIER unsigned long long thread_group::thread_rank() const
1794
+ {
1795
+ unsigned long long rank = 0;
1796
+ switch (_data.group.type) {
1797
+ case details::coalesced_group_id:
1798
+ rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
1799
+ break;
1800
+ case details::thread_block_id:
1801
+ rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
1802
+ break;
1803
+ case details::grid_group_id:
1804
+ rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
1805
+ break;
1806
+ #if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
1807
+ case details::multi_grid_group_id:
1808
+ rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
1809
+ break;
1810
+ #endif
1811
+ #if defined(_CG_HAS_CLUSTER_GROUP)
1812
+ case details::cluster_group_id:
1813
+ rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
1814
+ break;
1815
+ #endif
1816
+ default:
1817
+ break;
1818
+ }
1819
+ return rank;
1820
+ }
1821
+
1822
+ _CG_END_NAMESPACE
1823
+
1824
+ #include <cooperative_groups/details/partitioning.h>
1825
+
1826
+ # endif /* ! (__cplusplus, __CUDACC__) */
1827
+
1828
+ #endif /* !_COOPERATIVE_GROUPS_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_COALESCED_REDUCE_H_
50
+ #define _CG_COALESCED_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "cooperative_groups.h"
55
+ #include "partitioning.h"
56
+ #include "coalesced_scan.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename TyVal, typename TyOp>
63
+ _CG_QUALIFIER auto coalesced_reduce_to_one(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
64
+ if (group.size() == 32) {
65
+ auto out = val;
66
+ for (int offset = group.size() >> 1; offset > 0; offset >>= 1) {
67
+ out = op(out, group.shfl_up(out, offset));
68
+ }
69
+ return out;
70
+ }
71
+ else {
72
+ auto scan_result =
73
+ inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
74
+ return scan_result;
75
+ }
76
+ }
77
+
78
+ template <typename TyVal, typename TyOp>
79
+ _CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
80
+ auto out = coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
81
+ if (group.size() == 32) {
82
+ return group.shfl(out, 31);
83
+ }
84
+ else {
85
+ unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
86
+ unsigned int last_thread_id = 31 - __clz(group_mask);
87
+ return details::tile::shuffle_dispatch<TyVal>::shfl(
88
+ _CG_STL_NAMESPACE::forward<TyVal>(out), group_mask, last_thread_id, 32);
89
+ }
90
+ }
91
+
92
+ template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
93
+ _CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group,
94
+ TyVal&& val,
95
+ TyOp&& op) -> decltype(op(val, val)) {
96
+ auto out = val;
97
+ for (int mask = TySize >> 1; mask > 0; mask >>= 1) {
98
+ out = op(out, group.shfl_xor(out, mask));
99
+ }
100
+
101
+ return out;
102
+ }
103
+
104
+ } // details
105
+
106
+ _CG_END_NAMESPACE
107
+
108
+ #endif // _CG_COALESCED_REDUCE_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_FUNCTIONAL_H
50
+ #define _CG_FUNCTIONAL_H
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ #ifdef _CG_USE_CUDA_STL
57
+ # include <cuda/std/functional>
58
+ #endif
59
+
60
+ _CG_BEGIN_NAMESPACE
61
+
62
+ namespace details {
63
+ #ifdef _CG_USE_CUDA_STL
64
+ using cuda::std::plus;
65
+ using cuda::std::bit_and;
66
+ using cuda::std::bit_xor;
67
+ using cuda::std::bit_or;
68
+ #else
69
+ template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
70
+ template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
71
+ template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
72
+ template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
73
+ #endif // _CG_USE_PLATFORM_STL
74
+ } // details
75
+
76
+ template <typename Ty>
77
+ struct plus : public details::plus<Ty> {};
78
+
79
+ template <typename Ty>
80
+ struct less {
81
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
82
+ return (arg2 < arg1) ? arg2 : arg1;
83
+ }
84
+ };
85
+
86
+ template <typename Ty>
87
+ struct greater {
88
+ __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
89
+ return (arg1 < arg2) ? arg2 : arg1;
90
+ }
91
+ };
92
+
93
+ template <typename Ty>
94
+ struct bit_and : public details::bit_and<Ty> {};
95
+
96
+ template <typename Ty>
97
+ struct bit_xor : public details::bit_xor<Ty> {};
98
+
99
+ template <typename Ty>
100
+ struct bit_or : public details::bit_or<Ty> {};
101
+
102
+ #if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
103
+ namespace details {
104
+ template <class Ty>
105
+ using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
106
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
107
+
108
+ template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {};
109
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {};
110
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {};
111
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
112
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
113
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {};
114
+ template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
115
+
116
+ template<typename TyAtomic, typename TyVal, typename TyOp>
117
+ _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
118
+ remove_qual<TyVal> old = atomic;
119
+ while(!atomic.compare_exchange_weak(old, op(old, val)));
120
+ return old;
121
+ }
122
+
123
+ template<typename TyOp>
124
+ struct op_picker;
125
+
126
+ template<typename TyVal>
127
+ struct op_picker<cooperative_groups::plus<TyVal>> {
128
+ template<typename TyAtomic>
129
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
130
+ return atomic.fetch_add(val);
131
+ }
132
+ };
133
+
134
+ template<typename TyVal>
135
+ struct op_picker<cooperative_groups::less<TyVal>> {
136
+ template<typename TyAtomic>
137
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
138
+ return atomic.fetch_min(val);
139
+ }
140
+ };
141
+
142
+ template<typename TyVal>
143
+ struct op_picker<cooperative_groups::greater<TyVal>> {
144
+ template<typename TyAtomic>
145
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
146
+ return atomic.fetch_max(val);
147
+ }
148
+ };
149
+
150
+ template<typename TyVal>
151
+ struct op_picker<cooperative_groups::bit_and<TyVal>> {
152
+ template<typename TyAtomic>
153
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
154
+ return atomic.fetch_and(val);
155
+ }
156
+ };
157
+
158
+ template<typename TyVal>
159
+ struct op_picker<cooperative_groups::bit_xor<TyVal>> {
160
+ template<typename TyAtomic>
161
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
162
+ return atomic.fetch_xor(val);
163
+ }
164
+ };
165
+
166
+ template<typename TyVal>
167
+ struct op_picker<cooperative_groups::bit_or<TyVal>> {
168
+ template<typename TyAtomic>
169
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
170
+ return atomic.fetch_or(val);
171
+ }
172
+ };
173
+
174
+ template<bool atomic_supported>
175
+ struct atomic_update_dispatch {};
176
+
177
+ template<>
178
+ struct atomic_update_dispatch<false> {
179
+ template<typename TyAtomic, typename TyVal, typename TyOp>
180
+ _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
181
+ return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
182
+ }
183
+ };
184
+
185
+ template<>
186
+ struct atomic_update_dispatch<true> {
187
+ template<typename TyAtomic, typename TyVal, typename TyOp>
188
+ _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
189
+ using dispatch = op_picker<details::remove_qual<TyOp>>;
190
+
191
+ return dispatch::atomic_update(atomic, val);
192
+ }
193
+ };
194
+
195
+ template<typename TyAtomic, typename TyVal, typename TyOp>
196
+ _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
197
+ using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
198
+
199
+ return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
200
+ }
201
+ }
202
+ #endif
203
+
204
+ _CG_END_NAMESPACE
205
+
206
+ #endif
207
+ #endif //_CG_FUNCTIONAL_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h ADDED
@@ -0,0 +1,707 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_HELPERS_H_
50
+ # define _COOPERATIVE_GROUPS_HELPERS_H_
51
+
52
+ #include "info.h"
53
+ #include "sync.h"
54
+
55
+ _CG_BEGIN_NAMESPACE
56
+
57
+ namespace details {
58
#ifdef _CG_CPP11_FEATURES
// Trait: true for built-in floating-point types, and additionally for
// __half/__half2 when FP16 collectives are available (cuda_fp16.h was
// included before this header -- see _CG_HAS_FP16_COLLECTIVE in info.h).
template <typename Ty> struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};
# ifdef _CG_HAS_FP16_COLLECTIVE
template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {};
template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
# endif
template <typename Ty>
using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;

// Non-STL utility templates
// Strips the reference, then cv-qualifiers (a remove_cvref equivalent).
template <typename Ty>
using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;

// True when both operand types are the same after stripping cv/ref qualifiers.
template <typename TyLhs, typename TyRhs>
using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
>;
#endif

// Flattens a 3D index into a linear rank: z-major, then y, then x
// (x is the fastest-varying dimension). TyTrunc selects the width used for
// the promoted arithmetic and the return value.
template <typename TyTrunc>
_CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
    return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
           ((TyTrunc)index.y * nIndex.x) +
            (TyTrunc)index.x;
}
82
+
83
// Block-scope (CTA) queries and synchronization used by thread_block groups.
namespace cta {

// Barrier across the entire thread block (PTX barrier 0).
_CG_STATIC_QUALIFIER void sync()
{
    __barrier_sync(0);
}

// Total number of threads in the calling block.
_CG_STATIC_QUALIFIER unsigned int num_threads()
{
    return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
}

// Linear rank of the calling thread within its block (x fastest-varying).
_CG_STATIC_QUALIFIER unsigned int thread_rank()
{
    return vec3_to_linear<unsigned int>(threadIdx, blockDim);
}

// 3D index of this block within the grid.
_CG_STATIC_QUALIFIER dim3 group_index()
{
    return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
}

// 3D index of the calling thread within its block.
_CG_STATIC_QUALIFIER dim3 thread_index()
{
    return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
}

// Block dimensions as a dim3.
_CG_STATIC_QUALIFIER dim3 dim_threads()
{
    return dim3(blockDim.x, blockDim.y, blockDim.z);
}

// Legacy aliases
_CG_STATIC_QUALIFIER unsigned int size()
{
    return num_threads();
}

_CG_STATIC_QUALIFIER dim3 block_dim()
{
    return dim_threads();
}

};
127
+
128
// Friend-style accessor giving internal code access to coalesced/tiled group
// private state (lane masks and meta-group bookkeeping).
class _coalesced_group_data_access {
public:
    // Retrieve mask of coalesced groups
    template <typename TyGroup>
    _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
        return group.get_mask();
    }

    // Retrieve mask of tiles
    // NOTE(review): the spelling 'build_maks' must match the member function
    // declared on the tile type elsewhere in this library -- confirm before
    // any rename.
    template <template <typename, typename> typename TyGroup, typename Sz, typename Parent>
    _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup<Sz, Parent> &group) {
        return group.build_maks();
    }

    // Construct a group directly from a lane mask (bypasses normal creation).
    template <typename TyGroup>
    _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
        return TyGroup(mask);
    }

    // Overwrite the group's meta-group rank/size bookkeeping fields.
    template <typename TyGroup>
    _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
        group._data.coalesced.metaGroupRank = mgRank;
        group._data.coalesced.metaGroupSize = mgSize;
    }
};
153
+
154
+ namespace tile {
155
// Per-tile-size constants used by tiled partitioning of a 32-lane warp:
//   tileCount  - number of tiles of this size per warp
//   tileMask   - lane mask covering the first tile
//   laneMask   - mask extracting a lane's offset within its tile
//   shiftCount - log2 of the tile size
template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
struct _tile_helpers{
    _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
    _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
    _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
    _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
};

// Specializations for every supported power-of-two tile size.
template <unsigned int> struct tile_helpers;
template <> struct tile_helpers<32> : public _tile_helpers<1,  0xFFFFFFFF, 0x1F, 5> {};
template <> struct tile_helpers<16> : public _tile_helpers<2,  0x0000FFFF, 0x0F, 4> {};
template <> struct tile_helpers<8>  : public _tile_helpers<4,  0x000000FF, 0x07, 3> {};
template <> struct tile_helpers<4>  : public _tile_helpers<8,  0x0000000F, 0x03, 2> {};
template <> struct tile_helpers<2>  : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
template <> struct tile_helpers<1>  : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
170
+
171
#ifdef _CG_CPP11_FEATURES
namespace shfl {
    /***********************************************************************************
     * Recursively Sliced Shuffle
     * Purpose:
     *   Slices an input type a number of times into integral types so that shuffles
     *   are well defined
     * Expectations:
     *   This object *should not* be used from a reinterpret_cast pointer unless
     *   some alignment guarantees can be met. Use a memcpy to guarantee that loads
     *   from the integral types stored within are aligned and correct.
     **********************************************************************************/
    // Primary template; the bool parameter selects the base case once the
    // remaining byte count fits inside a single int.
    template <unsigned int count, bool intSized = (count <= sizeof(int))>
    struct recursive_sliced_shuffle_helper;

    // Base case: value fits in one int slice; apply the shuffle functor once.
    template <unsigned int count>
    struct recursive_sliced_shuffle_helper<count, true> {
        int val;

        template <typename TyFn>
        _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
            val = shfl(val);
        }
    };

    // Recursive case: shuffle this int slice, then recurse on the remainder.
    template <unsigned int count>
    struct recursive_sliced_shuffle_helper<count, false> {
        int val;
        recursive_sliced_shuffle_helper<count - sizeof(int)> next;

        template <typename TyFn>
        _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
            val = shfl(val);
            next.invoke_shuffle(shfl);
        }
    };
}
208
+
209
// Fallback backend for types larger than _MemoryShuffleCutoff. Deliberately
// unimplemented: instantiating _shfl_internal trips the static_assert below,
// and the lambdas are placeholders that return 0.
struct _memory_shuffle {
    template <typename TyElem, typename TyShflFn>
    _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
        // sizeof(TyElem) > 0 is always true; the assert exists purely to emit
        // this diagnostic when the in-memory path is instantiated.
        static_assert(sizeof(TyElem) > 0, "in memory shuffle is not yet implemented");
        return TyElem{};
    }

    // TyRet strips cv/ref so the value is returned by value, never as a
    // dangling reference.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return 0;
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }
};
252
+
253
/***********************************************************************************
 * Intrinsic Device Function Shuffle
 * Purpose:
 *   Uses a shuffle helper that has characteristics best suited for moving
 *   elements between threads
 * Expectations:
 *   Object given will be forced into an l-value type so that it can be used
 *   with a helper structure that reinterprets the data into intrinsic compatible
 *   types
 * Notes:
 *   !! TyRet is required so that objects are returned by value and not as
 *   dangling references depending on the value category of the passed object
 **********************************************************************************/
struct _intrinsic_compat_shuffle {
    template <unsigned int count>
    using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;

    // Copies elem into an int-sliced helper, shuffles each slice with fn,
    // then copies the result back. memcpy (not reinterpret_cast) guarantees
    // aligned, well-defined access to the slices.
    template <typename TyElem, typename TyShflFn>
    _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
        static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
        shfl_helper<sizeof(TyElem)> helper;
        memcpy(&helper, &elem, sizeof(TyElem));
        helper.invoke_shuffle(fn);
        memcpy(&elem, &helper, sizeof(TyElem));
        return elem;
    }

    // Broadcast from srcRank to every lane covered by gMask.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_sync(gMask, val, srcRank, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    // Read from the lane `delta` positions above the caller.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_down_sync(gMask, val, delta, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    // Read from the lane `delta` positions below the caller.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_up_sync(gMask, val, delta, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }

    // Read from the lane whose index is the caller's XOR lMask.
    template <typename TyElem, typename TyRet = remove_qual<TyElem>>
    _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
        auto shfl = [=](int val) -> int {
            return __shfl_xor_sync(gMask, val, lMask, threads);
        };

        return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
    }
};
316
+
317
// Backend for scalar types the hardware shuffles directly: each method is a
// thin cast-wrapper around the corresponding *_sync warp intrinsic.
struct _native_shuffle {
    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl(
        TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
        return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
    }

    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_down(
        TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
    }

    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_up(
        TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
        return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
    }

    template <typename TyElem>
    _CG_STATIC_QUALIFIER TyElem shfl_xor(
        TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
        return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
    }
};
342
+
343
// Almost all arithmetic types are supported by native shuffle
// Vector types are the exception
template <typename TyElem>
using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
    bool,
    _CG_STL_NAMESPACE::is_integral<
        remove_qual<TyElem>>::value ||
    details::is_float_or_half<
        remove_qual<TyElem>>::value
>;

// Types larger than this many bytes fall back to the in-memory path.
constexpr unsigned long long _MemoryShuffleCutoff = 32;

// Selects the shuffle backend by element type:
//   native    -> integral / supported floating-point scalars
//   intrinsic -> other trivially-copyable types up to the cutoff
//   memory    -> larger types (currently unimplemented; see _memory_shuffle)
template <typename TyElem,
          bool IsNative = use_native_shuffle<TyElem>::value,
          bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
struct shuffle_dispatch;

template <typename TyElem>
struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {};

template <typename TyElem>
struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};

template <typename TyElem>
struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {};
369
+
370
+ #endif //_CG_CPP11_FEATURES
371
+ };
372
+
373
// Forward declaration of the multi-grid function table; defined further down
// when _CG_HAS_MULTI_GRID_GROUP is available.
namespace multi_grid {
    struct multi_grid_functions;
};

// Grid-scope queries and synchronization used by grid_group.
namespace grid {
    // Grid-wide barrier: one arrival expected per block, tracked in *bar.
    _CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
        unsigned int expected = gridDim.x * gridDim.y * gridDim.z;

        details::sync_grids(expected, bar);
    }

    _CG_STATIC_QUALIFIER unsigned long long num_blocks()
    {
        // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
        // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication
        return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
    }

    // Total threads in the grid.
    _CG_STATIC_QUALIFIER unsigned long long num_threads()
    {
        return num_blocks() * cta::num_threads();
    }

    // Linear rank of the calling block within the grid.
    _CG_STATIC_QUALIFIER unsigned long long block_rank()
    {
        return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
    }

    // Linear rank of the calling thread within the whole grid.
    _CG_STATIC_QUALIFIER unsigned long long thread_rank()
    {
        return block_rank() * cta::num_threads() + cta::thread_rank();
    }

    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return dim3(gridDim.x, gridDim.y, gridDim.z);
    }

    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
    }

#if defined(_CG_HAS_CLUSTER_GROUP)
    // Cluster-related grid queries (SM90+ thread block clusters).
    _CG_STATIC_QUALIFIER dim3 dim_clusters() {
        return __clusterGridDimInClusters();
    }

    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
        const dim3 dimClusters = dim_clusters();
        return dimClusters.x * dimClusters.y * dimClusters.z;
    }

    _CG_STATIC_QUALIFIER dim3 cluster_index() {
        return __clusterIdx();
    }

    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
        return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
    }
#endif

    // Legacy aliases
    _CG_STATIC_QUALIFIER unsigned long long size()
    {
        return num_threads();
    }

    _CG_STATIC_QUALIFIER dim3 grid_dim()
    {
        return dim_blocks();
    }
};
446
+
447
+
448
#if defined(_CG_HAS_MULTI_GRID_GROUP)

// Multi-grid (multi-device launch) queries, implemented on the cudaCG* device
// runtime API. Each helper is keyed by the opaque intrinsic handle.
namespace multi_grid {
    _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
    {
        return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
    }

    _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
    {
        // NOTE(review): status is captured but never checked; sync failures
        // are silently ignored here -- confirm this is intentional.
        cudaError_t err = cudaCGSynchronize(handle, 0);
    }

    // Total thread count across all participating grids.
    _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
    {
        unsigned int numThreads = 0;
        cudaCGGetSize(&numThreads, NULL, handle);
        return numThreads;
    }

    // Rank of the calling thread across all participating grids.
    _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
    {
        unsigned int threadRank = 0;
        cudaCGGetRank(&threadRank, NULL, handle);
        return threadRank;
    }

    // Rank of the calling grid within the multi-grid launch.
    _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
    {
        unsigned int gridRank = 0;
        cudaCGGetRank(NULL, &gridRank, handle);
        return gridRank;
    }

    // Number of grids participating in the launch.
    _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
    {
        unsigned int numGrids = 0;
        cudaCGGetSize(NULL, &numGrids, handle);
        return numGrids;
    }

# ifdef _CG_CPP11_FEATURES
    // Function-pointer table over the helpers above, so callers can be
    // decoupled from direct references to the cudaCG* entry points.
    struct multi_grid_functions {
        decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
        decltype(multi_grid::sync) *sync;
        decltype(multi_grid::size) *size;
        decltype(multi_grid::thread_rank) *thread_rank;
        decltype(multi_grid::grid_rank) *grid_rank;
        decltype(multi_grid::num_grids) *num_grids;
    };

    // Returns a pointer to a statically-initialized table in __constant__
    // memory. Templated so the definition stays header-only.
    template <typename = void>
    _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
        __constant__ static const multi_grid_functions mgf {
            &multi_grid::get_intrinsic_handle,
            &multi_grid::sync,
            &multi_grid::size,
            &multi_grid::thread_rank,
            &multi_grid::grid_rank,
            &multi_grid::num_grids
        };

        return &mgf;
    }
# endif
};
#endif
515
+
516
#if defined(_CG_HAS_CLUSTER_GROUP)
// Thread-block-cluster queries and synchronization (SM90+), implemented on
// the __cluster* compiler intrinsics.
namespace cluster {

    // True when the launch specified an explicit cluster dimension.
    _CG_STATIC_QUALIFIER bool isReal()
    {
        return __clusterDimIsSpecified();
    }

    _CG_STATIC_QUALIFIER void barrier_arrive()
    {
        __cluster_barrier_arrive();
    }

    _CG_STATIC_QUALIFIER void barrier_wait()
    {
        __cluster_barrier_wait();
    }

    // Full cluster barrier: arrive then wait (split phases are exposed above
    // so callers can overlap work between them).
    _CG_STATIC_QUALIFIER void sync()
    {
        barrier_arrive();
        barrier_wait();
    }

    // Rank of the cluster block owning the given distributed-shared address.
    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
    {
        return __cluster_query_shared_rank(addr);
    }

    // Maps a shared-memory address into the address space of another block
    // (rank) in the cluster.
    template <typename T>
    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
    {
        return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
    }

    // 3D index of the calling block within its cluster.
    _CG_STATIC_QUALIFIER dim3 block_index()
    {
        return __clusterRelativeBlockIdx();
    }

    // Linear rank of the calling block within its cluster.
    _CG_STATIC_QUALIFIER unsigned int block_rank()
    {
        return __clusterRelativeBlockRank();
    }

    // Linear rank of the calling thread within the cluster.
    _CG_STATIC_QUALIFIER unsigned int thread_rank()
    {
        return block_rank() * cta::num_threads() + cta::thread_rank();
    }

    // Cluster dimensions measured in blocks.
    _CG_STATIC_QUALIFIER dim3 dim_blocks()
    {
        return __clusterDim();
    }

    _CG_STATIC_QUALIFIER unsigned int num_blocks()
    {
        return __clusterSizeInBlocks();
    }

    // Cluster dimensions measured in threads (blocks * block dims, per axis).
    _CG_STATIC_QUALIFIER dim3 dim_threads()
    {
        const dim3 dimBlocks = dim_blocks();
        const unsigned int x = dimBlocks.x * blockDim.x;
        const unsigned int y = dimBlocks.y * blockDim.y;
        const unsigned int z = dimBlocks.z * blockDim.z;
        return dim3(x, y, z);
    }

    // Total threads in the cluster.
    _CG_STATIC_QUALIFIER unsigned int num_threads()
    {
        return num_blocks() * cta::num_threads();
    }

};
#endif
592
+
593
// Lane index (0-31) of the calling thread within its warp (PTX %laneid).
_CG_STATIC_QUALIFIER unsigned int laneid()
{
    unsigned int laneid;
    asm ("mov.u32 %0, %%laneid;" : "=r"(laneid));
    return laneid;
}

// Mask with exactly the calling lane's bit set (PTX %lanemask_eq).
_CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
{
    unsigned int lanemask32_eq;
    asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
    return (lanemask32_eq);
}

// Mask of all lanes whose index is lower than the caller's (PTX %lanemask_lt).
_CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
{
    unsigned int lanemask32_lt;
    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
    return (lanemask32_lt);
}
613
+
614
// Abort execution: asserts under _CG_DEBUG, otherwise traps (see _CG_ABORT
// in info.h).
_CG_STATIC_QUALIFIER void abort()
{
    _CG_ABORT();
}

// Compile-time guard (C++11 builds only): Ty must be an integral type or a
// supported floating-point/half type.
template <typename Ty>
_CG_QUALIFIER void assert_if_not_arithmetic() {
#ifdef _CG_CPP11_FEATURES
    static_assert(
        _CG_STL_NAMESPACE::is_integral<Ty>::value ||
        details::is_float_or_half<Ty>::value,
        "Error: Ty is neither integer or float"
    );
#endif
}
629
+
630
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
// Staging channel used to move a value between warps of a block in
// channel_size-byte chunks. channel_ptr is the staging buffer, sync_location
// the shared warp-barrier state for all numWarps warps.
// NOTE(review): the buffer's memory space (presumably shared memory) is
// established by the allocating caller, not visible here.
template <unsigned int numWarps>
struct copy_channel {
    char* channel_ptr;
    barrier_t* sync_location;
    size_t channel_size;

    // One warp sending to all other warps, it has to wait for all other warps.
    struct send_many_to_many {
        _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_all_other_warps;
        _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
            __syncwarp(0xFFFFFFFF);
            details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
        }
    };

    // One warp is receiving and all other warps are sending to that warp, they have to wait for that one warp.
    struct send_many_to_one {
        _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_specific_warp;
        _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
            // Wait for all warps to finish and let the last warp release all threads.
            if (details::sync_warps_last_releases(sync_location, cta::thread_rank(), numWarps)) {
                details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
            }
        }
    };

    // Streams ValSize bytes from val_ptr through the channel, one
    // channel_size chunk per iteration. Within a chunk each of the ThreadCnt
    // threads copies interleaved sizeof(int)-sized pieces.
    template <unsigned int ThreadCnt, size_t ValSize, typename SendDetails>
    _CG_QUALIFIER void _send_value_internal(char* val_ptr, unsigned int thread_idx, unsigned int warp_id) {
        size_t thread_offset = thread_idx * sizeof(int);

        for (size_t i = 0; i < ValSize; i += channel_size) {
            size_t bytes_left = ValSize - i;
            size_t copy_chunk = min(bytes_left, channel_size);

            // Wait until the receiver(s) are ready for this chunk.
            details::sync_warps_wait_for_warps<SendDetails::wait_kind>(warp_id, sync_location, cta::thread_rank(), numWarps);
            #pragma unroll 1
            for (size_t j = thread_offset; j < copy_chunk ; j += sizeof(int) * ThreadCnt) {
                size_t my_bytes_left = copy_chunk - j;
                memcpy(channel_ptr + j, val_ptr + i + j, min(my_bytes_left, sizeof(int)));
            }
            // Publish the chunk per the chosen send policy.
            SendDetails::post_iter_release(thread_idx, sync_location);
        }
    }


    // Type-safe wrapper over _send_value_internal.
    template <typename TyVal, unsigned int ThreadCnt, typename SendDetails>
    _CG_QUALIFIER void send_value(TyVal& val, unsigned int thread_idx, unsigned int warp_id) {
        _send_value_internal<ThreadCnt, sizeof(TyVal), SendDetails>(reinterpret_cast<char*>(&val), thread_idx, warp_id);
    }

    // Receives ValSize bytes from the channel chunk by chunk; threads with
    // active == false participate in synchronization but skip the copy.
    template <size_t ValSize>
    _CG_QUALIFIER void _receive_value_internal(char* val_ptr, bool warp_master, bool active) {
        for (size_t i = 0; i < ValSize; i += channel_size) {
            size_t bytes_left = ValSize - i;
            details::sync_warps_wait_for_release(sync_location, warp_master, cta::thread_rank(), numWarps);
            if (active) {
                memcpy(val_ptr + i, channel_ptr, min(bytes_left, channel_size));
            }
        }
    }

    // Type-safe wrapper over _receive_value_internal.
    template <typename TyVal>
    _CG_QUALIFIER void receive_value(TyVal& val, bool warp_master, bool active = true) {
        _receive_value_internal<sizeof(TyVal)>(reinterpret_cast<char*>(&val), warp_master, active);
    }
};

// Compile-time floor(log2(x)); recursion terminates only for x >= 1.
_CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
    return x == 1 ? 0 : 1 + log2(x / 2);
}
#endif //_CG_CPP11_FEATURES
702
+
703
+ }; // !Namespace internal
704
+
705
+ _CG_END_NAMESPACE
706
+
707
+ #endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+
50
+
51
+ #ifndef _CG_INFO_H_
52
+ #define _CG_INFO_H_
53
+ /*
54
+ ** Define: _CG_VERSION
55
+ */
56
+ #define _CG_VERSION 1000
57
+
58
+ /*
59
+ ** Define: _CG_ABI_VERSION
60
+ */
61
+ #ifndef _CG_ABI_VERSION
62
+ # define _CG_ABI_VERSION 1
63
+ #endif
64
+
65
+ /*
66
+ ** Define: _CG_ABI_EXPERIMENTAL
67
+ ** Desc: If enabled, sets all features enabled (ABI-breaking or experimental)
68
+ */
69
+ #if defined(_CG_ABI_EXPERIMENTAL)
70
+ #endif
71
+
72
+ #define _CG_CONCAT_INNER(x, y) x ## y
73
+ #define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
74
+ #define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
75
+
76
+ #define _CG_BEGIN_NAMESPACE \
77
+ namespace cooperative_groups { namespace _CG_NAMESPACE {
78
+ #define _CG_END_NAMESPACE \
79
+ }; using namespace _CG_NAMESPACE; };
80
+
81
+ #if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
82
+ # define _CG_CPP11_FEATURES
83
+ #endif
84
+
85
+ #if !defined(_CG_QUALIFIER)
86
+ # define _CG_QUALIFIER __forceinline__ __device__
87
+ #endif
88
+ #if !defined(_CG_STATIC_QUALIFIER)
89
+ # define _CG_STATIC_QUALIFIER static __forceinline__ __device__
90
+ #endif
91
+ #if !defined(_CG_CONSTEXPR_QUALIFIER)
92
+ # if defined(_CG_CPP11_FEATURES)
93
+ # define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
94
+ # else
95
+ # define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
96
+ # endif
97
+ #endif
98
+ #if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
99
+ # if defined(_CG_CPP11_FEATURES)
100
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
101
+ # else
102
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
103
+ # endif
104
+ #endif
105
+
106
+ #if defined(_MSC_VER)
107
+ # define _CG_DEPRECATED __declspec(deprecated)
108
+ #else
109
+ # define _CG_DEPRECATED __attribute__((deprecated))
110
+ #endif
111
+
112
+ #if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
113
+ # define _CG_HAS_GRID_GROUP
114
+ #endif
115
+ #if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
116
+ # define _CG_HAS_MULTI_GRID_GROUP
117
+ #endif
118
+ #if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
119
+ # define _CG_HAS_MATCH_COLLECTIVE
120
+ #endif
121
+ #if ((__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)) && (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE))
122
+ # define _CG_HAS_CLUSTER_GROUP
123
+ #endif
124
+ // Has __half and __half2
125
+ // Only usable if you include the cuda_fp16.h extension, and
126
+ // _before_ including cooperative_groups.h
127
+ #ifdef __CUDA_FP16_TYPES_EXIST__
128
+ # define _CG_HAS_FP16_COLLECTIVE
129
+ #endif
130
+
131
+ #if (__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__) && (defined(__NVCC__) || defined(__CUDACC_RTC__))
132
+ # define _CG_HAS_OP_REDUX
133
+ #endif
134
+
135
+ // Include libcu++ where supported.
136
+ #if defined(_CG_CPP11_FEATURES) && !defined(__QNX__) && !defined(__ibmxl__) && \
137
+ (defined(__NVCC__) || defined(__CUDACC_RTC__)) && \
138
+ (defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__)|| defined(_M_X64) || defined(_M_ARM64)) && \
139
+ (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__))
140
+ # define _CG_USE_CUDA_STL
141
+ #else
142
+ # define _CG_USE_OWN_TRAITS
143
+ #endif
144
+
145
+ #if defined(_CG_USE_CUDA_STL) && (!defined(__CUDA_ARCH__) || \
146
+ ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
147
+ # define _CG_HAS_STL_ATOMICS
148
+ #endif
149
+
150
+ #ifdef _CG_CPP11_FEATURES
151
+ // Use cuda::std:: for type_traits
152
+ # if defined(_CG_USE_CUDA_STL)
153
+ # define _CG_STL_NAMESPACE cuda::std
154
+ # include <cuda/std/type_traits>
155
+ // Use CG's implementation of type traits
156
+ # else
157
+ # define _CG_STL_NAMESPACE cooperative_groups::details::templates
158
+ # endif
159
+ #endif
160
+
161
+ #ifdef _CG_CPP11_FEATURES
162
+ # define _CG_STATIC_CONST_DECL static constexpr
163
+ # define _CG_CONST_DECL constexpr
164
+ #else
165
+ # define _CG_STATIC_CONST_DECL static const
166
+ # define _CG_CONST_DECL const
167
+ #endif
168
+
169
+ #if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
170
+ # define _CG_ASM_PTR_CONSTRAINT "r"
171
+ #else
172
+ # define _CG_ASM_PTR_CONSTRAINT "l"
173
+ #endif
174
+
175
+ /*
176
+ ** Define: CG_DEBUG
177
+ ** What: Enables various runtime safety checks
178
+ */
179
+ #if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
180
+ # define _CG_DEBUG
181
+ #endif
182
+
183
+ #if defined(_CG_DEBUG)
184
+ # include <assert.h>
185
+ # define _CG_ASSERT(x) assert((x));
186
+ # define _CG_ABORT() assert(0);
187
+ #else
188
+ # define _CG_ASSERT(x)
189
+ # define _CG_ABORT() __trap();
190
+ #endif
191
+
192
+ #if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
193
+ _CG_BEGIN_NAMESPACE
194
+
195
+ namespace details {
196
+ namespace templates {
197
+
198
+ /**
199
+ * Integral constants
200
+ **/
201
+ template <typename Ty, Ty Val>
202
+ struct integral_constant {
203
+ static constexpr Ty value = Val;
204
+ typedef Ty type;
205
+
206
+ _CG_QUALIFIER constexpr operator type() const noexcept { return value; }
207
+ _CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
208
+ };
209
+
210
+ typedef integral_constant<bool, true> true_type;
211
+ typedef integral_constant<bool, false> false_type;
212
+
213
+ /**
214
+ * CV Qualifiers
215
+ **/
216
+ template <class Ty> struct is_lvalue_reference : public details::templates::false_type {};
217
+ template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {};
218
+
219
+ template <class Ty> struct remove_reference {typedef Ty type;};
220
+ template <class Ty> struct remove_reference<Ty&> {typedef Ty type;};
221
+ template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
222
+
223
+ template <class Ty>
224
+ using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
225
+
226
+ template <class Ty> struct remove_const {typedef Ty type;};
227
+ template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
228
+
229
+ template <class Ty> struct remove_volatile {typedef Ty type;};
230
+ template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
231
+
232
+ template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
233
+
234
+ template <class Ty>
235
+ using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
236
+
237
+ template <class Ty>
238
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept {
239
+ return static_cast<Ty&&>(t);
240
+ }
241
+
242
+ template <class Ty>
243
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept {
244
+ static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
245
+ return static_cast<Ty&&>(t);
246
+ }
247
+
248
+ /**
249
+ * is_integral
250
+ **/
251
+ template <class Ty> struct _is_integral : public details::templates::false_type {};
252
+ template <> struct _is_integral<bool> : public details::templates::true_type {};
253
+ template <> struct _is_integral<char> : public details::templates::true_type {};
254
+ template <> struct _is_integral<unsigned char> : public details::templates::true_type {};
255
+ template <> struct _is_integral<short> : public details::templates::true_type {};
256
+ template <> struct _is_integral<unsigned short> : public details::templates::true_type {};
257
+ template <> struct _is_integral<int> : public details::templates::true_type {};
258
+ template <> struct _is_integral<unsigned int> : public details::templates::true_type {};
259
+ template <> struct _is_integral<long> : public details::templates::true_type {};
260
+ template <> struct _is_integral<long long> : public details::templates::true_type {};
261
+ template <> struct _is_integral<unsigned long> : public details::templates::true_type {};
262
+ template <> struct _is_integral<unsigned long long> : public details::templates::true_type {};
263
+ //Vector type support?
264
+
265
+ template <typename Ty>
266
+ struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
267
+
268
+ /**
269
+ * is_floating_point
270
+ **/
271
+ template <class Ty> struct _is_floating_point : public details::templates::false_type {};
272
+ template <> struct _is_floating_point<float> : public details::templates::true_type {};
273
+ template <> struct _is_floating_point<double> : public details::templates::true_type {};
274
+ template <> struct _is_floating_point<long double> : public details::templates::true_type {};
275
+ # ifdef __CUDA_FP16_TYPES_EXIST__
276
+ template <> struct _is_floating_point<__half> : public details::templates::true_type {};
277
+ template <> struct _is_floating_point<__half2> : public details::templates::true_type {};
278
+ # endif
279
+ //Vector type support?
280
+
281
+ template <typename Ty>
282
+ struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
283
+
284
+ template <class T>
285
+ struct is_arithmetic : details::templates::integral_constant<
286
+ bool,
287
+ details::templates::is_integral<T>::value ||
288
+ details::templates::is_floating_point<T>::value> {};
289
+
290
+ template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
291
+ struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {};
292
+
293
+ template <typename Ty>
294
+ struct _is_unsigned<Ty,false> : details::templates::false_type {};
295
+
296
+ template <typename Ty>
297
+ struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
298
+
299
+ /**
300
+ * programmatic type traits
301
+ **/
302
+ template<bool B, class Ty = void>
303
+ struct enable_if {};
304
+
305
+ template<class Ty>
306
+ struct enable_if<true, Ty> { typedef Ty type; };
307
+
308
+ template<bool Cond, typename Ty = void>
309
+ using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
310
+
311
+ template<class Ty1, class Ty2>
312
+ struct is_same : details::templates::false_type {};
313
+
314
+ template<class Ty>
315
+ struct is_same<Ty, Ty> : details::templates::true_type {};
316
+
317
+ } // templates
318
+ } // details
319
+ _CG_END_NAMESPACE
320
+
321
+ #endif // _CG_CPP11_FEATURES
322
+
323
+ #endif // _CG_INFO_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CG_PARTITIONING_H
51
+ #define _CG_PARTITIONING_H
52
+
53
+ #include "info.h"
54
+ #include "helpers.h"
55
+
56
+ _CG_BEGIN_NAMESPACE
57
+
58
+ namespace details {
59
+
60
+ template <typename TyGroup>
61
+ _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
62
+ const unsigned int fullMask = ~0u;
63
+
64
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
65
+ unsigned int predMask = pred ? 0 : fullMask;
66
+ unsigned int setMask = __ballot_sync(thisMask, pred);
67
+
68
+ if (setMask == thisMask || setMask == 0) {
69
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
70
+ _coalesced_group_data_access::modify_meta_group(subTile, 0, 1);
71
+ return subTile;
72
+ }
73
+ else {
74
+ unsigned int subMask = thisMask & (setMask ^ predMask);
75
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
76
+ _coalesced_group_data_access::modify_meta_group(subTile, pred, 2);
77
+ return subTile;
78
+ }
79
+ }
80
+
81
+ #ifdef _CG_HAS_MATCH_COLLECTIVE
82
+ template <typename TyGroup, typename TyPredicate>
83
+ _CG_STATIC_QUALIFIER coalesced_group _labeled_partition(const TyGroup &tile, TyPredicate pred) {
84
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
85
+ unsigned int thisBias = __ffs(thisMask) - 1; // Subtract 1 to index properly from [1-32]
86
+ unsigned int subMask = __match_any_sync(thisMask, pred);
87
+
88
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
89
+
90
+ int leaderLaneId = subTile.shfl(details::laneid(), 0);
91
+
92
+ bool isLeader = !subTile.thread_rank();
93
+ unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
94
+ unsigned int tileRank = __fns(leaderMask, leaderLaneId, 0) - thisBias;
95
+
96
+ _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
97
+
98
+ return subTile;
99
+ }
100
+ #endif
101
+ }; // namespace details
102
+
103
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
104
+ return details::_binary_partition(tile, pred);
105
+ }
106
+
107
+ template <unsigned int Size, typename ParentT>
108
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
109
+ #ifdef _CG_CPP11_FEATURES
110
+ static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
111
+ #endif
112
+ return details::_binary_partition(tile, pred);
113
+ }
114
+
115
+
116
+ #if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
117
+ template <typename TyPredicate>
118
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
119
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
120
+ return details::_labeled_partition(tile, pred);
121
+ }
122
+
123
+ template <typename TyPredicate, unsigned int Size, typename ParentT>
124
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
125
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
126
+ static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
127
+ return details::_labeled_partition(tile, pred);
128
+ }
129
+ #endif
130
+
131
+ _CG_END_NAMESPACE
132
+
133
+ #endif // _CG_PARTITIONING_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_REDUCE_H_
50
+ #define _CG_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "coalesced_reduce.h"
55
+ #include "functional.h"
56
+ #include "cooperative_groups.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <class Ty>
63
+ using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
64
+ bool,
65
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
66
+
67
+ template <class Ty>
68
+ using redux_is_add_supported = _redux_is_add_supported<Ty>;
69
+
70
+ // A specialization for 64 bit logical operations is possible
71
+ // but for now only accelerate 32 bit bitwise ops
72
+ template <class Ty>
73
+ using redux_is_logical_supported = redux_is_add_supported<Ty>;
74
+
75
+ // Base operator support case
76
+ template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
77
+ #ifdef _CG_HAS_OP_REDUX
78
+ template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
79
+ template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
80
+ template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
81
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
82
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
83
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
84
+ #endif
85
+
86
+ template <class Ty, template <class> class TyOp>
87
+ using redux_op_supported = _redux_op_supported<
88
+ typename details::remove_qual<TyOp<Ty>>,
89
+ Ty>;
90
+
91
+ // Groups smaller than 16 actually have worse performance characteristics when used with redux
92
+ // tiles of size 16 and 32 perform the same or better and have better code generation profiles
93
+ template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
94
+
95
+ template <unsigned int Sz, typename TyPar>
96
+ struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
97
+ bool,
98
+ (Sz >= 16)> {};
99
+ template <unsigned int Sz, typename TyPar>
100
+ struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
101
+ bool,
102
+ (Sz >= 16)> {};
103
+ template <>
104
+ struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
105
+
106
+ template <typename TyGroup>
107
+ using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
108
+
109
+ template <template <class> class TyOp>
110
+ _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
111
+ template <template <class> class TyOp>
112
+ _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
113
+
114
+ #ifdef _CG_HAS_OP_REDUX
115
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
116
+ return __reduce_add_sync(mask, val);
117
+ }
118
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
119
+ return __reduce_min_sync(mask, val);
120
+ }
121
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
122
+ return __reduce_max_sync(mask, val);
123
+ }
124
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
125
+ return __reduce_and_sync(mask, val);
126
+ }
127
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
128
+ return __reduce_xor_sync(mask, val);
129
+ }
130
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
131
+ return __reduce_or_sync(mask, val);
132
+ }
133
+
134
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
135
+ return __reduce_add_sync(mask, val);
136
+ }
137
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
138
+ return __reduce_min_sync(mask, val);
139
+ }
140
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
141
+ return __reduce_max_sync(mask, val);
142
+ }
143
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
144
+ return __reduce_and_sync(mask, val);
145
+ }
146
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
147
+ return __reduce_xor_sync(mask, val);
148
+ }
149
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
150
+ return __reduce_or_sync(mask, val);
151
+ }
152
+ #endif
153
+
154
+
155
+ template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
156
+ struct _accelerated_op;
157
+
158
+ // Signed type redux intrinsic dispatch
159
+ template <typename TyVal>
160
+ struct _accelerated_op<TyVal, false> {
161
+ template <template <class> class TyOp>
162
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
163
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
164
+ }
165
+ };
166
+
167
+ // Unsigned type redux intrinsic dispatch
168
+ template <typename TyVal>
169
+ struct _accelerated_op<TyVal, true> {
170
+ template <template <class> class TyOp>
171
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
172
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
173
+ }
174
+ };
175
+
176
+ template <typename TyVal>
177
+ using accelerated_op = _accelerated_op<TyVal>;
178
+
179
+
180
+ template <typename TyVal, typename TyFnInput, typename TyGroup>
181
+ class _redux_dispatch {
182
+ template <class Ty, template <class> class TyOp>
183
+ using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
184
+ redux_op_supported<Ty, TyOp>::value &&
185
+ redux_group_optimized<TyGroup>::value>;
186
+
187
+ template <class Ty, template <class> class TyOp>
188
+ using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
189
+
190
+ template <class Ty, template <class> class TyOp>
191
+ using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
192
+
193
+ public:
194
+ // Dispatch to redux if the combination of op and args are supported
195
+ template<
196
+ template <class> class TyOp,
197
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
198
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
199
+ // Retrieve the mask for the group and dispatch to redux
200
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
201
+ }
202
+
203
+ template<
204
+ template <class> class TyOp,
205
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
206
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
207
+ // Retrieve the mask for the group and dispatch to redux
208
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
209
+ }
210
+
211
+ // Fallback shuffle sync reduction
212
+ template <
213
+ template <class> class TyOp,
214
+ redux_is_not_usable<TyFnInput, TyOp> = nullptr>
215
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
216
+ //Dispatch to fallback shuffle sync accelerated reduction
217
+ return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
218
+ }
219
+
220
+ };
221
+
222
+ // Group support for reduce.
223
+ template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
224
+
225
+ template <unsigned int Sz, typename TyPar>
226
+ struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
227
+ template <unsigned int Sz, typename TyPar>
228
+ struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
229
+ template <>
230
+ struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
231
+
232
+ template <typename TyGroup>
233
+ using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
234
+
235
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
236
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
237
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
238
+
239
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
240
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
241
+ }
242
+
243
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
244
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
245
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
246
+
247
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
248
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
249
+ }
250
+
251
+
252
+ template <typename TyVal, typename TyOp, typename TyGroup>
253
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
254
+ return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
255
+ }
256
+
257
+ template <unsigned int GroupId>
258
+ struct tile_reduce_dispatch;
259
+
260
+ template <>
261
+ struct tile_reduce_dispatch<details::coalesced_group_id> {
262
+ template <typename TyGroup, typename TyVal, typename TyFn>
263
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
264
+ return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
265
+ }
266
+ };
267
+
268
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
269
+ template <>
270
+ struct tile_reduce_dispatch<details::multi_tile_group_id> {
271
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
272
+ _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
273
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
274
+ using TyRet = details::remove_qual<TyVal>;
275
+ const unsigned int num_warps = Size / 32;
276
+
277
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
278
+ *warp_scratch_location =
279
+ details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
280
+ };
281
+ auto inter_warp_lambda =
282
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
283
+ *thread_scratch_location =
284
+ details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
285
+ };
286
+ return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
287
+ }
288
+ };
289
+
290
+ enum class AsyncReduceType { store, update };
291
+
292
+ template <AsyncReduceType TyAsyncReduce>
293
+ struct async_reduce_result_handler;
294
+
295
+ template<>
296
+ struct async_reduce_result_handler<AsyncReduceType::store> {
297
+ template<typename TyDst, typename TyVal, typename TyOp>
298
+ _CG_STATIC_QUALIFIER void handleResult(TyDst *dst, TyVal& result, TyOp&& op) {
299
+ *dst = result;
300
+ }
301
+ };
302
+
303
+ #if defined(_CG_HAS_STL_ATOMICS)
304
+ template<>
305
+ struct async_reduce_result_handler<AsyncReduceType::update> {
306
+ template<typename TyDst, typename TyVal, typename TyOp>
307
+ _CG_STATIC_QUALIFIER void handleResult(TyDst& dst, TyVal& result, TyOp&& op) {
308
+ atomic_update(dst, result, _CG_STL_NAMESPACE::forward<TyOp>(op));
309
+ }
310
+ };
311
+ #endif
312
+
313
+ template <unsigned int GroupId, AsyncReduceType TyAsyncReduce>
314
+ struct tile_async_reduce_dispatch;
315
+
316
+ template <AsyncReduceType TyAsyncReduce>
317
+ struct tile_async_reduce_dispatch<details::coalesced_group_id, TyAsyncReduce> {
318
+ template <unsigned int TySize, typename ParentT, typename TyDst, typename TyVal, typename TyFn>
319
+ _CG_STATIC_QUALIFIER void reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyVal&& val, TyFn&& op) {
320
+ // Do regular, in group reduction
321
+ auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
322
+
323
+ // One thread stores/updates the destination
324
+ if (group.thread_rank() == 0) {
325
+ async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
326
+ }
327
+ }
328
+
329
+ template <typename TyDst, typename TyVal, typename TyFn>
330
+ _CG_STATIC_QUALIFIER void reduce(const coalesced_group& group, TyDst& dst, TyVal&& val, TyFn&& op) {
331
+ // Do in group reduction to the last thread
332
+ auto result = details::coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
333
+
334
+ // One thread stores/updates the destination
335
+ if (group.thread_rank() == group.size() - 1) {
336
+ async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
337
+ }
338
+ }
339
+ };
340
+
341
+ template <AsyncReduceType TyAsyncReduce>
342
+ struct tile_async_reduce_dispatch<details::multi_tile_group_id, TyAsyncReduce> {
343
+ template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn>
344
+ _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op) {
345
+ using TyVal = remove_qual<TyInputVal>;
346
+ const unsigned int num_warps = TySize / 32;
347
+ details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
348
+ auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
349
+
350
+ // Do in warp reduce
351
+ auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
352
+ *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
353
+
354
+ // Tile of size num_warps from the last warp to arrive does final reduction step
355
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
356
+ auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
357
+ if (subwarp.meta_group_rank() == 0) {
358
+ auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
359
+ auto thread_val = *thread_scratch_location;
360
+ // Release other warps, we read their contribution already.
361
+ subwarp.sync();
362
+ details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
363
+ TyVal result = details::reduce(subwarp, thread_val, op);
364
+ // One thread stores the result or updates the atomic
365
+ if (subwarp.thread_rank() == 0) {
366
+ async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
367
+ }
368
+ }
369
+ warp.sync();
370
+ }
371
+ }
372
+ };
373
+ #endif
374
+
375
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
376
+ _CG_QUALIFIER void check_reduce_params() {
377
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
378
+ static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
379
+ };
380
+
381
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
382
+ _CG_QUALIFIER void check_async_reduce_params() {
383
+ check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
384
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
385
+ }
386
+ } // details
387
+
388
+ template <typename TyGroup, typename TyVal, typename TyFn>
389
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
390
+ details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
391
+
392
+ using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
393
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
394
+ }
395
+
396
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
397
+ namespace experimental {
398
+
399
+ #if defined(_CG_HAS_STL_ATOMICS)
400
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
401
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
402
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
403
+
404
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
405
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
406
+ }
407
+
408
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
409
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
410
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
411
+
412
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
413
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
414
+ }
415
+ #endif
416
+
417
+ template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
418
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
419
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
420
+
421
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::store>;
422
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
423
+ }
424
+
425
+ }
426
+ #endif
427
+
428
+ _CG_END_NAMESPACE
429
+
430
+ #endif // _CG_REDUCE_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_SCAN_H_
50
+ #define _CG_SCAN_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "functional.h"
55
+ #include "coalesced_scan.h"
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+
61
+ // Group support for scan.
62
+ template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
66
+ template <unsigned int Sz, typename TyPar>
67
+ struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
68
+ template <>
69
+ struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
70
+
71
+ template <typename TyGroup>
72
+ using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
73
+
74
+ template <bool IsIntegralPlus>
75
+ struct integral_optimized_scan;
76
+
77
+ enum class ScanType { exclusive, inclusive };
78
+
79
+ template <unsigned int GroupId, ScanType TyScan>
80
+ struct scan_dispatch;
81
+
82
+ template <ScanType TyScan>
83
+ struct scan_dispatch<details::coalesced_group_id, TyScan> {
84
+ template <typename TyGroup, typename TyVal, typename TyFn>
85
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
86
+ auto scan_result = coalesced_inclusive_scan(group, val, op);
87
+ if (TyScan == ScanType::exclusive) {
88
+ scan_result = convert_inclusive_to_exclusive(group,
89
+ scan_result,
90
+ _CG_STL_NAMESPACE::forward<TyVal>(val),
91
+ _CG_STL_NAMESPACE::forward<TyFn>(op));
92
+ }
93
+ return scan_result;
94
+ }
95
+ };
96
+
97
+ #if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
98
+ template <ScanType TyScan>
99
+ struct scan_dispatch<details::multi_tile_group_id, TyScan> {
100
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
101
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
102
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
103
+ using TyRet = details::remove_qual<TyVal>;
104
+ const unsigned int num_warps = Size / 32;
105
+ // In warp scan result, calculated in warp_lambda
106
+ TyRet warp_scan;
107
+
108
+ // In warp scan, put sum in the warp_scratch_location
109
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
110
+ warp_scan =
111
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
112
+ if (warp.thread_rank() + 1 == warp.size()) {
113
+ *warp_scratch_location = warp_scan;
114
+ }
115
+ if (TyScan == ScanType::exclusive) {
116
+ warp_scan = warp.shfl_up(warp_scan, 1);
117
+ }
118
+ };
119
+
120
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
121
+ // to its in-warp scan result
122
+ auto inter_warp_lambda =
123
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
124
+ auto thread_val = *thread_scratch_location;
125
+ auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
126
+ *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
127
+ };
128
+
129
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
130
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
131
+ return previous_warps_sum;
132
+ }
133
+ if (warpType::meta_group_rank() == 0) {
134
+ return warp_scan;
135
+ }
136
+ else {
137
+ return op(warp_scan, previous_warps_sum);
138
+ }
139
+ }
140
+ };
141
+
142
+ #if defined(_CG_HAS_STL_ATOMICS)
143
+ template <unsigned int GroupId, ScanType TyScan>
144
+ struct scan_update_dispatch;
145
+
146
+ template <ScanType TyScan>
147
+ struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
148
+ template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
149
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
150
+ details::remove_qual<TyVal> old;
151
+
152
+ // Do regular in group scan
153
+ auto scan_result = details::coalesced_inclusive_scan(group, val, op);
154
+
155
+ // Last thread updates the atomic and distributes its old value to other threads
156
+ if (group.thread_rank() == group.size() - 1) {
157
+ old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
158
+ }
159
+ old = group.shfl(old, group.size() - 1);
160
+ if (TyScan == ScanType::exclusive) {
161
+ scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
162
+ }
163
+ scan_result = op(old, scan_result);
164
+ return scan_result;
165
+ }
166
+ };
167
+
168
+ template <ScanType TyScan>
169
+ struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
170
+ template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
171
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
172
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
173
+ using TyRet = details::remove_qual<TyVal>;
174
+ const unsigned int num_warps = Size / 32;
175
+ // In warp scan result, calculated in warp_lambda
176
+ TyRet warp_scan;
177
+
178
+ // In warp scan, put sum in the warp_scratch_location
179
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
180
+ warp_scan =
181
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
182
+ if (warp.thread_rank() + 1 == warp.size()) {
183
+ *warp_scratch_location = warp_scan;
184
+ }
185
+ if (TyScan == ScanType::exclusive) {
186
+ warp_scan = warp.shfl_up(warp_scan, 1);
187
+ }
188
+ };
189
+
190
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
191
+ // to its in-warp scan result
192
+ auto inter_warp_lambda =
193
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
194
+ auto thread_val = *thread_scratch_location;
195
+ auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
196
+ TyRet offset;
197
+ // Single thread does the atomic update with sum of all contributions and reads the old value.
198
+ if (subwarp.thread_rank() == subwarp.size() - 1) {
199
+ offset = details::atomic_update(dst, scan_result, op);
200
+ }
201
+ offset = subwarp.shfl(offset, subwarp.size() - 1);
202
+ scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
203
+ // Add offset read from the atomic to the scanned warp sum.
204
+ // Skipping first thread, since it got defautly constructed value from the conversion,
205
+ // it should just return the offset received from the thread that did the atomic update.
206
+ if (subwarp.thread_rank() != 0) {
207
+ offset = op(scan_result, offset);
208
+ }
209
+ *thread_scratch_location = offset;
210
+ };
211
+
212
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
213
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
214
+ return previous_warps_sum;
215
+ }
216
+ return op(warp_scan, previous_warps_sum);
217
+ }
218
+ };
219
+ #endif
220
+ #endif
221
+
222
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
223
+ _CG_QUALIFIER void check_scan_params() {
224
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
225
+ static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
226
+ }
227
+
228
+ #if defined(_CG_HAS_STL_ATOMICS)
229
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
230
+ _CG_QUALIFIER void check_scan_update_params() {
231
+ check_scan_params<TyGroup, TyInputVal, TyRetVal>();
232
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
233
+ }
234
+ #endif
235
+
236
+ } // details
237
+
238
+ template <typename TyGroup, typename TyVal, typename TyFn>
239
+ _CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
240
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
241
+
242
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
243
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
244
+ }
245
+
246
+ template <typename TyGroup, typename TyVal>
247
+ _CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
248
+ return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
249
+ }
250
+
251
+ template <typename TyGroup, typename TyVal, typename TyFn>
252
+ _CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
253
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
254
+
255
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
256
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
257
+ }
258
+
259
+ template <typename TyGroup, typename TyVal>
260
+ _CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
261
+ return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
262
+ }
263
+
264
+ #if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
265
+
266
+ namespace experimental {
267
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
268
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
269
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
270
+
271
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
272
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
273
+ }
274
+
275
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
276
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
277
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
278
+ }
279
+
280
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
281
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
282
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
283
+
284
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
285
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
286
+ }
287
+
288
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
289
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
290
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
291
+ }
292
+
293
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
294
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
295
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
296
+
297
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
298
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
299
+ }
300
+
301
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
302
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
303
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
304
+ }
305
+
306
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
307
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
308
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
309
+
310
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
311
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
312
+ }
313
+
314
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
315
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
316
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
317
+ }
318
+ }
319
+
320
+ #endif
321
+
322
+ _CG_END_NAMESPACE
323
+
324
+ #endif // _CG_SCAN_H_
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/memcpy_async.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
50
+ #define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ # include "details/async.h"
57
+ #else
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif
61
+
62
+ #endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/reduce.h ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_REDUCE_H
50
+ #define _COOPERATIVE_GROUPS_REDUCE_H
51
+
52
+ #include "../cooperative_groups.h"
53
+ #include "details/info.h"
54
+
55
+ #ifdef _CG_CPP11_FEATURES
56
+ # include "details/reduce.h"
57
+ #else
58
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
59
+ -std=c++11 compiler option.
60
+ #endif
61
+
62
+
63
+ #endif //_COOPERATIVE_GROUPS_REDUCE_H
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaEGLTypedefs.h ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAEGLTYPEDEFS_H
51
+ #define CUDAEGLTYPEDEFS_H
52
+
53
+ #include <cudaEGL.h>
54
+
55
+ #ifdef __cplusplus
56
+ extern "C" {
57
+ #endif // __cplusplus
58
+
59
+ /*
60
+ * Macros for the latest version for each driver function in cudaEGL.h
61
+ */
62
+ #define PFN_cuGraphicsEGLRegisterImage PFN_cuGraphicsEGLRegisterImage_v7000
63
+ #define PFN_cuEGLStreamConsumerConnect PFN_cuEGLStreamConsumerConnect_v7000
64
+ #define PFN_cuEGLStreamConsumerConnectWithFlags PFN_cuEGLStreamConsumerConnectWithFlags_v8000
65
+ #define PFN_cuEGLStreamConsumerDisconnect PFN_cuEGLStreamConsumerDisconnect_v7000
66
+ #define PFN_cuEGLStreamConsumerAcquireFrame PFN_cuEGLStreamConsumerAcquireFrame_v7000
67
+ #define PFN_cuEGLStreamConsumerReleaseFrame PFN_cuEGLStreamConsumerReleaseFrame_v7000
68
+ #define PFN_cuEGLStreamProducerConnect PFN_cuEGLStreamProducerConnect_v7000
69
+ #define PFN_cuEGLStreamProducerDisconnect PFN_cuEGLStreamProducerDisconnect_v7000
70
+ #define PFN_cuEGLStreamProducerPresentFrame PFN_cuEGLStreamProducerPresentFrame_v7000
71
+ #define PFN_cuEGLStreamProducerReturnFrame PFN_cuEGLStreamProducerReturnFrame_v7000
72
+ #define PFN_cuGraphicsResourceGetMappedEglFrame PFN_cuGraphicsResourceGetMappedEglFrame_v7000
73
+ #define PFN_cuEventCreateFromEGLSync PFN_cuEventCreateFromEGLSync_v9000
74
+
75
+
76
+ /**
77
+ * Type definitions for functions defined in cudaEGL.h
78
+ */
79
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsEGLRegisterImage_v7000)(CUgraphicsResource CUDAAPI *pCudaResource, EGLImageKHR image, unsigned int flags);
80
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream);
81
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnectWithFlags_v8000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, unsigned int flags);
82
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
83
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerAcquireFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource CUDAAPI *pCudaResource, CUstream CUDAAPI *pStream, unsigned int timeout);
84
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerReleaseFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource pCudaResource, CUstream CUDAAPI *pStream);
85
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, EGLint width, EGLint height);
86
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
87
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerPresentFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 eglframe, CUstream CUDAAPI *pStream);
88
+ typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerReturnFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 CUDAAPI *eglframe, CUstream CUDAAPI *pStream);
89
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedEglFrame_v7000)(CUeglFrame_v1 CUDAAPI *eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
90
+ typedef CUresult (CUDAAPI *PFN_cuEventCreateFromEGLSync_v9000)(CUevent CUDAAPI *phEvent, EGLSyncKHR eglSync, unsigned int flags);
91
+
92
+ #ifdef __cplusplus
93
+ }
94
+ #endif // __cplusplus
95
+
96
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaGLTypedefs.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAGLTYPEDEFS_H
51
+ #define CUDAGLTYPEDEFS_H
52
+
53
+ // Dependent includes for cudagl.h
54
+ #include <GL/gl.h>
55
+
56
+ #include <cudaGL.h>
57
+
58
+ #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
59
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
60
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
61
+ #else
62
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
63
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
64
+ #endif
65
+
66
+ #ifdef __cplusplus
67
+ extern "C" {
68
+ #endif // __cplusplus
69
+
70
+ /*
71
+ * Macros for the latest version for each driver function in cudaGL.h
72
+ */
73
+ #define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000
74
+ #define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000
75
+ #define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020
76
+ #define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050
77
+ #define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020
78
+ #define PFN_cuGLInit PFN_cuGLInit_v2000
79
+ #define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000
80
+ #define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
81
+ #define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000
82
+ #define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000
83
+ #define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030
84
+ #define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
85
+ #define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030
86
+
87
+
88
+ /**
89
+ * Type definitions for functions defined in cudaGL.h
90
+ */
91
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
92
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
93
+ #ifdef _WIN32
94
+ typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
95
+ #endif
96
+ typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
97
+ typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
98
+ typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
99
+ typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
100
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
101
+ typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
102
+ typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
103
+ typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
104
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
105
+ typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
106
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
107
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
108
+
109
+ /*
110
+ * Type definitions for older versioned functions in cuda.h
111
+ */
112
+ #if defined(__CUDA_API_VERSION_INTERNAL)
113
+ typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
114
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
115
+ typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
116
+ typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
117
+ #endif
118
+
119
+ #ifdef __cplusplus
120
+ }
121
+ #endif // __cplusplus
122
+
123
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaTypedefs.h ADDED
@@ -0,0 +1,959 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDATYPEDEFS_H
51
+ #define CUDATYPEDEFS_H
52
+
53
+ #include <cuda.h>
54
+
55
+ #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
56
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
57
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
58
+ #else
59
+ #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
60
+ #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
61
+ #endif
62
+
63
+ #ifdef __cplusplus
64
+ extern "C" {
65
+ #endif // __cplusplus
66
+
67
+ /*
68
+ * Macros for the latest version for each driver function in cuda.h
69
+ */
70
+ #define PFN_cuGetErrorString PFN_cuGetErrorString_v6000
71
+ #define PFN_cuGetErrorName PFN_cuGetErrorName_v6000
72
+ #define PFN_cuInit PFN_cuInit_v2000
73
+ #define PFN_cuDriverGetVersion PFN_cuDriverGetVersion_v2020
74
+ #define PFN_cuDeviceGet PFN_cuDeviceGet_v2000
75
+ #define PFN_cuDeviceGetCount PFN_cuDeviceGetCount_v2000
76
+ #define PFN_cuDeviceGetName PFN_cuDeviceGetName_v2000
77
+ #define PFN_cuDeviceGetUuid PFN_cuDeviceGetUuid_v11040
78
+ #define PFN_cuDeviceGetLuid PFN_cuDeviceGetLuid_v10000
79
+ #define PFN_cuDeviceTotalMem PFN_cuDeviceTotalMem_v3020
80
+ #define PFN_cuDeviceGetTexture1DLinearMaxWidth PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
81
+ #define PFN_cuDeviceGetAttribute PFN_cuDeviceGetAttribute_v2000
82
+ #define PFN_cuDeviceGetNvSciSyncAttributes PFN_cuDeviceGetNvSciSyncAttributes_v10020
83
+ #define PFN_cuDeviceSetMemPool PFN_cuDeviceSetMemPool_v11020
84
+ #define PFN_cuDeviceGetMemPool PFN_cuDeviceGetMemPool_v11020
85
+ #define PFN_cuDeviceGetDefaultMemPool PFN_cuDeviceGetDefaultMemPool_v11020
86
+ #define PFN_cuDeviceGetProperties PFN_cuDeviceGetProperties_v2000
87
+ #define PFN_cuDeviceComputeCapability PFN_cuDeviceComputeCapability_v2000
88
+ #define PFN_cuDevicePrimaryCtxRetain PFN_cuDevicePrimaryCtxRetain_v7000
89
+ #define PFN_cuDevicePrimaryCtxRelease PFN_cuDevicePrimaryCtxRelease_v11000
90
+ #define PFN_cuDevicePrimaryCtxSetFlags PFN_cuDevicePrimaryCtxSetFlags_v11000
91
+ #define PFN_cuDevicePrimaryCtxGetState PFN_cuDevicePrimaryCtxGetState_v7000
92
+ #define PFN_cuDevicePrimaryCtxReset PFN_cuDevicePrimaryCtxReset_v11000
93
+ #define PFN_cuDeviceGetExecAffinitySupport PFN_cuDeviceGetExecAffinitySupport_v11040
94
+ #define PFN_cuCtxCreate PFN_cuCtxCreate_v11040
95
+ #define PFN_cuCtxDestroy PFN_cuCtxDestroy_v4000
96
+ #define PFN_cuCtxPushCurrent PFN_cuCtxPushCurrent_v4000
97
+ #define PFN_cuCtxPopCurrent PFN_cuCtxPopCurrent_v4000
98
+ #define PFN_cuCtxSetCurrent PFN_cuCtxSetCurrent_v4000
99
+ #define PFN_cuCtxGetCurrent PFN_cuCtxGetCurrent_v4000
100
+ #define PFN_cuCtxGetDevice PFN_cuCtxGetDevice_v2000
101
+ #define PFN_cuCtxGetFlags PFN_cuCtxGetFlags_v7000
102
+ #define PFN_cuCtxSynchronize PFN_cuCtxSynchronize_v2000
103
+ #define PFN_cuCtxSetLimit PFN_cuCtxSetLimit_v3010
104
+ #define PFN_cuCtxGetLimit PFN_cuCtxGetLimit_v3010
105
+ #define PFN_cuCtxGetCacheConfig PFN_cuCtxGetCacheConfig_v3020
106
+ #define PFN_cuCtxSetCacheConfig PFN_cuCtxSetCacheConfig_v3020
107
+ #define PFN_cuCtxGetSharedMemConfig PFN_cuCtxGetSharedMemConfig_v4020
108
+ #define PFN_cuCtxSetSharedMemConfig PFN_cuCtxSetSharedMemConfig_v4020
109
+ #define PFN_cuCtxGetApiVersion PFN_cuCtxGetApiVersion_v3020
110
+ #define PFN_cuCtxGetStreamPriorityRange PFN_cuCtxGetStreamPriorityRange_v5050
111
+ #define PFN_cuCtxResetPersistingL2Cache PFN_cuCtxResetPersistingL2Cache_v11000
112
+ #define PFN_cuCtxAttach PFN_cuCtxAttach_v2000
113
+ #define PFN_cuCtxDetach PFN_cuCtxDetach_v2000
114
+ #define PFN_cuCtxGetExecAffinity PFN_cuCtxGetExecAffinity_v11040
115
+ #define PFN_cuModuleLoad PFN_cuModuleLoad_v2000
116
+ #define PFN_cuModuleLoadData PFN_cuModuleLoadData_v2000
117
+ #define PFN_cuModuleLoadDataEx PFN_cuModuleLoadDataEx_v2010
118
+ #define PFN_cuModuleLoadFatBinary PFN_cuModuleLoadFatBinary_v2000
119
+ #define PFN_cuModuleUnload PFN_cuModuleUnload_v2000
120
+ #define PFN_cuModuleGetFunction PFN_cuModuleGetFunction_v2000
121
+ #define PFN_cuModuleGetGlobal PFN_cuModuleGetGlobal_v3020
122
+ #define PFN_cuModuleGetTexRef PFN_cuModuleGetTexRef_v2000
123
+ #define PFN_cuModuleGetSurfRef PFN_cuModuleGetSurfRef_v3000
124
+ #define PFN_cuLinkCreate PFN_cuLinkCreate_v6050
125
+ #define PFN_cuLinkAddData PFN_cuLinkAddData_v6050
126
+ #define PFN_cuLinkAddFile PFN_cuLinkAddFile_v6050
127
+ #define PFN_cuLinkComplete PFN_cuLinkComplete_v5050
128
+ #define PFN_cuLinkDestroy PFN_cuLinkDestroy_v5050
129
+ #define PFN_cuMemGetInfo PFN_cuMemGetInfo_v3020
130
+ #define PFN_cuMemAlloc PFN_cuMemAlloc_v3020
131
+ #define PFN_cuMemAllocPitch PFN_cuMemAllocPitch_v3020
132
+ #define PFN_cuMemFree PFN_cuMemFree_v3020
133
+ #define PFN_cuMemGetAddressRange PFN_cuMemGetAddressRange_v3020
134
+ #define PFN_cuMemAllocHost PFN_cuMemAllocHost_v3020
135
+ #define PFN_cuMemFreeHost PFN_cuMemFreeHost_v2000
136
+ #define PFN_cuMemHostAlloc PFN_cuMemHostAlloc_v2020
137
+ #define PFN_cuMemHostGetDevicePointer PFN_cuMemHostGetDevicePointer_v3020
138
+ #define PFN_cuMemHostGetFlags PFN_cuMemHostGetFlags_v2030
139
+ #define PFN_cuMemAllocManaged PFN_cuMemAllocManaged_v6000
140
+ #define PFN_cuDeviceGetByPCIBusId PFN_cuDeviceGetByPCIBusId_v4010
141
+ #define PFN_cuDeviceGetPCIBusId PFN_cuDeviceGetPCIBusId_v4010
142
+ #define PFN_cuIpcGetEventHandle PFN_cuIpcGetEventHandle_v4010
143
+ #define PFN_cuIpcOpenEventHandle PFN_cuIpcOpenEventHandle_v4010
144
+ #define PFN_cuIpcGetMemHandle PFN_cuIpcGetMemHandle_v4010
145
+ #define PFN_cuIpcOpenMemHandle PFN_cuIpcOpenMemHandle_v11000
146
+ #define PFN_cuIpcCloseMemHandle PFN_cuIpcCloseMemHandle_v4010
147
+ #define PFN_cuMemHostRegister PFN_cuMemHostRegister_v6050
148
+ #define PFN_cuMemHostUnregister PFN_cuMemHostUnregister_v4000
149
+ #define PFN_cuMemcpy __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
150
+ #define PFN_cuMemcpyPeer __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
151
+ #define PFN_cuMemcpyHtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
152
+ #define PFN_cuMemcpyDtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
153
+ #define PFN_cuMemcpyDtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
154
+ #define PFN_cuMemcpyDtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
155
+ #define PFN_cuMemcpyAtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
156
+ #define PFN_cuMemcpyHtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
157
+ #define PFN_cuMemcpyAtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
158
+ #define PFN_cuMemcpyAtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
159
+ #define PFN_cuMemcpy2D __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
160
+ #define PFN_cuMemcpy2DUnaligned __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
161
+ #define PFN_cuMemcpy3D __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
162
+ #define PFN_cuMemcpy3DPeer __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
163
+ #define PFN_cuMemcpyAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
164
+ #define PFN_cuMemcpyPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
165
+ #define PFN_cuMemcpyHtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
166
+ #define PFN_cuMemcpyDtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
167
+ #define PFN_cuMemcpyDtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
168
+ #define PFN_cuMemcpyHtoAAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
169
+ #define PFN_cuMemcpyAtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
170
+ #define PFN_cuMemcpy2DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
171
+ #define PFN_cuMemcpy3DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
172
+ #define PFN_cuMemcpy3DPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
173
+ #define PFN_cuMemsetD8 __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
174
+ #define PFN_cuMemsetD16 __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
175
+ #define PFN_cuMemsetD32 __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
176
+ #define PFN_cuMemsetD2D8 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
177
+ #define PFN_cuMemsetD2D16 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
178
+ #define PFN_cuMemsetD2D32 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
179
+ #define PFN_cuMemsetD8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
180
+ #define PFN_cuMemsetD16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
181
+ #define PFN_cuMemsetD32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
182
+ #define PFN_cuMemsetD2D8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
183
+ #define PFN_cuMemsetD2D16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
184
+ #define PFN_cuMemsetD2D32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
185
+ #define PFN_cuArrayCreate PFN_cuArrayCreate_v3020
186
+ #define PFN_cuArrayGetDescriptor PFN_cuArrayGetDescriptor_v3020
187
+ #define PFN_cuArrayGetSparseProperties PFN_cuArrayGetSparseProperties_v11010
188
+ #define PFN_cuMipmappedArrayGetSparseProperties PFN_cuMipmappedArrayGetSparseProperties_v11010
189
+ #define PFN_cuArrayGetMemoryRequirements PFN_cuArrayGetMemoryRequirements_v11060
190
+ #define PFN_cuMipmappedArrayGetMemoryRequirements PFN_cuMipmappedArrayGetMemoryRequirements_v11060
191
+ #define PFN_cuArrayGetPlane PFN_cuArrayGetPlane_v11020
192
+ #define PFN_cuArrayDestroy PFN_cuArrayDestroy_v2000
193
+ #define PFN_cuArray3DCreate PFN_cuArray3DCreate_v3020
194
+ #define PFN_cuArray3DGetDescriptor PFN_cuArray3DGetDescriptor_v3020
195
+ #define PFN_cuMipmappedArrayCreate PFN_cuMipmappedArrayCreate_v5000
196
+ #define PFN_cuMipmappedArrayGetLevel PFN_cuMipmappedArrayGetLevel_v5000
197
+ #define PFN_cuMipmappedArrayDestroy PFN_cuMipmappedArrayDestroy_v5000
198
+ #define PFN_cuMemAddressReserve PFN_cuMemAddressReserve_v10020
199
+ #define PFN_cuMemAddressFree PFN_cuMemAddressFree_v10020
200
+ #define PFN_cuMemCreate PFN_cuMemCreate_v10020
201
+ #define PFN_cuMemRelease PFN_cuMemRelease_v10020
202
+ #define PFN_cuMemMap PFN_cuMemMap_v10020
203
+ #define PFN_cuMemMapArrayAsync __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
204
+ #define PFN_cuMemUnmap PFN_cuMemUnmap_v10020
205
+ #define PFN_cuMemSetAccess PFN_cuMemSetAccess_v10020
206
+ #define PFN_cuMemGetAccess PFN_cuMemGetAccess_v10020
207
+ #define PFN_cuMemExportToShareableHandle PFN_cuMemExportToShareableHandle_v10020
208
+ #define PFN_cuMemImportFromShareableHandle PFN_cuMemImportFromShareableHandle_v10020
209
+ #define PFN_cuMemGetAllocationGranularity PFN_cuMemGetAllocationGranularity_v10020
210
+ #define PFN_cuMemGetAllocationPropertiesFromHandle PFN_cuMemGetAllocationPropertiesFromHandle_v10020
211
+ #define PFN_cuMemRetainAllocationHandle PFN_cuMemRetainAllocationHandle_v11000
212
+ #define PFN_cuMemFreeAsync __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
213
+ #define PFN_cuMemAllocAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
214
+ #define PFN_cuMemPoolTrimTo PFN_cuMemPoolTrimTo_v11020
215
+ #define PFN_cuMemPoolSetAttribute PFN_cuMemPoolSetAttribute_v11020
216
+ #define PFN_cuMemPoolGetAttribute PFN_cuMemPoolGetAttribute_v11020
217
+ #define PFN_cuMemPoolSetAccess PFN_cuMemPoolSetAccess_v11020
218
+ #define PFN_cuMemPoolGetAccess PFN_cuMemPoolGetAccess_v11020
219
+ #define PFN_cuMemPoolCreate PFN_cuMemPoolCreate_v11020
220
+ #define PFN_cuMemPoolDestroy PFN_cuMemPoolDestroy_v11020
221
+ #define PFN_cuMemAllocFromPoolAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
222
+ #define PFN_cuMemPoolExportToShareableHandle PFN_cuMemPoolExportToShareableHandle_v11020
223
+ #define PFN_cuMemPoolImportFromShareableHandle PFN_cuMemPoolImportFromShareableHandle_v11020
224
+ #define PFN_cuMemPoolExportPointer PFN_cuMemPoolExportPointer_v11020
225
+ #define PFN_cuMemPoolImportPointer PFN_cuMemPoolImportPointer_v11020
226
+ #define PFN_cuPointerGetAttribute PFN_cuPointerGetAttribute_v4000
227
+ #define PFN_cuMemPrefetchAsync __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
228
+ #define PFN_cuMemAdvise PFN_cuMemAdvise_v8000
229
+ #define PFN_cuMemRangeGetAttribute PFN_cuMemRangeGetAttribute_v8000
230
+ #define PFN_cuMemRangeGetAttributes PFN_cuMemRangeGetAttributes_v8000
231
+ #define PFN_cuPointerSetAttribute PFN_cuPointerSetAttribute_v6000
232
+ #define PFN_cuPointerGetAttributes PFN_cuPointerGetAttributes_v7000
233
+ #define PFN_cuStreamCreate PFN_cuStreamCreate_v2000
234
+ #define PFN_cuStreamCreateWithPriority PFN_cuStreamCreateWithPriority_v5050
235
+ #define PFN_cuStreamGetPriority __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
236
+ #define PFN_cuStreamGetFlags __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
237
+ #define PFN_cuStreamGetCtx __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
238
+ #define PFN_cuStreamWaitEvent __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
239
+ #define PFN_cuStreamAddCallback __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
240
+ #define PFN_cuStreamBeginCapture __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
241
+ #define PFN_cuThreadExchangeStreamCaptureMode PFN_cuThreadExchangeStreamCaptureMode_v10010
242
+ #define PFN_cuStreamEndCapture __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
243
+ #define PFN_cuStreamIsCapturing __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
244
+ #define PFN_cuStreamGetCaptureInfo __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
245
+ #define PFN_cuStreamGetCaptureInfo_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
246
+ #define PFN_cuStreamUpdateCaptureDependencies __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
247
+ #define PFN_cuStreamAttachMemAsync __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
248
+ #define PFN_cuStreamQuery __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
249
+ #define PFN_cuStreamSynchronize __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
250
+ #define PFN_cuStreamDestroy PFN_cuStreamDestroy_v4000
251
+ #define PFN_cuStreamCopyAttributes __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
252
+ #define PFN_cuStreamGetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
253
+ #define PFN_cuStreamSetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
254
+ #define PFN_cuEventCreate PFN_cuEventCreate_v2000
255
+ #define PFN_cuEventRecord __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
256
+ #define PFN_cuEventRecordWithFlags __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
257
+ #define PFN_cuEventQuery PFN_cuEventQuery_v2000
258
+ #define PFN_cuEventSynchronize PFN_cuEventSynchronize_v2000
259
+ #define PFN_cuEventDestroy PFN_cuEventDestroy_v4000
260
+ #define PFN_cuEventElapsedTime PFN_cuEventElapsedTime_v2000
261
+ #define PFN_cuImportExternalMemory PFN_cuImportExternalMemory_v10000
262
+ #define PFN_cuExternalMemoryGetMappedBuffer PFN_cuExternalMemoryGetMappedBuffer_v10000
263
+ #define PFN_cuExternalMemoryGetMappedMipmappedArray PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
264
+ #define PFN_cuDestroyExternalMemory PFN_cuDestroyExternalMemory_v10000
265
+ #define PFN_cuImportExternalSemaphore PFN_cuImportExternalSemaphore_v10000
266
+ #define PFN_cuSignalExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
267
+ #define PFN_cuWaitExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
268
+ #define PFN_cuDestroyExternalSemaphore PFN_cuDestroyExternalSemaphore_v10000
269
+ #define PFN_cuStreamWaitValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
270
+ #define PFN_cuStreamWaitValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
271
+ #define PFN_cuStreamWriteValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
272
+ #define PFN_cuStreamWriteValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
273
+ #define PFN_cuStreamBatchMemOp __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
274
+ #define PFN_cuStreamWaitValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070)
275
+ #define PFN_cuStreamWaitValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070)
276
+ #define PFN_cuStreamWriteValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070)
277
+ #define PFN_cuStreamWriteValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070)
278
+ #define PFN_cuStreamBatchMemOp_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070)
279
+ #define PFN_cuFuncGetAttribute PFN_cuFuncGetAttribute_v2020
280
+ #define PFN_cuFuncSetAttribute PFN_cuFuncSetAttribute_v9000
281
+ #define PFN_cuFuncSetCacheConfig PFN_cuFuncSetCacheConfig_v3000
282
+ #define PFN_cuFuncSetSharedMemConfig PFN_cuFuncSetSharedMemConfig_v4020
283
+ #define PFN_cuLaunchKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
284
+ #define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060)
285
+ #define PFN_cuLaunchCooperativeKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
286
+ #define PFN_cuLaunchCooperativeKernelMultiDevice PFN_cuLaunchCooperativeKernelMultiDevice_v9000
287
+ #define PFN_cuLaunchHostFunc __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
288
+ #define PFN_cuFuncSetBlockShape PFN_cuFuncSetBlockShape_v2000
289
+ #define PFN_cuFuncSetSharedSize PFN_cuFuncSetSharedSize_v2000
290
+ #define PFN_cuParamSetSize PFN_cuParamSetSize_v2000
291
+ #define PFN_cuParamSeti PFN_cuParamSeti_v2000
292
+ #define PFN_cuParamSetf PFN_cuParamSetf_v2000
293
+ #define PFN_cuParamSetv PFN_cuParamSetv_v2000
294
+ #define PFN_cuLaunch PFN_cuLaunch_v2000
295
+ #define PFN_cuLaunchGrid PFN_cuLaunchGrid_v2000
296
+ #define PFN_cuLaunchGridAsync PFN_cuLaunchGridAsync_v2000
297
+ #define PFN_cuParamSetTexRef PFN_cuParamSetTexRef_v2000
298
+ #define PFN_cuGraphCreate PFN_cuGraphCreate_v10000
299
+ #define PFN_cuGraphAddKernelNode PFN_cuGraphAddKernelNode_v10000
300
+ #define PFN_cuGraphKernelNodeGetParams PFN_cuGraphKernelNodeGetParams_v10000
301
+ #define PFN_cuGraphKernelNodeSetParams PFN_cuGraphKernelNodeSetParams_v10000
302
+ #define PFN_cuGraphAddMemcpyNode PFN_cuGraphAddMemcpyNode_v10000
303
+ #define PFN_cuGraphMemcpyNodeGetParams PFN_cuGraphMemcpyNodeGetParams_v10000
304
+ #define PFN_cuGraphMemcpyNodeSetParams PFN_cuGraphMemcpyNodeSetParams_v10000
305
+ #define PFN_cuGraphAddMemsetNode PFN_cuGraphAddMemsetNode_v10000
306
+ #define PFN_cuGraphMemsetNodeGetParams PFN_cuGraphMemsetNodeGetParams_v10000
307
+ #define PFN_cuGraphMemsetNodeSetParams PFN_cuGraphMemsetNodeSetParams_v10000
308
+ #define PFN_cuGraphAddHostNode PFN_cuGraphAddHostNode_v10000
309
+ #define PFN_cuGraphHostNodeGetParams PFN_cuGraphHostNodeGetParams_v10000
310
+ #define PFN_cuGraphHostNodeSetParams PFN_cuGraphHostNodeSetParams_v10000
311
+ #define PFN_cuGraphAddChildGraphNode PFN_cuGraphAddChildGraphNode_v10000
312
+ #define PFN_cuGraphChildGraphNodeGetGraph PFN_cuGraphChildGraphNodeGetGraph_v10000
313
+ #define PFN_cuGraphAddEmptyNode PFN_cuGraphAddEmptyNode_v10000
314
+ #define PFN_cuGraphAddEventRecordNode PFN_cuGraphAddEventRecordNode_v11010
315
+ #define PFN_cuGraphEventRecordNodeGetEvent PFN_cuGraphEventRecordNodeGetEvent_v11010
316
+ #define PFN_cuGraphEventRecordNodeSetEvent PFN_cuGraphEventRecordNodeSetEvent_v11010
317
+ #define PFN_cuGraphAddEventWaitNode PFN_cuGraphAddEventWaitNode_v11010
318
+ #define PFN_cuGraphEventWaitNodeGetEvent PFN_cuGraphEventWaitNodeGetEvent_v11010
319
+ #define PFN_cuGraphEventWaitNodeSetEvent PFN_cuGraphEventWaitNodeSetEvent_v11010
320
+ #define PFN_cuGraphAddExternalSemaphoresSignalNode PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
321
+ #define PFN_cuGraphExternalSemaphoresSignalNodeGetParams PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
322
+ #define PFN_cuGraphExternalSemaphoresSignalNodeSetParams PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
323
+ #define PFN_cuGraphAddExternalSemaphoresWaitNode PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
324
+ #define PFN_cuGraphExternalSemaphoresWaitNodeGetParams PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
325
+ #define PFN_cuGraphExternalSemaphoresWaitNodeSetParams PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
326
+ #define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070
327
+ #define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070
328
+ #define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070
329
+ #define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070
330
+ #define PFN_cuGraphClone PFN_cuGraphClone_v10000
331
+ #define PFN_cuGraphNodeFindInClone PFN_cuGraphNodeFindInClone_v10000
332
+ #define PFN_cuGraphNodeGetType PFN_cuGraphNodeGetType_v10000
333
+ #define PFN_cuGraphGetNodes PFN_cuGraphGetNodes_v10000
334
+ #define PFN_cuGraphGetRootNodes PFN_cuGraphGetRootNodes_v10000
335
+ #define PFN_cuGraphGetEdges PFN_cuGraphGetEdges_v10000
336
+ #define PFN_cuGraphNodeGetDependencies PFN_cuGraphNodeGetDependencies_v10000
337
+ #define PFN_cuGraphNodeGetDependentNodes PFN_cuGraphNodeGetDependentNodes_v10000
338
+ #define PFN_cuGraphAddDependencies PFN_cuGraphAddDependencies_v10000
339
+ #define PFN_cuGraphRemoveDependencies PFN_cuGraphRemoveDependencies_v10000
340
+ #define PFN_cuGraphDestroyNode PFN_cuGraphDestroyNode_v10000
341
+ #define PFN_cuGraphInstantiate PFN_cuGraphInstantiate_v11000
342
+ #define PFN_cuGraphInstantiateWithFlags PFN_cuGraphInstantiateWithFlags_v11040
343
+ #define PFN_cuGraphExecKernelNodeSetParams PFN_cuGraphExecKernelNodeSetParams_v10010
344
+ #define PFN_cuGraphExecMemcpyNodeSetParams PFN_cuGraphExecMemcpyNodeSetParams_v10020
345
+ #define PFN_cuGraphExecMemsetNodeSetParams PFN_cuGraphExecMemsetNodeSetParams_v10020
346
+ #define PFN_cuGraphExecHostNodeSetParams PFN_cuGraphExecHostNodeSetParams_v10020
347
+ #define PFN_cuGraphExecChildGraphNodeSetParams PFN_cuGraphExecChildGraphNodeSetParams_v11010
348
+ #define PFN_cuGraphExecEventRecordNodeSetEvent PFN_cuGraphExecEventRecordNodeSetEvent_v11010
349
+ #define PFN_cuGraphExecEventWaitNodeSetEvent PFN_cuGraphExecEventWaitNodeSetEvent_v11010
350
+ #define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
351
+ #define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
352
+ #define PFN_cuGraphUpload __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
353
+ #define PFN_cuGraphLaunch __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
354
+ #define PFN_cuGraphExecDestroy PFN_cuGraphExecDestroy_v10000
355
+ #define PFN_cuGraphDestroy PFN_cuGraphDestroy_v10000
356
+ #define PFN_cuGraphExecUpdate PFN_cuGraphExecUpdate_v10020
357
+ #define PFN_cuGraphKernelNodeCopyAttributes PFN_cuGraphKernelNodeCopyAttributes_v11000
358
+ #define PFN_cuGraphKernelNodeGetAttribute PFN_cuGraphKernelNodeGetAttribute_v11000
359
+ #define PFN_cuGraphKernelNodeSetAttribute PFN_cuGraphKernelNodeSetAttribute_v11000
360
+ #define PFN_cuGraphDebugDotPrint PFN_cuGraphDebugDotPrint_v11030
361
+ #define PFN_cuGraphAddMemAllocNode PFN_cuGraphAddMemAllocNode_v11040
362
+ #define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
363
+ #define PFN_cuGraphAddMemFreeNode PFN_cuGraphAddMemFreeNode_v11040
364
+ #define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
365
+ #define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
366
+ #define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
367
+ #define PFN_cuDeviceGraphMemTrim PFN_cuDeviceGraphMemTrim_v11040
368
+ #define PFN_cuDeviceGetGraphMemAttribute PFN_cuDeviceGetGraphMemAttribute_v11040
369
+ #define PFN_cuDeviceSetGraphMemAttribute PFN_cuDeviceSetGraphMemAttribute_v11040
370
+ #define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
371
+ #define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
372
+ #define PFN_cuOccupancyMaxPotentialBlockSize PFN_cuOccupancyMaxPotentialBlockSize_v6050
373
+ #define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
374
+ #define PFN_cuOccupancyAvailableDynamicSMemPerBlock PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
375
+ #define PFN_cuOccupancyMaxPotentialClusterSize PFN_cuOccupancyMaxPotentialClusterSize_v11070
376
+ #define PFN_cuOccupancyMaxActiveClusters PFN_cuOccupancyMaxActiveClusters_v11070
377
+ #define PFN_cuTexRefSetArray PFN_cuTexRefSetArray_v2000
378
+ #define PFN_cuTexRefSetMipmappedArray PFN_cuTexRefSetMipmappedArray_v5000
379
+ #define PFN_cuTexRefSetAddress PFN_cuTexRefSetAddress_v3020
380
+ #define PFN_cuTexRefSetAddress2D PFN_cuTexRefSetAddress2D_v4010
381
+ #define PFN_cuTexRefSetFormat PFN_cuTexRefSetFormat_v2000
382
+ #define PFN_cuTexRefSetAddressMode PFN_cuTexRefSetAddressMode_v2000
383
+ #define PFN_cuTexRefSetFilterMode PFN_cuTexRefSetFilterMode_v2000
384
+ #define PFN_cuTexRefSetMipmapFilterMode PFN_cuTexRefSetMipmapFilterMode_v5000
385
+ #define PFN_cuTexRefSetMipmapLevelBias PFN_cuTexRefSetMipmapLevelBias_v5000
386
+ #define PFN_cuTexRefSetMipmapLevelClamp PFN_cuTexRefSetMipmapLevelClamp_v5000
387
+ #define PFN_cuTexRefSetMaxAnisotropy PFN_cuTexRefSetMaxAnisotropy_v5000
388
+ #define PFN_cuTexRefSetBorderColor PFN_cuTexRefSetBorderColor_v8000
389
+ #define PFN_cuTexRefSetFlags PFN_cuTexRefSetFlags_v2000
390
+ #define PFN_cuTexRefGetAddress PFN_cuTexRefGetAddress_v3020
391
+ #define PFN_cuTexRefGetArray PFN_cuTexRefGetArray_v2000
392
+ #define PFN_cuTexRefGetMipmappedArray PFN_cuTexRefGetMipmappedArray_v5000
393
+ #define PFN_cuTexRefGetAddressMode PFN_cuTexRefGetAddressMode_v2000
394
+ #define PFN_cuTexRefGetFilterMode PFN_cuTexRefGetFilterMode_v2000
395
+ #define PFN_cuTexRefGetFormat PFN_cuTexRefGetFormat_v2000
396
+ #define PFN_cuTexRefGetMipmapFilterMode PFN_cuTexRefGetMipmapFilterMode_v5000
397
+ #define PFN_cuTexRefGetMipmapLevelBias PFN_cuTexRefGetMipmapLevelBias_v5000
398
+ #define PFN_cuTexRefGetMipmapLevelClamp PFN_cuTexRefGetMipmapLevelClamp_v5000
399
+ #define PFN_cuTexRefGetMaxAnisotropy PFN_cuTexRefGetMaxAnisotropy_v5000
400
+ #define PFN_cuTexRefGetBorderColor PFN_cuTexRefGetBorderColor_v8000
401
+ #define PFN_cuTexRefGetFlags PFN_cuTexRefGetFlags_v2000
402
+ #define PFN_cuTexRefCreate PFN_cuTexRefCreate_v2000
403
+ #define PFN_cuTexRefDestroy PFN_cuTexRefDestroy_v2000
404
+ #define PFN_cuSurfRefSetArray PFN_cuSurfRefSetArray_v3000
405
+ #define PFN_cuSurfRefGetArray PFN_cuSurfRefGetArray_v3000
406
+ #define PFN_cuTexObjectCreate PFN_cuTexObjectCreate_v5000
407
+ #define PFN_cuTexObjectDestroy PFN_cuTexObjectDestroy_v5000
408
+ #define PFN_cuTexObjectGetResourceDesc PFN_cuTexObjectGetResourceDesc_v5000
409
+ #define PFN_cuTexObjectGetTextureDesc PFN_cuTexObjectGetTextureDesc_v5000
410
+ #define PFN_cuTexObjectGetResourceViewDesc PFN_cuTexObjectGetResourceViewDesc_v5000
411
+ #define PFN_cuSurfObjectCreate PFN_cuSurfObjectCreate_v5000
412
+ #define PFN_cuSurfObjectDestroy PFN_cuSurfObjectDestroy_v5000
413
+ #define PFN_cuSurfObjectGetResourceDesc PFN_cuSurfObjectGetResourceDesc_v5000
414
+ #define PFN_cuDeviceCanAccessPeer PFN_cuDeviceCanAccessPeer_v4000
415
+ #define PFN_cuCtxEnablePeerAccess PFN_cuCtxEnablePeerAccess_v4000
416
+ #define PFN_cuCtxDisablePeerAccess PFN_cuCtxDisablePeerAccess_v4000
417
+ #define PFN_cuDeviceGetP2PAttribute PFN_cuDeviceGetP2PAttribute_v8000
418
+ #define PFN_cuGraphicsUnregisterResource PFN_cuGraphicsUnregisterResource_v3000
419
+ #define PFN_cuGraphicsSubResourceGetMappedArray PFN_cuGraphicsSubResourceGetMappedArray_v3000
420
+ #define PFN_cuGraphicsResourceGetMappedMipmappedArray PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
421
+ #define PFN_cuGraphicsResourceGetMappedPointer PFN_cuGraphicsResourceGetMappedPointer_v3020
422
+ #define PFN_cuGraphicsResourceSetMapFlags PFN_cuGraphicsResourceSetMapFlags_v6050
423
+ #define PFN_cuGraphicsMapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000)
424
+ #define PFN_cuGraphicsUnmapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
425
+ #define PFN_cuGetExportTable PFN_cuGetExportTable_v3000
426
+ #define PFN_cuFuncGetModule PFN_cuFuncGetModule_v11000
427
+ #define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
428
+ #define PFN_cuGetProcAddress PFN_cuGetProcAddress_v11030
429
+ #define PFN_cuUserObjectCreate PFN_cuUserObjectCreate_v11030
430
+ #define PFN_cuUserObjectRetain PFN_cuUserObjectRetain_v11030
431
+ #define PFN_cuUserObjectRelease PFN_cuUserObjectRelease_v11030
432
+ #define PFN_cuGraphRetainUserObject PFN_cuGraphRetainUserObject_v11030
433
+ #define PFN_cuGraphReleaseUserObject PFN_cuGraphReleaseUserObject_v11030
434
+ #define PFN_cuModuleGetLoadingMode PFN_cuModuleGetLoadingMode_v11070
435
+ #define PFN_cuMemGetHandleForAddressRange PFN_cuMemGetHandleForAddressRange_v11070
436
+
437
+ /*
438
+ * Type definitions for functions defined in cuda.h
439
+ */
440
+ typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr);
441
+ typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
442
+ typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
443
+ typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
444
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
445
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
446
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
447
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
448
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev);
449
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
450
+ typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
451
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
452
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
453
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
454
+ typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
455
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
456
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
457
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
458
+ typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
459
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
460
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
461
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
462
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
463
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
464
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
465
+ typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
466
+ typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
467
+ typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
468
+ typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
469
+ typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
470
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
471
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
472
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
473
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
474
+ typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
475
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
476
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
477
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
478
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
479
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
480
+ typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
481
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
482
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
483
+ typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
484
+ typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
485
+ typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
486
+ typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
487
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname);
488
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
489
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
490
+ typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
491
+ typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
492
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
493
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
494
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
495
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
496
+ typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
497
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
498
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
499
+ typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
500
+ typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
501
+ typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total);
502
+ typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
503
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
504
+ typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
505
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
506
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
507
+ typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
508
+ typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
509
+ typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
510
+ typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
511
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
512
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
513
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
514
+ typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
515
+ typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
516
+ typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
517
+ typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
518
+ typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
519
+ typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
520
+ typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
521
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
522
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
523
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
524
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
525
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
526
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
527
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
528
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
529
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
530
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
531
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
532
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
533
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
534
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
535
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
536
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
537
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
538
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
539
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
540
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
541
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
542
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
543
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
544
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
545
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
546
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
547
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
548
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
549
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
550
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
551
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
552
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
553
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
554
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
555
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
556
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
557
+ typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray);
558
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
559
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
560
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
561
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
562
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
563
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
564
+ typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
565
+ typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
566
+ typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
567
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
568
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
569
+ typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
570
+ typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags);
571
+ typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
572
+ typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
573
+ typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
574
+ typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
575
+ typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
576
+ typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
577
+ typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
578
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
579
+ typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
580
+ typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
581
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
582
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
583
+ typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
584
+ typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream);
585
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
586
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
587
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
588
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
589
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
590
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
591
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
592
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
593
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
594
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
595
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
596
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
597
+ typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
598
+ typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
599
+ typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
600
+ typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
601
+ typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
602
+ typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
603
+ typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
604
+ typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
605
+ typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags);
606
+ typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
607
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
608
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
609
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
610
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
611
+ typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
612
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
613
+ typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
614
+ typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
615
+ typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
616
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
617
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
618
+ typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
619
+ typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
620
+ typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
621
+ typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
622
+ typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
623
+ typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
624
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
625
+ typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
626
+ typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
627
+ typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
628
+ typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
629
+ typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
630
+ typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
631
+ typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
632
+ typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
633
+ typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
634
+ typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
635
+ typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
636
+ typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
637
+ typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
638
+ typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
639
+ typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
640
+ typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
641
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
642
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
643
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
644
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
645
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
646
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
647
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
648
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
649
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
650
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
651
+ typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
652
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
653
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
654
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
655
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
656
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
657
+ typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
658
+ typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags);
659
+ typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
660
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z);
661
+ typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
662
+ typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
663
+ typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value);
664
+ typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
665
+ typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
666
+ typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f);
667
+ typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
668
+ typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream);
669
+ typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
670
+ typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags);
671
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
672
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
673
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
674
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
675
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
676
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
677
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
678
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
679
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
680
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
681
+ typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
682
+ typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
683
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
684
+ typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
685
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
686
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
687
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
688
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
689
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
690
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
691
+ typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
692
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
693
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
694
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
695
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
696
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
697
+ typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
698
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
699
+ typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
700
+ typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
701
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
702
+ typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph);
703
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
704
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
705
+ typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
706
+ typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
707
+ typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
708
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
709
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
710
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
711
+ typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
712
+ typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
713
+ typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
714
+ typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
715
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
716
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
717
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
718
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
719
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
720
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
721
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
722
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
723
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
724
+ typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
725
+ typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
726
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
727
+ typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
728
+ typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
729
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src);
730
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
731
+ typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
732
+ typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
733
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
734
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
735
+ typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
736
+ typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
737
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
738
+ typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
739
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
740
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
741
+ typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
742
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
743
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
744
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
745
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
746
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
747
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
748
+ typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config);
749
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
750
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
751
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes);
752
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
753
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
754
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
755
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
756
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
757
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
758
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
759
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
760
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor);
761
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
762
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
763
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
764
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
765
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
766
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
767
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
768
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
769
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
770
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
771
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
772
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
773
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
774
+ typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
775
+ typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
776
+ typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
777
+ typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
778
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc);
779
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
780
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
781
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
782
+ typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
783
+ typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc);
784
+ typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
785
+ typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
786
+ typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev);
787
+ typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
788
+ typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
789
+ typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
790
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource);
791
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
792
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
793
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
794
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
795
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
796
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
797
+ typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId);
798
+ typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
799
+ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
800
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
801
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
802
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
803
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
804
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
805
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
806
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
807
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
808
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
809
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
810
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
811
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
812
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
813
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
814
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
815
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
816
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
817
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
818
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
819
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
820
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
821
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
822
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
823
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
824
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
825
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
826
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
827
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
828
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
829
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
830
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
831
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
832
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
833
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
834
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
835
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
836
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority);
837
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
838
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
839
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
840
+ typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
841
+ typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
842
+ typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
843
+ typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
844
+ typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
845
+ typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags);
846
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
847
+ typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
848
+ typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
849
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
850
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
851
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
852
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
853
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
854
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
855
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
856
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
857
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
858
+ typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
859
+ typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
860
+ typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
861
+ typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
862
+ typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
863
+ typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
864
+ typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
865
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
866
+ typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
867
+ typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
868
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
869
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
870
+ typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
871
+ typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
872
+ typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
873
+ typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
874
+ typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
875
+ typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
876
+ typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
877
+ typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
878
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
879
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
880
+ typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
881
+ typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
882
+ typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
883
+ typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
884
+ typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
885
+ typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
886
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode);
887
+ typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
888
+
889
+ /*
890
+ * Type definitions for older versioned functions in cuda.h
891
+ */
892
+ #if defined(__CUDA_API_VERSION_INTERNAL)
893
+ typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
894
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
895
+ typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
896
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
897
+ typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
898
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
899
+ typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
900
+ typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
901
+ typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
902
+ typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
903
+ typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
904
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
905
+ typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
906
+ typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
907
+ typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
908
+ typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
909
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
910
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
911
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
912
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
913
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
914
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
915
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
916
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
917
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
918
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
919
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
920
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
921
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
922
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
923
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
924
+ typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
925
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
926
+ typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
927
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
928
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
929
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
930
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
931
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
932
+ typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
933
+ typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
934
+ typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
935
+ typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
936
+ typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
937
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
938
+ typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
939
+ typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
940
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
941
+ typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
942
+ typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
943
+ typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
944
+ typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
945
+ typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
946
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
947
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
948
+ typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
949
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
950
+ typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
951
+ typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
952
+ typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
953
+ #endif
954
+
955
+ #ifdef __cplusplus
956
+ }
957
+ #endif // __cplusplus
958
+
959
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAUTypedefs.h ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAVDPAUTYPEDEFS_H
51
+ #define CUDAVDPAUTYPEDEFS_H
52
+
53
+ // Dependent includes for cudavdpau.h
54
+ #include <vdpau/vdpau.h>
55
+
56
+ #include <cudaVDPAU.h>
57
+
58
+ #ifdef __cplusplus
59
+ extern "C" {
60
+ #endif // __cplusplus
61
+
62
+ /*
63
+ * Macros for the latest version for each driver function in cudaVDPAU.h
64
+ */
65
+ #define PFN_cuVDPAUGetDevice PFN_cuVDPAUGetDevice_v3010
66
+ #define PFN_cuVDPAUCtxCreate PFN_cuVDPAUCtxCreate_v3020
67
+ #define PFN_cuGraphicsVDPAURegisterVideoSurface PFN_cuGraphicsVDPAURegisterVideoSurface_v3010
68
+ #define PFN_cuGraphicsVDPAURegisterOutputSurface PFN_cuGraphicsVDPAURegisterOutputSurface_v3010
69
+
70
+
71
+ /**
72
+ * Type definitions for functions defined in cudaVDPAU.h
73
+ */
74
+ typedef CUresult (CUDAAPI *PFN_cuVDPAUGetDevice_v3010)(CUdevice_v1 *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
75
+ typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3020)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
76
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterVideoSurface_v3010)(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
77
+ typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterOutputSurface_v3010)(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
78
+
79
+ /*
80
+ * Type definitions for older versioned functions in cudaVDPAU.h
81
+ */
82
+ #if defined(__CUDA_API_VERSION_INTERNAL)
83
+ typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3010)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
84
+ #endif
85
+
86
+ #ifdef __cplusplus
87
+ }
88
+ #endif // __cplusplus
89
+
90
+ #endif // file guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp16.hpp ADDED
@@ -0,0 +1,2614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_FP16_HPP__)
51
+ #define __CUDA_FP16_HPP__
52
+
53
+ #if !defined(__CUDA_FP16_H__)
54
+ #error "Do not include this file directly. Instead, include cuda_fp16.h."
55
+ #endif
56
+
57
+ #if !defined(_MSC_VER) && __cplusplus >= 201103L
58
+ # define __CPP_VERSION_AT_LEAST_11_FP16
59
+ #elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
60
+ # define __CPP_VERSION_AT_LEAST_11_FP16
61
+ #endif
62
+
63
+ /* C++11 header for std::move.
64
+ * In RTC mode, std::move is provided implicitly; don't include the header
65
+ */
66
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
67
+ #include <utility>
68
+ #endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
69
+
70
+ /* C++ header for std::memcpy (used for type punning in host-side implementations).
71
+ * When compiling as a CUDA source file memcpy is provided implicitly.
72
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
73
+ */
74
+ #if defined(__cplusplus) && !defined(__CUDACC__)
75
+ #include <cstring>
76
+ #endif /* defined(__cplusplus) && !defined(__CUDACC__) */
77
+
78
+
79
+ /* Set up function decorations */
80
+ #if defined(__CUDACC__)
81
+ #define __CUDA_FP16_DECL__ static __device__ __inline__
82
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
83
+ #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
84
+ #define __CUDA_HOSTDEVICE__ __host__ __device__
85
+ #else /* !defined(__CUDACC__) */
86
+ #if defined(__GNUC__)
87
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
88
+ #else
89
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static
90
+ #endif /* defined(__GNUC__) */
91
+ #define __CUDA_HOSTDEVICE__
92
+ #endif /* defined(__CUDACC_) */
93
+
94
+ /* Set up structure-alignment attribute */
95
+ #if defined(__CUDACC__)
96
+ #define __CUDA_ALIGN__(align) __align__(align)
97
+ #else
98
+ /* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
99
+ #if __cplusplus >= 201103L
100
+ #define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
101
+ #else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
102
+ #if defined(__GNUC__)
103
+ #define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
104
+ #elif defined(_MSC_VER)
105
+ #define __CUDA_ALIGN__(n) __declspec(align(n))
106
+ #else
107
+ #define __CUDA_ALIGN__(n)
108
+ #endif /* defined(__GNUC__) */
109
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
110
+ #endif /* defined(__CUDACC__) */
111
+
112
+ /* Macros to allow half & half2 to be used by inline assembly */
113
+ #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
114
+ #define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
115
+ #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
116
+ #define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
117
+
118
+ /* Macros for half & half2 binary arithmetic */
119
+ #define __BINARY_OP_HALF_MACRO(name) /* do */ {\
120
+ __half val; \
121
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
122
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
123
+ return val; \
124
+ } /* while(0) */
125
+ #define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
126
+ __half2 val; \
127
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
128
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
129
+ return val; \
130
+ } /* while(0) */
131
+ #define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
132
+ __half val; \
133
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
134
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
135
+ return val; \
136
+ } /* while(0) */
137
+ #define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
138
+ __half2 val; \
139
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
140
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
141
+ return val; \
142
+ } /* while(0) */
143
+
144
+ /**
145
+ * Types which allow static initialization of "half" and "half2" until
146
+ * these become an actual builtin. Note this initialization is as a
147
+ * bitfield representation of "half", and not a conversion from short->half.
148
+ * Such a representation will be deprecated in a future version of CUDA.
149
+ * (Note these are visible to non-nvcc compilers, including C-only compilation)
150
+ */
151
+ typedef struct __CUDA_ALIGN__(2) {
152
+ unsigned short x;
153
+ } __half_raw;
154
+
155
+ typedef struct __CUDA_ALIGN__(4) {
156
+ unsigned short x;
157
+ unsigned short y;
158
+ } __half2_raw;
159
+
160
+ /* All other definitions in this file are only visible to C++ compilers */
161
+ #if defined(__cplusplus)
162
+
163
+ /* Hide GCC member initialization list warnings because of host/device in-function init requirement */
164
+ #if defined(__GNUC__)
165
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
166
+ #pragma GCC diagnostic push
167
+ #pragma GCC diagnostic ignored "-Wstrict-aliasing"
168
+ #pragma GCC diagnostic ignored "-Weffc++"
169
+ #endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
170
+ #endif /* defined(__GNUC__) */
171
+
172
+ /* class' : multiple assignment operators specified
173
+ The class has multiple assignment operators of a single type. This warning is informational */
174
+ #if defined(_MSC_VER) && _MSC_VER >= 1500
175
+ #pragma warning( push )
176
+ #pragma warning( disable:4522 )
177
+ #endif /* defined(__GNUC__) */
178
+
179
+ struct __CUDA_ALIGN__(2) __half {
180
+ protected:
181
+ unsigned short __x;
182
+
183
+ public:
184
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
185
+ __half() = default;
186
+ #else
187
+ __CUDA_HOSTDEVICE__ __half() { }
188
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
189
+
190
+ /* Convert to/from __half_raw */
191
+ __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
192
+ __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
193
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
194
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
195
+ __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
196
+ __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
197
+
198
+ #if !defined(__CUDA_NO_HALF_CONVERSIONS__)
199
+
200
+ /* Construct from float/double */
201
+ __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
202
+ __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
203
+
204
+ __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
205
+ __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
206
+
207
+ /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
208
+ __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; }
209
+
210
+ /* Member functions only available to nvcc compilation so far */
211
+ #if defined(__CUDACC__)
212
+ /* Allow automatic construction from types supported natively in hardware */
213
+ /* Note we do avoid constructor init-list because of special host/device compilation rules */
214
+ __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; }
215
+ __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; }
216
+ __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; }
217
+ __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; }
218
+ __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; }
219
+ __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; }
220
+
221
+ /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
222
+ __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); }
223
+ __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = __short2half_rn(val).__x; return *this; }
224
+
225
+ __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); }
226
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
227
+
228
+ __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); }
229
+ __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
230
+
231
+ __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); }
232
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
233
+
234
+ __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); }
235
+ __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
236
+
237
+ __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); }
238
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
239
+
240
+ /* Boolean conversion - note both 0 and -0 must return false */
241
+ __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
242
+ #endif /* defined(__CUDACC__) */
243
+ #endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
244
+ };
245
+
246
+ /* Global-space operator functions are only available to nvcc compilation */
247
+ #if defined(__CUDACC__)
248
+
249
+ /* Arithmetic FP16 operations only supported on arch >= 5.3 */
250
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
251
+ #if !defined(__CUDA_NO_HALF_OPERATORS__)
252
+ /* Some basic arithmetic operations expected of a builtin */
253
+ __device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
254
+ __device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
255
+ __device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
256
+ __device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
257
+
258
+ __device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; }
259
+ __device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
260
+ __device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
261
+ __device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
262
+
263
+ /* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
264
+ __device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; }
265
+ __device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
266
+ __device__ __forceinline__ __half operator++(__half &h, const int ignored)
267
+ {
268
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
269
+ static_cast<void>(ignored);
270
+
271
+ const __half ret = h;
272
+ __half_raw one;
273
+ one.x = 0x3C00U;
274
+ h += one;
275
+ return ret;
276
+ }
277
+ __device__ __forceinline__ __half operator--(__half &h, const int ignored)
278
+ {
279
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
280
+ static_cast<void>(ignored);
281
+
282
+ const __half ret = h;
283
+ __half_raw one;
284
+ one.x = 0x3C00U;
285
+ h -= one;
286
+ return ret;
287
+ }
288
+
289
+ /* Unary plus and inverse operators */
290
+ __device__ __forceinline__ __half operator+(const __half &h) { return h; }
291
+ __device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
292
+
293
+ /* Some basic comparison operations to make it look like a builtin */
294
+ __device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
295
+ __device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); }
296
+ __device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
297
+ __device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
298
+ __device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
299
+ __device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
300
+ #endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
301
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
302
+ #endif /* defined(__CUDACC__) */
303
+
304
+ /* __half2 is visible to non-nvcc host compilers */
305
+ struct __CUDA_ALIGN__(4) __half2 {
306
+ __half x;
307
+ __half y;
308
+
309
+ // All construct/copy/assign/move
310
+ public:
311
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
312
+ __half2() = default;
313
+ __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); }
314
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; }
315
+ #else
316
+ __CUDA_HOSTDEVICE__ __half2() { }
317
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
318
+ __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
319
+ __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); }
320
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; }
321
+
322
+ /* Convert to/from __half2_raw */
323
+ __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); }
324
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; }
325
+ __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; }
326
+ };
327
+
328
+ /* Global-space operator functions are only available to nvcc compilation */
329
+ #if defined(__CUDACC__)
330
+
331
+ /* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
332
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
333
+
334
+ __device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
335
+ __device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
336
+ __device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); }
337
+ __device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); }
338
+
339
+ __device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; }
340
+ __device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
341
+ __device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
342
+ __device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
343
+
344
+ __device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
345
+ __device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
346
+ __device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
347
+ {
348
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
349
+ static_cast<void>(ignored);
350
+
351
+ const __half2 ret = h;
352
+ __half2_raw one;
353
+ one.x = 0x3C00U;
354
+ one.y = 0x3C00U;
355
+ h = __hadd2(h, one);
356
+ return ret;
357
+ }
358
+ __device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored)
359
+ {
360
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
361
+ static_cast<void>(ignored);
362
+
363
+ const __half2 ret = h;
364
+ __half2_raw one;
365
+ one.x = 0x3C00U;
366
+ one.y = 0x3C00U;
367
+ h = __hsub2(h, one);
368
+ return ret;
369
+ }
370
+
371
+ __device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
372
+ __device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
373
+
374
+ __device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
375
+ __device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
376
+ __device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
377
+ __device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
378
+ __device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
379
+ __device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
380
+
381
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
382
+ #endif /* defined(__CUDACC__) */
383
+
384
+ /* Restore warning for multiple assignment operators */
385
+ #if defined(_MSC_VER) && _MSC_VER >= 1500
386
+ #pragma warning( pop )
387
+ #endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
388
+
389
+ /* Restore -Weffc++ warnings from here on */
390
+ #if defined(__GNUC__)
391
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
392
+ #pragma GCC diagnostic pop
393
+ #endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
394
+ #endif /* defined(__GNUC__) */
395
+
396
+ #undef __CUDA_HOSTDEVICE__
397
+ #undef __CUDA_ALIGN__
398
+
399
+ #ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
400
+ static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
401
+ {
402
+ unsigned int x;
403
+ unsigned int u;
404
+ unsigned int result;
405
+ #if defined(__CUDACC__)
406
+ (void)memcpy(&x, &f, sizeof(f));
407
+ #else
408
+ (void)std::memcpy(&x, &f, sizeof(f));
409
+ #endif
410
+ u = (x & 0x7fffffffU);
411
+ sign = ((x >> 16U) & 0x8000U);
412
+ // NaN/+Inf/-Inf
413
+ if (u >= 0x7f800000U) {
414
+ remainder = 0U;
415
+ result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
416
+ } else if (u > 0x477fefffU) { // Overflows
417
+ remainder = 0x80000000U;
418
+ result = (sign | 0x7bffU);
419
+ } else if (u >= 0x38800000U) { // Normal numbers
420
+ remainder = u << 19U;
421
+ u -= 0x38000000U;
422
+ result = (sign | (u >> 13U));
423
+ } else if (u < 0x33000001U) { // +0/-0
424
+ remainder = u;
425
+ result = sign;
426
+ } else { // Denormal numbers
427
+ const unsigned int exponent = u >> 23U;
428
+ const unsigned int shift = 0x7eU - exponent;
429
+ unsigned int mantissa = (u & 0x7fffffU);
430
+ mantissa |= 0x800000U;
431
+ remainder = mantissa << (32U - shift);
432
+ result = (sign | (mantissa >> shift));
433
+ result &= 0x0000FFFFU;
434
+ }
435
+ return static_cast<unsigned short>(result);
436
+ }
437
+ #endif /* #if !defined(__CUDACC_RTC__) */
438
+
439
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
440
+ {
441
+ #if defined(__CUDA_ARCH__)
442
+ __half val;
443
+ asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
444
+ return val;
445
+ #else
446
+ __half result;
447
+ /*
448
+ // Perform rounding to 11 bits of precision, convert value
449
+ // to float and call existing float to half conversion.
450
+ // By pre-rounding to 11 bits we avoid additional rounding
451
+ // in float to half conversion.
452
+ */
453
+ unsigned long long int absa;
454
+ unsigned long long int ua;
455
+ #if defined(__CUDACC__)
456
+ (void)memcpy(&ua, &a, sizeof(a));
457
+ #else
458
+ (void)std::memcpy(&ua, &a, sizeof(a));
459
+ #endif
460
+ absa = (ua & 0x7fffffffffffffffULL);
461
+ if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
462
+ {
463
+ /*
464
+ // |a| >= 2^16 or NaN or |a| <= 2^(-25)
465
+ // double-rounding is not a problem
466
+ */
467
+ result = __float2half(static_cast<float>(a));
468
+ }
469
+ else
470
+ {
471
+ /*
472
+ // here 2^(-25) < |a| < 2^16
473
+ // prepare shifter value such that a + shifter
474
+ // done in double precision performs round-to-nearest-even
475
+ // and (a + shifter) - shifter results in a rounded to
476
+ // 11 bits of precision. Shifter needs to have exponent of
477
+ // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
478
+ // against negative values.
479
+ // So need to have |a| capped to avoid overflow in exponent.
480
+ // For inputs that are smaller than half precision minnorm
481
+ // we prepare fixed shifter exponent.
482
+ */
483
+ unsigned long long shifterBits;
484
+ if (absa >= 0x3f10000000000000ULL)
485
+ {
486
+ /*
487
+ // Here if |a| >= 2^(-14)
488
+ // add 42 to exponent bits
489
+ */
490
+ shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
491
+ }
492
+ else
493
+ {
494
+ /*
495
+ // 2^(-25) < |a| < 2^(-14), potentially results in denormal
496
+ // set exponent bits to 42 - 14 + bias
497
+ */
498
+ shifterBits = 0x41B0000000000000ULL;
499
+ }
500
+ // set leading mantissa bit to protect against negative inputs
501
+ shifterBits |= 0x0008000000000000ULL;
502
+ double shifter;
503
+ #if defined(__CUDACC__)
504
+ (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
505
+ #else
506
+ (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
507
+ #endif
508
+ double aShiftRound = a + shifter;
509
+
510
+ /*
511
+ // Prevent the compiler from optimizing away a + shifter - shifter
512
+ // by doing intermediate memcopy and harmless bitwize operation
513
+ */
514
+ unsigned long long int aShiftRoundBits;
515
+ #if defined(__CUDACC__)
516
+ (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
517
+ #else
518
+ (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
519
+ #endif
520
+
521
+ // the value is positive, so this operation doesn't change anything
522
+ aShiftRoundBits &= 0x7fffffffffffffffULL;
523
+
524
+ #if defined(__CUDACC__)
525
+ (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
526
+ #else
527
+ (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
528
+ #endif
529
+
530
+ result = __float2half(static_cast<float>(aShiftRound - shifter));
531
+ }
532
+
533
+ return result;
534
+ #endif
535
+ }
536
+
537
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
538
+ {
539
+ __half val;
540
+ #if defined(__CUDA_ARCH__)
541
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
542
+ #else
543
+ __half_raw r;
544
+ unsigned int sign = 0U;
545
+ unsigned int remainder = 0U;
546
+ r.x = __internal_float2half(a, sign, remainder);
547
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
548
+ r.x++;
549
+ }
550
+ val = r;
551
+ #endif
552
+ return val;
553
+ }
554
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
555
+ {
556
+ __half val;
557
+ #if defined(__CUDA_ARCH__)
558
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
559
+ #else
560
+ __half_raw r;
561
+ unsigned int sign = 0U;
562
+ unsigned int remainder = 0U;
563
+ r.x = __internal_float2half(a, sign, remainder);
564
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
565
+ r.x++;
566
+ }
567
+ val = r;
568
+ #endif
569
+ return val;
570
+ }
571
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
572
+ {
573
+ __half val;
574
+ #if defined(__CUDA_ARCH__)
575
+ asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
576
+ #else
577
+ __half_raw r;
578
+ unsigned int sign = 0U;
579
+ unsigned int remainder = 0U;
580
+ r.x = __internal_float2half(a, sign, remainder);
581
+ val = r;
582
+ #endif
583
+ return val;
584
+ }
585
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
586
+ {
587
+ __half val;
588
+ #if defined(__CUDA_ARCH__)
589
+ asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
590
+ #else
591
+ __half_raw r;
592
+ unsigned int sign = 0U;
593
+ unsigned int remainder = 0U;
594
+ r.x = __internal_float2half(a, sign, remainder);
595
+ if ((remainder != 0U) && (sign != 0U)) {
596
+ r.x++;
597
+ }
598
+ val = r;
599
+ #endif
600
+ return val;
601
+ }
602
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
603
+ {
604
+ __half val;
605
+ #if defined(__CUDA_ARCH__)
606
+ asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
607
+ #else
608
+ __half_raw r;
609
+ unsigned int sign = 0U;
610
+ unsigned int remainder = 0U;
611
+ r.x = __internal_float2half(a, sign, remainder);
612
+ if ((remainder != 0U) && (sign == 0U)) {
613
+ r.x++;
614
+ }
615
+ val = r;
616
+ #endif
617
+ return val;
618
+ }
619
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
620
+ {
621
+ __half2 val;
622
+ #if defined(__CUDA_ARCH__)
623
+ asm("{.reg .f16 low;\n"
624
+ " cvt.rn.f16.f32 low, %1;\n"
625
+ " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
626
+ #else
627
+ val = __half2(__float2half_rn(a), __float2half_rn(a));
628
+ #endif
629
+ return val;
630
+ }
631
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
632
+ {
633
+ __half2 val;
634
+ #if defined(__CUDA_ARCH__)
635
+ #if (__CUDA_ARCH__ >= 800)
636
+ asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
637
+ : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
638
+ #else
639
+ asm("{.reg .f16 low,high;\n"
640
+ " cvt.rn.f16.f32 low, %1;\n"
641
+ " cvt.rn.f16.f32 high, %2;\n"
642
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
643
+ #endif
644
+ #else
645
+ val = __half2(__float2half_rn(a), __float2half_rn(b));
646
+ #endif
647
+ return val;
648
+ }
649
+
650
+ #ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
651
+ static inline float __internal_half2float(const unsigned short h)
652
+ {
653
+ unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
654
+ unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
655
+ unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
656
+ float f;
657
+ if (exponent == 0x1fU) { /* NaN or Inf */
658
+ /* discard sign of a NaN */
659
+ sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
660
+ mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
661
+ exponent = 0xffU;
662
+ } else if (exponent == 0U) { /* Denorm or Zero */
663
+ if (mantissa != 0U) {
664
+ unsigned int msb;
665
+ exponent = 0x71U;
666
+ do {
667
+ msb = (mantissa & 0x400000U);
668
+ mantissa <<= 1U; /* normalize */
669
+ --exponent;
670
+ } while (msb == 0U);
671
+ mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
672
+ }
673
+ } else {
674
+ exponent += 0x70U;
675
+ }
676
+ const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
677
+ #if defined(__CUDACC__)
678
+ (void)memcpy(&f, &u, sizeof(u));
679
+ #else
680
+ (void)std::memcpy(&f, &u, sizeof(u));
681
+ #endif
682
+ return f;
683
+ }
684
+ #endif /* !defined(__CUDACC_RTC__) */
685
+
686
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
687
+ {
688
+ float val;
689
+ #if defined(__CUDA_ARCH__)
690
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
691
+ #else
692
+ val = __internal_half2float(static_cast<__half_raw>(a).x);
693
+ #endif
694
+ return val;
695
+ }
696
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
697
+ {
698
+ float val;
699
+ #if defined(__CUDA_ARCH__)
700
+ asm("{.reg .f16 low,high;\n"
701
+ " mov.b32 {low,high},%1;\n"
702
+ " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
703
+ #else
704
+ val = __internal_half2float(static_cast<__half2_raw>(a).x);
705
+ #endif
706
+ return val;
707
+ }
708
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
709
+ {
710
+ float val;
711
+ #if defined(__CUDA_ARCH__)
712
+ asm("{.reg .f16 low,high;\n"
713
+ " mov.b32 {low,high},%1;\n"
714
+ " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
715
+ #else
716
+ val = __internal_half2float(static_cast<__half2_raw>(a).y);
717
+ #endif
718
+ return val;
719
+ }
720
+ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
721
+ {
722
+ short int i;
723
+ #if defined __CUDA_ARCH__
724
+ asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
725
+ #else
726
+ const float f = __half2float(h);
727
+ const short int max_val = (short int)0x7fffU;
728
+ const short int min_val = (short int)0x8000U;
729
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
730
+ // saturation fixup
731
+ if (bits > (unsigned short)0xF800U) {
732
+ // NaN
733
+ i = 0;
734
+ } else if (f > static_cast<float>(max_val)) {
735
+ // saturate maximum
736
+ i = max_val;
737
+ } else if (f < static_cast<float>(min_val)) {
738
+ // saturate minimum
739
+ i = min_val;
740
+ } else {
741
+ // normal value, conversion is well-defined
742
+ i = static_cast<short int>(f);
743
+ }
744
+ #endif
745
+ return i;
746
+ }
747
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
748
+ {
749
+ unsigned short int i;
750
+ #if defined __CUDA_ARCH__
751
+ asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
752
+ #else
753
+ const float f = __half2float(h);
754
+ const unsigned short int max_val = 0xffffU;
755
+ const unsigned short int min_val = 0U;
756
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
757
+ // saturation fixup
758
+ if (bits > (unsigned short)0xF800U) {
759
+ // NaN
760
+ i = 0U;
761
+ } else if (f > static_cast<float>(max_val)) {
762
+ // saturate maximum
763
+ i = max_val;
764
+ } else if (f < static_cast<float>(min_val)) {
765
+ // saturate minimum
766
+ i = min_val;
767
+ } else {
768
+ // normal value, conversion is well-defined
769
+ i = static_cast<unsigned short int>(f);
770
+ }
771
+ #endif
772
+ return i;
773
+ }
774
+ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
775
+ {
776
+ int i;
777
+ #if defined __CUDA_ARCH__
778
+ asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
779
+ #else
780
+ const float f = __half2float(h);
781
+ const int max_val = (int)0x7fffffffU;
782
+ const int min_val = (int)0x80000000U;
783
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
784
+ // saturation fixup
785
+ if (bits > (unsigned short)0xF800U) {
786
+ // NaN
787
+ i = 0;
788
+ } else if (f > static_cast<float>(max_val)) {
789
+ // saturate maximum
790
+ i = max_val;
791
+ } else if (f < static_cast<float>(min_val)) {
792
+ // saturate minimum
793
+ i = min_val;
794
+ } else {
795
+ // normal value, conversion is well-defined
796
+ i = static_cast<int>(f);
797
+ }
798
+ #endif
799
+ return i;
800
+ }
801
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
802
+ {
803
+ unsigned int i;
804
+ #if defined __CUDA_ARCH__
805
+ asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
806
+ #else
807
+ const float f = __half2float(h);
808
+ const unsigned int max_val = 0xffffffffU;
809
+ const unsigned int min_val = 0U;
810
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
811
+ // saturation fixup
812
+ if (bits > (unsigned short)0xF800U) {
813
+ // NaN
814
+ i = 0U;
815
+ } else if (f > static_cast<float>(max_val)) {
816
+ // saturate maximum
817
+ i = max_val;
818
+ } else if (f < static_cast<float>(min_val)) {
819
+ // saturate minimum
820
+ i = min_val;
821
+ } else {
822
+ // normal value, conversion is well-defined
823
+ i = static_cast<unsigned int>(f);
824
+ }
825
+ #endif
826
+ return i;
827
+ }
828
+ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
829
+ {
830
+ long long int i;
831
+ #if defined __CUDA_ARCH__
832
+ asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
833
+ #else
834
+ const float f = __half2float(h);
835
+ const long long int max_val = (long long int)0x7fffffffffffffffULL;
836
+ const long long int min_val = (long long int)0x8000000000000000ULL;
837
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
838
+ // saturation fixup
839
+ if (bits > (unsigned short)0xF800U) {
840
+ // NaN
841
+ i = min_val;
842
+ } else if (f > static_cast<float>(max_val)) {
843
+ // saturate maximum
844
+ i = max_val;
845
+ } else if (f < static_cast<float>(min_val)) {
846
+ // saturate minimum
847
+ i = min_val;
848
+ } else {
849
+ // normal value, conversion is well-defined
850
+ i = static_cast<long long int>(f);
851
+ }
852
+ #endif
853
+ return i;
854
+ }
855
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
856
+ {
857
+ unsigned long long int i;
858
+ #if defined __CUDA_ARCH__
859
+ asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
860
+ #else
861
+ const float f = __half2float(h);
862
+ const unsigned long long int max_val = 0xffffffffffffffffULL;
863
+ const unsigned long long int min_val = 0ULL;
864
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
865
+ // saturation fixup
866
+ if (bits > (unsigned short)0xF800U) {
867
+ // NaN
868
+ i = 0x8000000000000000ULL;
869
+ } else if (f > static_cast<float>(max_val)) {
870
+ // saturate maximum
871
+ i = max_val;
872
+ } else if (f < static_cast<float>(min_val)) {
873
+ // saturate minimum
874
+ i = min_val;
875
+ } else {
876
+ // normal value, conversion is well-defined
877
+ i = static_cast<unsigned long long int>(f);
878
+ }
879
+ #endif
880
+ return i;
881
+ }
882
+
883
+ /* Intrinsic functions only available to nvcc compilers */
884
+ #if defined(__CUDACC__)
885
+
886
+ /* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
887
+ __VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y)
888
+ {
889
+ __half2 t; t.x = x; t.y = y; return t;
890
+ }
891
+ #undef __VECTOR_FUNCTIONS_DECL__
892
+
893
+
894
+ /* Definitions of intrinsics */
895
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
896
+ {
897
+ const __half2 val = __floats2half2_rn(a.x, a.y);
898
+ return val;
899
+ }
900
+ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
901
+ {
902
+ float hi_float;
903
+ float lo_float;
904
+ #if defined(__CUDA_ARCH__)
905
+ asm("{.reg .f16 low,high;\n"
906
+ " mov.b32 {low,high},%1;\n"
907
+ " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));
908
+
909
+ asm("{.reg .f16 low,high;\n"
910
+ " mov.b32 {low,high},%1;\n"
911
+ " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));
912
+ #else
913
+ lo_float = __internal_half2float(((__half2_raw)a).x);
914
+ hi_float = __internal_half2float(((__half2_raw)a).y);
915
+ #endif
916
+ return make_float2(lo_float, hi_float);
917
+ }
918
+ __CUDA_FP16_DECL__ int __half2int_rn(const __half h)
919
+ {
920
+ int i;
921
+ asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
922
+ return i;
923
+ }
924
+ __CUDA_FP16_DECL__ int __half2int_rd(const __half h)
925
+ {
926
+ int i;
927
+ asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
928
+ return i;
929
+ }
930
+ __CUDA_FP16_DECL__ int __half2int_ru(const __half h)
931
+ {
932
+ int i;
933
+ asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
934
+ return i;
935
+ }
936
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
937
+ {
938
+ __half h;
939
+ #if defined(__CUDA_ARCH__)
940
+ asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
941
+ #else
942
+ // double-rounding is not a problem here: if integer
943
+ // has more than 24 bits, it is already too large to
944
+ // be represented in half precision, and result will
945
+ // be infinity.
946
+ const float f = static_cast<float>(i);
947
+ h = __float2half_rn(f);
948
+ #endif
949
+ return h;
950
+ }
951
+ __CUDA_FP16_DECL__ __half __int2half_rz(const int i)
952
+ {
953
+ __half h;
954
+ asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
955
+ return h;
956
+ }
957
+ __CUDA_FP16_DECL__ __half __int2half_rd(const int i)
958
+ {
959
+ __half h;
960
+ asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
961
+ return h;
962
+ }
963
+ __CUDA_FP16_DECL__ __half __int2half_ru(const int i)
964
+ {
965
+ __half h;
966
+ asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
967
+ return h;
968
+ }
969
+
970
// __half -> signed 16-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
{
    short int i;
    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 16-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
{
    short int i;
    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 16-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
{
    short int i;
    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Signed 16-bit int -> __half, round-to-nearest-even.
// Host/device: device path uses the hardware cvt; host path converts via
// float, which is exact for all 16-bit integers (float has 24 mantissa bits).
__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
{
    __half h;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
#else
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Signed 16-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __short2half_rz(const short int i)
{
    __half h;
    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Signed 16-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __short2half_rd(const short int i)
{
    __half h;
    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Signed 16-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __short2half_ru(const short int i)
{
    __half h;
    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
1017
+
1018
// __half -> unsigned 32-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
{
    unsigned int i;
    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 32-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
{
    unsigned int i;
    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 32-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
{
    unsigned int i;
    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Unsigned 32-bit int -> __half, round-to-nearest-even.
// Host path goes through float; see the double-rounding note below.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
{
    __half h;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
#else
    // double-rounding is not a problem here: if integer
    // has more than 24 bits, it is already too large to
    // be represented in half precision, and result will
    // be infinity.
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Unsigned 32-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
{
    __half h;
    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
    return h;
}
// Unsigned 32-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
{
    __half h;
    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
    return h;
}
// Unsigned 32-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
{
    __half h;
    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
    return h;
}
1069
+
1070
// __half -> unsigned 16-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
{
    unsigned short int i;
    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 16-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
{
    unsigned short int i;
    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 16-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
{
    unsigned short int i;
    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Unsigned 16-bit int -> __half, round-to-nearest-even.
// Host path converts via float, which is exact for all 16-bit integers.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
{
    __half h;
#if defined __CUDA_ARCH__
    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
#else
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Unsigned 16-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
{
    __half h;
    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Unsigned 16-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
{
    __half h;
    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
// Unsigned 16-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
{
    __half h;
    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
    return h;
}
1117
+
1118
// __half -> unsigned 64-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
{
    unsigned long long int i;
    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 64-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
{
    unsigned long long int i;
    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> unsigned 64-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
{
    unsigned long long int i;
    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Unsigned 64-bit int -> __half, round-to-nearest-even.
// Host path goes through float; see the double-rounding note below.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
{
    __half h;
#if defined(__CUDA_ARCH__)
    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
#else
    // double-rounding is not a problem here: if integer
    // has more than 24 bits, it is already too large to
    // be represented in half precision, and result will
    // be infinity.
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Unsigned 64-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
{
    __half h;
    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Unsigned 64-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
{
    __half h;
    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Unsigned 64-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
{
    __half h;
    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
1169
+
1170
// __half -> signed 64-bit int, round-to-nearest-even integer (PTX cvt .rni).
__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
{
    long long int i;
    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 64-bit int, round-down to integer (PTX cvt .rmi).
__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
{
    long long int i;
    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// __half -> signed 64-bit int, round-up to integer (PTX cvt .rpi).
__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
{
    long long int i;
    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
    return i;
}
// Signed 64-bit int -> __half, round-to-nearest-even.
// Host path goes through float; see the double-rounding note below.
__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
{
    __half h;
#if defined(__CUDA_ARCH__)
    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
#else
    // double-rounding is not a problem here: if integer
    // has more than 24 bits, it is already too large to
    // be represented in half precision, and result will
    // be infinity.
    const float f = static_cast<float>(i);
    h = __float2half_rn(f);
#endif
    return h;
}
// Signed 64-bit int -> __half, round-toward-zero (device only).
__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i)
{
    __half h;
    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Signed 64-bit int -> __half, round-down (device only).
__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i)
{
    __half h;
    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
// Signed 64-bit int -> __half, round-up (device only).
__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i)
{
    __half h;
    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
    return h;
}
1221
+
1222
// Round to integral value toward zero (truncate), result stays in fp16.
__CUDA_FP16_DECL__ __half htrunc(const __half h)
{
    __half r;
    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
// Round to integral value toward +inf (ceiling), result stays in fp16.
__CUDA_FP16_DECL__ __half hceil(const __half h)
{
    __half r;
    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
// Round to integral value toward -inf (floor), result stays in fp16.
__CUDA_FP16_DECL__ __half hfloor(const __half h)
{
    __half r;
    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
// Round to nearest integral value, ties-to-even, result stays in fp16.
__CUDA_FP16_DECL__ __half hrint(const __half h)
{
    __half r;
    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
    return r;
}
1246
+
1247
// Vector (per-lane) truncate of both halves of a __half2: unpacks the 32-bit
// register into two f16 values, applies cvt .rzi to each, and repacks.
__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rzi.f16.f16 low, low;\n"
        "  cvt.rzi.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
// Per-lane ceiling (round toward +inf) of both halves of a __half2.
__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rpi.f16.f16 low, low;\n"
        "  cvt.rpi.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
// Per-lane floor (round toward -inf) of both halves of a __half2.
__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rmi.f16.f16 low, low;\n"
        "  cvt.rmi.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
// Per-lane round-to-nearest-even integral value of both halves of a __half2.
__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  cvt.rni.f16.f16 low, low;\n"
        "  cvt.rni.f16.f16 high, high;\n"
        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
    return val;
}
1287
// Pack the low halves of a and b into a new __half2: {a.low, b.low}.
__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b)
{
    __half2 val;
    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
        "  mov.b32 {alow,ahigh}, %1;\n"
        "  mov.b32 {blow,bhigh}, %2;\n"
        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
    return val;
}
// Pack the high halves of a and b into a new __half2: {a.high, b.high}.
__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b)
{
    __half2 val;
    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
        "  mov.b32 {alow,ahigh}, %1;\n"
        "  mov.b32 {blow,bhigh}, %2;\n"
        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
    return val;
}
// Extract the low 16 bits of a __half2 as a scalar __half.
__CUDA_FP16_DECL__ __half __low2half(const __half2 a)
{
    __half ret;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
    return ret;
}
1313
// Classify infinities by fp16 bit pattern: returns -1 for negative infinity
// (0xFC00), +1 for positive infinity (0x7C00), and 0 for any other value.
__CUDA_FP16_DECL__ int __hisinf(const __half a)
{
    const unsigned short bits = __HALF_TO_CUS(a);
    int retval = 0;
    if (bits == 0xFC00U) {
        retval = -1;
    }
    if (bits == 0x7C00U) {
        retval = 1;
    }
    return retval;
}
1325
// Broadcast the low half of a into both lanes: {a.low, a.low}.
__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
    return val;
}
// Broadcast the high half of a into both lanes: {a.high, a.high}.
__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
    return val;
}
// Extract the high 16 bits of a __half2 as a scalar __half.
__CUDA_FP16_DECL__ __half __high2half(const __half2 a)
{
    __half ret;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
    return ret;
}
// Pack two scalar halves into a __half2: a in the low lane, b in the high lane.
__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b)
{
    __half2 val;
    asm("{  mov.b32 %0, {%1,%2};}\n"
        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b)));
    return val;
}
// Broadcast one scalar half into both lanes of a __half2.
__CUDA_FP16_DECL__ __half2 __half2half2(const __half a)
{
    __half2 val;
    asm("{  mov.b32 %0, {%1,%1};}\n"
        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)));
    return val;
}
// Swap the low and high lanes of a __half2.
__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a)
{
    __half2 val;
    asm("{.reg .f16 low,high;\n"
        "  mov.b32 {low,high}, %1;\n"
        "  mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
    return val;
}
// Reinterpret the 16 raw bits of a __half as a signed short (no conversion).
__CUDA_FP16_DECL__ short int __half_as_short(const __half h)
{
    return static_cast<short int>(__HALF_TO_CUS(h));
}
// Reinterpret the 16 raw bits of a __half as an unsigned short (no conversion).
__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
{
    return __HALF_TO_CUS(h);
}
// Reinterpret a signed short's bits as a __half (no conversion).
__CUDA_FP16_DECL__ __half __short_as_half(const short int i)
{
    __half h;
    __HALF_TO_US(h) = static_cast<unsigned short int>(i);
    return h;
}
// Reinterpret an unsigned short's bits as a __half (no conversion).
__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
{
    __half h;
    __HALF_TO_US(h) = i;
    return h;
}
1391
+
1392
/******************************************************************************
*                           __half arithmetic                                 *
******************************************************************************/
// Maximum of two halves. On SM80+ (and host) uses the native f16 max;
// on older devices falls back to f32 max on widened operands.
__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF_MACRO(max)
#else
    const float fa = __half2float(a);
    const float fb = __half2float(b);
    float fr;
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fr) : "f"(fa), "f"(fb));
    const __half hr = __float2half(fr);
    return hr;
#endif
}
// Minimum of two halves; same SM80 native / pre-SM80 f32-fallback split.
__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF_MACRO(min)
#else
    const float fa = __half2float(a);
    const float fb = __half2float(b);
    float fr;
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fr) : "f"(fa), "f"(fb));
    const __half hr = __float2half(fr);
    return hr;
#endif
}
1423
+
1424
/******************************************************************************
*                          __half2 arithmetic                                 *
******************************************************************************/
// Per-lane maximum of two __half2 values. SM80+ uses the native f16x2 max;
// older devices widen each lane to f32, take max per lane, and repack.
__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF2_MACRO(max)
#else
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    float2 fr;
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
    asm("{max.f32 %0,%1,%2;\n}"
        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
    const __half2 hr = __float22half2_rn(fr);
    return hr;
#endif
}
// Per-lane minimum of two __half2 values; same split as __hmax2.
__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
{
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
    __BINARY_OP_HALF2_MACRO(min)
#else
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    float2 fr;
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
    asm("{min.f32 %0,%1,%2;\n}"
        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
    const __half2 hr = __float22half2_rn(fr);
    return hr;
#endif
}
1459
+
1460
+
1461
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
1462
+ /******************************************************************************
1463
+ * __half, __half2 warp shuffle *
1464
+ ******************************************************************************/
1465
+ #define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
1466
+ __half2 r; \
1467
+ asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
1468
+ :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
1469
+ return r; \
1470
+ } /* while(0) */
1471
+
1472
+ #define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
1473
+ __half2 r; \
1474
+ asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
1475
+ :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
1476
+ return r; \
1477
+ } /* while(0) */
1478
+
1479
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1480
+
1481
+ __CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
1482
+ {
1483
+ unsigned int warp_size;
1484
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1485
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1486
+ __SHUFFLE_HALF2_MACRO(shfl.idx.b32)
1487
+ }
1488
+ __CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
1489
+ {
1490
+ unsigned int warp_size;
1491
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1492
+ const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
1493
+ __SHUFFLE_HALF2_MACRO(shfl.up.b32)
1494
+ }
1495
+ __CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
1496
+ {
1497
+ unsigned int warp_size;
1498
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1499
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1500
+ __SHUFFLE_HALF2_MACRO(shfl.down.b32)
1501
+ }
1502
+ __CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
1503
+ {
1504
+ unsigned int warp_size;
1505
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1506
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1507
+ __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)
1508
+ }
1509
+
1510
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
1511
+
1512
+ __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width)
1513
+ {
1514
+ unsigned int warp_size;
1515
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1516
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1517
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32)
1518
+ }
1519
+ __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
1520
+ {
1521
+ unsigned int warp_size;
1522
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1523
+ const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
1524
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32)
1525
+ }
1526
+ __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
1527
+ {
1528
+ unsigned int warp_size;
1529
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1530
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1531
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32)
1532
+ }
1533
+ __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width)
1534
+ {
1535
+ unsigned int warp_size;
1536
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1537
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1538
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32)
1539
+ }
1540
+
1541
+ #undef __SHUFFLE_HALF2_MACRO
1542
+ #undef __SHUFFLE_SYNC_HALF2_MACRO
1543
+
1544
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1545
+
1546
+ __CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
1547
+ {
1548
+ const __half2 temp1 = __halves2half2(var, var);
1549
+ const __half2 temp2 = __shfl(temp1, delta, width);
1550
+ return __low2half(temp2);
1551
+ }
1552
+ __CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
1553
+ {
1554
+ const __half2 temp1 = __halves2half2(var, var);
1555
+ const __half2 temp2 = __shfl_up(temp1, delta, width);
1556
+ return __low2half(temp2);
1557
+ }
1558
+ __CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
1559
+ {
1560
+ const __half2 temp1 = __halves2half2(var, var);
1561
+ const __half2 temp2 = __shfl_down(temp1, delta, width);
1562
+ return __low2half(temp2);
1563
+ }
1564
+ __CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
1565
+ {
1566
+ const __half2 temp1 = __halves2half2(var, var);
1567
+ const __half2 temp2 = __shfl_xor(temp1, delta, width);
1568
+ return __low2half(temp2);
1569
+ }
1570
+
1571
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
1572
+
1573
+ __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width)
1574
+ {
1575
+ const __half2 temp1 = __halves2half2(var, var);
1576
+ const __half2 temp2 = __shfl_sync(mask, temp1, delta, width);
1577
+ return __low2half(temp2);
1578
+ }
1579
+ __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
1580
+ {
1581
+ const __half2 temp1 = __halves2half2(var, var);
1582
+ const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width);
1583
+ return __low2half(temp2);
1584
+ }
1585
+ __CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
1586
+ {
1587
+ const __half2 temp1 = __halves2half2(var, var);
1588
+ const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width);
1589
+ return __low2half(temp2);
1590
+ }
1591
+ __CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width)
1592
+ {
1593
+ const __half2 temp1 = __halves2half2(var, var);
1594
+ const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width);
1595
+ return __low2half(temp2);
1596
+ }
1597
+
1598
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/
1599
/******************************************************************************
*               __half and __half2 __ldg,__ldcg,__ldca,__ldcs                 *
******************************************************************************/

#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))
// Pointer operand constraint for inline PTX: 64-bit "l" on LP64/Win64/NVRTC,
// 32-bit "r" otherwise.
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
#define __LDG_PTR   "l"
#else
#define __LDG_PTR   "r"
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
// Read-only (non-coherent) cache load, ld.global.nc.
__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Cache-global load (cache at L2, not L1), ld.global.cg.
__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Cache-all load (cache at all levels), ld.global.ca.
__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Cache-streaming load (evict-first), ld.global.cs.
__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
    return ret;
}
__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
    return ret;
}
// Last-use load, ld.global.lu; "memory" clobber because the cached line may
// be invalidated after the read.
__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
// Volatile (don't-cache) load, ld.global.cv.
__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr)
{
    __half2 ret;
    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr)
{
    __half ret;
    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
    return ret;
}
// Write-back store, st.global.wb.
__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value)
{
    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
// Cache-global store, st.global.cg.
__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value)
{
    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
// Cache-streaming store, st.global.cs.
__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value)
{
    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
// Write-through store, st.global.wt.
__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value)
{
    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
}
__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
{
    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
}
#undef __LDG_PTR
#endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/
1715
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
/******************************************************************************
*                             __half2 comparison                              *
******************************************************************************/
// Per-lane comparison returning a __half2 mask; PTX set.<op>.f16x2.f16x2
// yields 1.0 in a lane where the predicate holds, 0.0 otherwise.
#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
   __half2 val; \
   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
   return val; \
} /* while(0) */
// Ordered comparisons (false if either operand is NaN).
__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.eq)
}
__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.ne)
}
__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.le)
}
__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.ge)
}
__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.lt)
}
__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.gt)
}
// Unordered comparisons (PTX ".xxu" ops: true if either operand is NaN).
__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.equ)
}
__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.neu)
}
__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.leu)
}
__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.geu)
}
__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.ltu)
}
__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
{
    __COMPARISON_OP_HALF2_MACRO(set.gtu)
}
#undef __COMPARISON_OP_HALF2_MACRO
1774
// Per-lane comparison collapsed to a single bool: true only when BOTH lanes
// compare true (the f16x2 set result equals {1.0, 1.0} == 0x3C003C00).
#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
   __half2 val; \
   bool retval; \
   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
   if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\
      retval = true; \
   } else { \
      retval = false; \
   }\
   return retval;\
} /* while(0) */
// Ordered both-lane comparisons.
__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq)
}
__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne)
}
__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.le)
}
__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge)
}
__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt)
}
__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt)
}
// Unordered both-lane comparisons.
__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ)
}
__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu)
}
__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu)
}
__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu)
}
__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu)
}
__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
{
    __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu)
}
#undef __BOOL_COMPARISON_OP_HALF2_MACRO
1835
/******************************************************************************
*                             __half comparison                               *
******************************************************************************/
// Scalar f16 comparison via a predicate register: setp sets the predicate,
// selp materializes it as 1 or 0, and the result is returned as bool.
#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
   unsigned short val; \
   asm( "{ .reg .pred __$temp3;\n" \
        "  setp." __CUDA_FP16_STRINGIFY(name) ".f16  __$temp3, %1, %2;\n" \
        "  selp.u16 %0, 1, 0, __$temp3;}" \
        : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
   return (val != 0U) ? true : false; \
} /* while(0) */
// Ordered comparisons (false if either operand is NaN).
__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(eq)
}
__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(ne)
}
__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(le)
}
__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(ge)
}
__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(lt)
}
__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(gt)
}
// Unordered comparisons (true if either operand is NaN).
__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(equ)
}
__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(neu)
}
__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(leu)
}
__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(geu)
}
__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(ltu)
}
__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b)
{
    __COMPARISON_OP_HALF_MACRO(gtu)
}
#undef __COMPARISON_OP_HALF_MACRO
1895
+ /******************************************************************************
1896
+ * __half2 arithmetic *
1897
+ ******************************************************************************/
1898
+ __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
1899
+ {
1900
+ __BINARY_OP_HALF2_MACRO(add)
1901
+ }
1902
+ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b)
1903
+ {
1904
+ __BINARY_OP_HALF2_MACRO(sub)
1905
+ }
1906
+ __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b)
1907
+ {
1908
+ __BINARY_OP_HALF2_MACRO(mul)
1909
+ }
1910
+ __CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b)
1911
+ {
1912
+ __BINARY_OP_HALF2_MACRO(add.sat)
1913
+ }
1914
+ __CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b)
1915
+ {
1916
+ __BINARY_OP_HALF2_MACRO(sub.sat)
1917
+ }
1918
+ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
1919
+ {
1920
+ __BINARY_OP_HALF2_MACRO(mul.sat)
1921
+ }
1922
+ __CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b)
1923
+ {
1924
+ __BINARY_OP_HALF2_MACRO(add.rn)
1925
+ }
1926
+ __CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b)
1927
+ {
1928
+ __BINARY_OP_HALF2_MACRO(sub.rn)
1929
+ }
1930
+ __CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b)
1931
+ {
1932
+ __BINARY_OP_HALF2_MACRO(mul.rn)
1933
+ }
1934
+ __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
1935
+ {
1936
+ __TERNARY_OP_HALF2_MACRO(fma.rn)
1937
+ }
1938
+ __CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c)
1939
+ {
1940
+ __TERNARY_OP_HALF2_MACRO(fma.rn.sat)
1941
+ }
1942
+ __CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) {
1943
+ __half ha = __low2half(a);
1944
+ __half hb = __low2half(b);
1945
+
1946
+ const __half v1 = __hdiv(ha, hb);
1947
+
1948
+ ha = __high2half(a);
1949
+ hb = __high2half(b);
1950
+
1951
+ const __half v2 = __hdiv(ha, hb);
1952
+
1953
+ return __halves2half2(v1, v2);
1954
+ }
1955
+ /******************************************************************************
1956
+ * __half arithmetic *
1957
+ ******************************************************************************/
1958
+ __CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b)
1959
+ {
1960
+ __BINARY_OP_HALF_MACRO(add)
1961
+ }
1962
+ __CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b)
1963
+ {
1964
+ __BINARY_OP_HALF_MACRO(sub)
1965
+ }
1966
+ __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b)
1967
+ {
1968
+ __BINARY_OP_HALF_MACRO(mul)
1969
+ }
1970
+ __CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b)
1971
+ {
1972
+ __BINARY_OP_HALF_MACRO(add.sat)
1973
+ }
1974
+ __CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b)
1975
+ {
1976
+ __BINARY_OP_HALF_MACRO(sub.sat)
1977
+ }
1978
+ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
1979
+ {
1980
+ __BINARY_OP_HALF_MACRO(mul.sat)
1981
+ }
1982
+ __CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b)
1983
+ {
1984
+ __BINARY_OP_HALF_MACRO(add.rn)
1985
+ }
1986
+ __CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b)
1987
+ {
1988
+ __BINARY_OP_HALF_MACRO(sub.rn)
1989
+ }
1990
+ __CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b)
1991
+ {
1992
+ __BINARY_OP_HALF_MACRO(mul.rn)
1993
+ }
1994
+ __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
1995
+ {
1996
+ __TERNARY_OP_HALF_MACRO(fma.rn)
1997
+ }
1998
+ __CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c)
1999
+ {
2000
+ __TERNARY_OP_HALF_MACRO(fma.rn.sat)
2001
+ }
2002
+ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
2003
+ __half v;
2004
+ __half abs;
2005
+ __half den;
2006
+ __HALF_TO_US(den) = 0x008FU;
2007
+
2008
+ float rcp;
2009
+ const float fa = __half2float(a);
2010
+ const float fb = __half2float(b);
2011
+
2012
+ asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb));
2013
+
2014
+ float fv = rcp * fa;
2015
+
2016
+ v = __float2half(fv);
2017
+ __HALF_TO_US(abs) = static_cast<unsigned short>(static_cast<unsigned int>(__HALF_TO_CUS(v)) & 0x00007FFFU);
2018
+ if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000U))) {
2019
+ const float err = __fmaf_rn(-fb, fv, fa);
2020
+ fv = __fmaf_rn(rcp, err, fv);
2021
+ v = __float2half(fv);
2022
+ }
2023
+ return v;
2024
+ }
2025
+
2026
+ /******************************************************************************
2027
+ * __half2 functions *
2028
+ ******************************************************************************/
2029
+ #define __SPEC_CASE2(i,r, spc, ulp) \
2030
+ "{.reg.b32 spc, ulp, p;\n"\
2031
+ " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
2032
+ " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
2033
+ " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
2034
+ " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
2035
+ #define __SPEC_CASE(i,r, spc, ulp) \
2036
+ "{.reg.b16 spc, ulp, p;\n"\
2037
+ " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
2038
+ " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
2039
+ " set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
2040
+ " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
2041
+ #define __APPROX_FCAST(fun) /* do */ {\
2042
+ __half val;\
2043
+ asm("{.reg.b32 f; \n"\
2044
+ " .reg.b16 r; \n"\
2045
+ " mov.b16 r,%1; \n"\
2046
+ " cvt.f32.f16 f,r; \n"\
2047
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\
2048
+ " cvt.rn.f16.f32 r,f; \n"\
2049
+ " mov.b16 %0,r; \n"\
2050
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
2051
+ return val;\
2052
+ } /* while(0) */
2053
+ #define __APPROX_FCAST2(fun) /* do */ {\
2054
+ __half2 val;\
2055
+ asm("{.reg.b16 hl, hu; \n"\
2056
+ " .reg.b32 fl, fu; \n"\
2057
+ " mov.b32 {hl, hu}, %1; \n"\
2058
+ " cvt.f32.f16 fl, hl; \n"\
2059
+ " cvt.f32.f16 fu, hu; \n"\
2060
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\
2061
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\
2062
+ " cvt.rn.f16.f32 hl, fl; \n"\
2063
+ " cvt.rn.f16.f32 hu, fu; \n"\
2064
+ " mov.b32 %0, {hl, hu}; \n"\
2065
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \
2066
+ return val;\
2067
+ } /* while(0) */
2068
+ static __device__ __forceinline__ float __float_simpl_sinf(float a);
2069
+ static __device__ __forceinline__ float __float_simpl_cosf(float a);
2070
+ __CUDA_FP16_DECL__ __half hsin(const __half a) {
2071
+ const float sl = __float_simpl_sinf(__half2float(a));
2072
+ __half r = __float2half_rn(sl);
2073
+ asm("{\n\t"
2074
+ " .reg.b16 i,r,t; \n\t"
2075
+ " mov.b16 r, %0; \n\t"
2076
+ " mov.b16 i, %1; \n\t"
2077
+ " and.b16 t, r, 0x8000U; \n\t"
2078
+ " abs.f16 r, r; \n\t"
2079
+ " abs.f16 i, i; \n\t"
2080
+ __SPEC_CASE(i, r, 0X32B3U, 0x0800U)
2081
+ __SPEC_CASE(i, r, 0X5CB0U, 0x9000U)
2082
+ " or.b16 r,r,t; \n\t"
2083
+ " mov.b16 %0, r; \n"
2084
+ "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2085
+ return r;
2086
+ }
2087
+ __CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) {
2088
+ const float sl = __float_simpl_sinf(__half2float(a.x));
2089
+ const float sh = __float_simpl_sinf(__half2float(a.y));
2090
+ __half2 r = __floats2half2_rn(sl, sh);
2091
+ asm("{\n\t"
2092
+ " .reg.b32 i,r,t; \n\t"
2093
+ " mov.b32 r, %0; \n\t"
2094
+ " mov.b32 i, %1; \n\t"
2095
+ " and.b32 t, r, 0x80008000U; \n\t"
2096
+ " abs.f16x2 r, r; \n\t"
2097
+ " abs.f16x2 i, i; \n\t"
2098
+ __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U)
2099
+ __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U)
2100
+ " or.b32 r, r, t; \n\t"
2101
+ " mov.b32 %0, r; \n"
2102
+ "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2103
+ return r;
2104
+ }
2105
+ __CUDA_FP16_DECL__ __half hcos(const __half a) {
2106
+ const float cl = __float_simpl_cosf(__half2float(a));
2107
+ __half r = __float2half_rn(cl);
2108
+ asm("{\n\t"
2109
+ " .reg.b16 i,r; \n\t"
2110
+ " mov.b16 r, %0; \n\t"
2111
+ " mov.b16 i, %1; \n\t"
2112
+ " abs.f16 i, i; \n\t"
2113
+ __SPEC_CASE(i, r, 0X2B7CU, 0x1000U)
2114
+ " mov.b16 %0, r; \n"
2115
+ "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2116
+ return r;
2117
+ }
2118
+ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) {
2119
+ const float cl = __float_simpl_cosf(__half2float(a.x));
2120
+ const float ch = __float_simpl_cosf(__half2float(a.y));
2121
+ __half2 r = __floats2half2_rn(cl, ch);
2122
+ asm("{\n\t"
2123
+ " .reg.b32 i,r; \n\t"
2124
+ " mov.b32 r, %0; \n\t"
2125
+ " mov.b32 i, %1; \n\t"
2126
+ " abs.f16x2 i, i; \n\t"
2127
+ __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U)
2128
+ " mov.b32 %0, r; \n"
2129
+ "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2130
+ return r;
2131
+ }
2132
+ static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
2133
+ {
2134
+ const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F);
2135
+ const unsigned q = __float_as_uint(ar);
2136
+ const float j = __fsub_rn(ar, 12582912.0F);
2137
+ float t = __fmaf_rn(j, -1.5707962512969971e+000F, a);
2138
+ t = __fmaf_rn(j, -7.5497894158615964e-008F, t);
2139
+ *quadrant = q;
2140
+ return t;
2141
+ }
2142
+ static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
2143
+ {
2144
+ float z;
2145
+ const float x2 = x*x;
2146
+ float a8;
2147
+ float a6;
2148
+ float a4;
2149
+ float a2;
2150
+ float a1;
2151
+ float a0;
2152
+
2153
+ if ((i & 1U) != 0U) {
2154
+ // cos
2155
+ a8 = 2.44331571e-5F;
2156
+ a6 = -1.38873163e-3F;
2157
+ a4 = 4.16666457e-2F;
2158
+ a2 = -5.00000000e-1F;
2159
+ a1 = x2;
2160
+ a0 = 1.0F;
2161
+ }
2162
+ else {
2163
+ // sin
2164
+ a8 = -1.95152959e-4F;
2165
+ a6 = 8.33216087e-3F;
2166
+ a4 = -1.66666546e-1F;
2167
+ a2 = 0.0F;
2168
+ a1 = x;
2169
+ a0 = x;
2170
+ }
2171
+
2172
+ z = __fmaf_rn(a8, x2, a6);
2173
+ z = __fmaf_rn(z, x2, a4);
2174
+ z = __fmaf_rn(z, x2, a2);
2175
+ z = __fmaf_rn(z, a1, a0);
2176
+
2177
+ if ((i & 2U) != 0U) {
2178
+ z = -z;
2179
+ }
2180
+ return z;
2181
+ }
2182
+ static __device__ __forceinline__ float __float_simpl_sinf(float a)
2183
+ {
2184
+ float z;
2185
+ unsigned i;
2186
+ a = __internal_trig_reduction_kernel(a, &i);
2187
+ z = __internal_sin_cos_kernel(a, i);
2188
+ return z;
2189
+ }
2190
+ static __device__ __forceinline__ float __float_simpl_cosf(float a)
2191
+ {
2192
+ float z;
2193
+ unsigned i;
2194
+ a = __internal_trig_reduction_kernel(a, &i);
2195
+ z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U);
2196
+ return z;
2197
+ }
2198
+
2199
+ __CUDA_FP16_DECL__ __half hexp(const __half a) {
2200
+ __half val;
2201
+ asm("{.reg.b32 f, C, nZ; \n"
2202
+ " .reg.b16 h,r; \n"
2203
+ " mov.b16 h,%1; \n"
2204
+ " cvt.f32.f16 f,h; \n"
2205
+ " mov.b32 C, 0x3fb8aa3bU; \n"
2206
+ " mov.b32 nZ, 0x80000000U;\n"
2207
+ " fma.rn.f32 f,f,C,nZ; \n"
2208
+ " ex2.approx.ftz.f32 f,f; \n"
2209
+ " cvt.rn.f16.f32 r,f; \n"
2210
+ __SPEC_CASE(h, r, 0X1F79U, 0x9400U)
2211
+ __SPEC_CASE(h, r, 0X25CFU, 0x9400U)
2212
+ __SPEC_CASE(h, r, 0XC13BU, 0x0400U)
2213
+ __SPEC_CASE(h, r, 0XC1EFU, 0x0200U)
2214
+ " mov.b16 %0,r; \n"
2215
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2216
+ return val;
2217
+ }
2218
+ __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) {
2219
+ __half2 val;
2220
+ asm("{.reg.b16 hl, hu; \n"
2221
+ " .reg.b32 h,r,fl,fu,C,nZ; \n"
2222
+ " mov.b32 {hl, hu}, %1; \n"
2223
+ " mov.b32 h, %1; \n"
2224
+ " cvt.f32.f16 fl, hl; \n"
2225
+ " cvt.f32.f16 fu, hu; \n"
2226
+ " mov.b32 C, 0x3fb8aa3bU; \n"
2227
+ " mov.b32 nZ, 0x80000000U;\n"
2228
+ " fma.rn.f32 fl,fl,C,nZ; \n"
2229
+ " fma.rn.f32 fu,fu,C,nZ; \n"
2230
+ " ex2.approx.ftz.f32 fl, fl; \n"
2231
+ " ex2.approx.ftz.f32 fu, fu; \n"
2232
+ " cvt.rn.f16.f32 hl, fl; \n"
2233
+ " cvt.rn.f16.f32 hu, fu; \n"
2234
+ " mov.b32 r, {hl, hu}; \n"
2235
+ __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U)
2236
+ __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U)
2237
+ __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U)
2238
+ __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U)
2239
+ " mov.b32 %0, r; \n"
2240
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2241
+ return val;
2242
+ }
2243
+ __CUDA_FP16_DECL__ __half hexp2(const __half a) {
2244
+ __half val;
2245
+ asm("{.reg.b32 f, ULP; \n"
2246
+ " .reg.b16 r; \n"
2247
+ " mov.b16 r,%1; \n"
2248
+ " cvt.f32.f16 f,r; \n"
2249
+ " ex2.approx.ftz.f32 f,f; \n"
2250
+ " mov.b32 ULP, 0x33800000U;\n"
2251
+ " fma.rn.f32 f,f,ULP,f; \n"
2252
+ " cvt.rn.f16.f32 r,f; \n"
2253
+ " mov.b16 %0,r; \n"
2254
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2255
+ return val;
2256
+ }
2257
+ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
2258
+ __half2 val;
2259
+ asm("{.reg.b16 hl, hu; \n"
2260
+ " .reg.b32 fl, fu, ULP; \n"
2261
+ " mov.b32 {hl, hu}, %1; \n"
2262
+ " cvt.f32.f16 fl, hl; \n"
2263
+ " cvt.f32.f16 fu, hu; \n"
2264
+ " ex2.approx.ftz.f32 fl, fl; \n"
2265
+ " ex2.approx.ftz.f32 fu, fu; \n"
2266
+ " mov.b32 ULP, 0x33800000U;\n"
2267
+ " fma.rn.f32 fl,fl,ULP,fl; \n"
2268
+ " fma.rn.f32 fu,fu,ULP,fu; \n"
2269
+ " cvt.rn.f16.f32 hl, fl; \n"
2270
+ " cvt.rn.f16.f32 hu, fu; \n"
2271
+ " mov.b32 %0, {hl, hu}; \n"
2272
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2273
+ return val;
2274
+ }
2275
+ __CUDA_FP16_DECL__ __half hexp10(const __half a) {
2276
+ __half val;
2277
+ asm("{.reg.b16 h,r; \n"
2278
+ " .reg.b32 f, C, nZ; \n"
2279
+ " mov.b16 h, %1; \n"
2280
+ " cvt.f32.f16 f, h; \n"
2281
+ " mov.b32 C, 0x40549A78U; \n"
2282
+ " mov.b32 nZ, 0x80000000U;\n"
2283
+ " fma.rn.f32 f,f,C,nZ; \n"
2284
+ " ex2.approx.ftz.f32 f, f; \n"
2285
+ " cvt.rn.f16.f32 r, f; \n"
2286
+ __SPEC_CASE(h, r, 0x34DEU, 0x9800U)
2287
+ __SPEC_CASE(h, r, 0x9766U, 0x9000U)
2288
+ __SPEC_CASE(h, r, 0x9972U, 0x1000U)
2289
+ __SPEC_CASE(h, r, 0xA5C4U, 0x1000U)
2290
+ __SPEC_CASE(h, r, 0xBF0AU, 0x8100U)
2291
+ " mov.b16 %0, r; \n"
2292
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2293
+ return val;
2294
+ }
2295
+ __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) {
2296
+ __half2 val;
2297
+ asm("{.reg.b16 hl, hu; \n"
2298
+ " .reg.b32 h,r,fl,fu,C,nZ; \n"
2299
+ " mov.b32 {hl, hu}, %1; \n"
2300
+ " mov.b32 h, %1; \n"
2301
+ " cvt.f32.f16 fl, hl; \n"
2302
+ " cvt.f32.f16 fu, hu; \n"
2303
+ " mov.b32 C, 0x40549A78U; \n"
2304
+ " mov.b32 nZ, 0x80000000U;\n"
2305
+ " fma.rn.f32 fl,fl,C,nZ; \n"
2306
+ " fma.rn.f32 fu,fu,C,nZ; \n"
2307
+ " ex2.approx.ftz.f32 fl, fl; \n"
2308
+ " ex2.approx.ftz.f32 fu, fu; \n"
2309
+ " cvt.rn.f16.f32 hl, fl; \n"
2310
+ " cvt.rn.f16.f32 hu, fu; \n"
2311
+ " mov.b32 r, {hl, hu}; \n"
2312
+ __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U)
2313
+ __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U)
2314
+ __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U)
2315
+ __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U)
2316
+ __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U)
2317
+ " mov.b32 %0, r; \n"
2318
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2319
+ return val;
2320
+ }
2321
+ __CUDA_FP16_DECL__ __half hlog2(const __half a) {
2322
+ __half val;
2323
+ asm("{.reg.b16 h, r; \n"
2324
+ " .reg.b32 f; \n"
2325
+ " mov.b16 h, %1; \n"
2326
+ " cvt.f32.f16 f, h; \n"
2327
+ " lg2.approx.ftz.f32 f, f; \n"
2328
+ " cvt.rn.f16.f32 r, f; \n"
2329
+ __SPEC_CASE(r, r, 0xA2E2U, 0x8080U)
2330
+ __SPEC_CASE(r, r, 0xBF46U, 0x9400U)
2331
+ " mov.b16 %0, r; \n"
2332
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2333
+ return val;
2334
+ }
2335
+ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) {
2336
+ __half2 val;
2337
+ asm("{.reg.b16 hl, hu; \n"
2338
+ " .reg.b32 fl, fu, r, p; \n"
2339
+ " mov.b32 {hl, hu}, %1; \n"
2340
+ " cvt.f32.f16 fl, hl; \n"
2341
+ " cvt.f32.f16 fu, hu; \n"
2342
+ " lg2.approx.ftz.f32 fl, fl; \n"
2343
+ " lg2.approx.ftz.f32 fu, fu; \n"
2344
+ " cvt.rn.f16.f32 hl, fl; \n"
2345
+ " cvt.rn.f16.f32 hu, fu; \n"
2346
+ " mov.b32 r, {hl, hu}; \n"
2347
+ __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U)
2348
+ __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U)
2349
+ " mov.b32 %0, r; \n"
2350
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2351
+ return val;
2352
+ }
2353
+ __CUDA_FP16_DECL__ __half hlog(const __half a) {
2354
+ __half val;
2355
+ asm("{.reg.b32 f, C; \n"
2356
+ " .reg.b16 r,h; \n"
2357
+ " mov.b16 h,%1; \n"
2358
+ " cvt.f32.f16 f,h; \n"
2359
+ " lg2.approx.ftz.f32 f,f; \n"
2360
+ " mov.b32 C, 0x3f317218U; \n"
2361
+ " mul.f32 f,f,C; \n"
2362
+ " cvt.rn.f16.f32 r,f; \n"
2363
+ __SPEC_CASE(h, r, 0X160DU, 0x9C00U)
2364
+ __SPEC_CASE(h, r, 0X3BFEU, 0x8010U)
2365
+ __SPEC_CASE(h, r, 0X3C0BU, 0x8080U)
2366
+ __SPEC_CASE(h, r, 0X6051U, 0x1C00U)
2367
+ " mov.b16 %0,r; \n"
2368
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2369
+ return val;
2370
+ }
2371
+ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a) {
2372
+ __half2 val;
2373
+ asm("{.reg.b16 hl, hu; \n"
2374
+ " .reg.b32 r, fl, fu, C, h; \n"
2375
+ " mov.b32 {hl, hu}, %1; \n"
2376
+ " mov.b32 h, %1; \n"
2377
+ " cvt.f32.f16 fl, hl; \n"
2378
+ " cvt.f32.f16 fu, hu; \n"
2379
+ " lg2.approx.ftz.f32 fl, fl; \n"
2380
+ " lg2.approx.ftz.f32 fu, fu; \n"
2381
+ " mov.b32 C, 0x3f317218U; \n"
2382
+ " mul.f32 fl,fl,C; \n"
2383
+ " mul.f32 fu,fu,C; \n"
2384
+ " cvt.rn.f16.f32 hl, fl; \n"
2385
+ " cvt.rn.f16.f32 hu, fu; \n"
2386
+ " mov.b32 r, {hl, hu}; \n"
2387
+ __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U)
2388
+ __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U)
2389
+ __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U)
2390
+ __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U)
2391
+ " mov.b32 %0, r; \n"
2392
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2393
+ return val;
2394
+ }
2395
+ __CUDA_FP16_DECL__ __half hlog10(const __half a) {
2396
+ __half val;
2397
+ asm("{.reg.b16 h, r; \n"
2398
+ " .reg.b32 f, C; \n"
2399
+ " mov.b16 h, %1; \n"
2400
+ " cvt.f32.f16 f, h; \n"
2401
+ " lg2.approx.ftz.f32 f, f; \n"
2402
+ " mov.b32 C, 0x3E9A209BU; \n"
2403
+ " mul.f32 f,f,C; \n"
2404
+ " cvt.rn.f16.f32 r, f; \n"
2405
+ __SPEC_CASE(h, r, 0x338FU, 0x1000U)
2406
+ __SPEC_CASE(h, r, 0x33F8U, 0x9000U)
2407
+ __SPEC_CASE(h, r, 0x57E1U, 0x9800U)
2408
+ __SPEC_CASE(h, r, 0x719DU, 0x9C00U)
2409
+ " mov.b16 %0, r; \n"
2410
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2411
+ return val;
2412
+ }
2413
+ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) {
2414
+ __half2 val;
2415
+ asm("{.reg.b16 hl, hu; \n"
2416
+ " .reg.b32 r, fl, fu, C, h; \n"
2417
+ " mov.b32 {hl, hu}, %1; \n"
2418
+ " mov.b32 h, %1; \n"
2419
+ " cvt.f32.f16 fl, hl; \n"
2420
+ " cvt.f32.f16 fu, hu; \n"
2421
+ " lg2.approx.ftz.f32 fl, fl; \n"
2422
+ " lg2.approx.ftz.f32 fu, fu; \n"
2423
+ " mov.b32 C, 0x3E9A209BU; \n"
2424
+ " mul.f32 fl,fl,C; \n"
2425
+ " mul.f32 fu,fu,C; \n"
2426
+ " cvt.rn.f16.f32 hl, fl; \n"
2427
+ " cvt.rn.f16.f32 hu, fu; \n"
2428
+ " mov.b32 r, {hl, hu}; \n"
2429
+ __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U)
2430
+ __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U)
2431
+ __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U)
2432
+ __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U)
2433
+ " mov.b32 %0, r; \n"
2434
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2435
+ return val;
2436
+ }
2437
+ #undef __SPEC_CASE2
2438
+ #undef __SPEC_CASE
2439
+ __CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) {
2440
+ __APPROX_FCAST2(rcp)
2441
+ }
2442
+ __CUDA_FP16_DECL__ __half hrcp(const __half a) {
2443
+ __APPROX_FCAST(rcp)
2444
+ }
2445
+ __CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) {
2446
+ __APPROX_FCAST2(rsqrt)
2447
+ }
2448
+ __CUDA_FP16_DECL__ __half hrsqrt(const __half a) {
2449
+ __APPROX_FCAST(rsqrt)
2450
+ }
2451
+ __CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) {
2452
+ __APPROX_FCAST2(sqrt)
2453
+ }
2454
+ __CUDA_FP16_DECL__ __half hsqrt(const __half a) {
2455
+ __APPROX_FCAST(sqrt)
2456
+ }
2457
+ #undef __APPROX_FCAST
2458
+ #undef __APPROX_FCAST2
2459
+ __CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a)
2460
+ {
2461
+ __half2 r;
2462
+ asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}"
2463
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a)));
2464
+ return r;
2465
+ }
2466
+ __CUDA_FP16_DECL__ bool __hisnan(const __half a)
2467
+ {
2468
+ __half r;
2469
+ asm("{set.nan.f16.f16 %0,%1,%2;\n}"
2470
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a)));
2471
+ return __HALF_TO_CUS(r) != 0U;
2472
+ }
2473
+ __CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a)
2474
+ {
2475
+ __half2 r;
2476
+ asm("{neg.f16x2 %0,%1;\n}"
2477
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2478
+ return r;
2479
+ }
2480
+ __CUDA_FP16_DECL__ __half __hneg(const __half a)
2481
+ {
2482
+ __half r;
2483
+ asm("{neg.f16 %0,%1;\n}"
2484
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2485
+ return r;
2486
+ }
2487
+ __CUDA_FP16_DECL__ __half2 __habs2(const __half2 a)
2488
+ {
2489
+ __half2 r;
2490
+ asm("{abs.f16x2 %0,%1;\n}"
2491
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2492
+ return r;
2493
+ }
2494
+ __CUDA_FP16_DECL__ __half __habs(const __half a)
2495
+ {
2496
+ __half r;
2497
+ asm("{abs.f16 %0,%1;\n}"
2498
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2499
+ return r;
2500
+ }
2501
+
2502
+ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c)
2503
+ {
2504
+ // fast version of complex multiply-accumulate
2505
+ // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
2506
+ // acc.re = (c.re + a.re*b.re) - a.im*b.im
2507
+ // acc.im = (c.im + a.re*b.im) + a.im*b.re
2508
+ __half real_tmp = __hfma(a.x, b.x, c.x);
2509
+ __half img_tmp = __hfma(a.x, b.y, c.y);
2510
+ real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
2511
+ img_tmp = __hfma(a.y, b.x, img_tmp);
2512
+ return make_half2(real_tmp, img_tmp);
2513
+ }
2514
+
2515
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
2516
+
2517
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
2518
+ __CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
2519
+ {
2520
+ __BINARY_OP_HALF_MACRO(max.NaN)
2521
+ }
2522
+ __CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b)
2523
+ {
2524
+ __BINARY_OP_HALF_MACRO(min.NaN)
2525
+ }
2526
+ __CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
2527
+ {
2528
+ __TERNARY_OP_HALF_MACRO(fma.rn.relu)
2529
+ }
2530
+
2531
+ __CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
2532
+ {
2533
+ __BINARY_OP_HALF2_MACRO(max.NaN)
2534
+ }
2535
+ __CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b)
2536
+ {
2537
+ __BINARY_OP_HALF2_MACRO(min.NaN)
2538
+ }
2539
+ __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c)
2540
+ {
2541
+ __TERNARY_OP_HALF2_MACRO(fma.rn.relu)
2542
+ }
2543
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/
2544
+
2545
+ /* Define __PTR for atomicAdd prototypes below, undef after done */
2546
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
2547
+ #define __PTR "l"
2548
+ #else
2549
+ #define __PTR "r"
2550
+ #endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
2551
+
2552
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
2553
+
2554
+ __CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) {
2555
+ __half2 r;
2556
+ asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n"
2557
+ : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val))
2558
+ : "memory");
2559
+ return r;
2560
+ }
2561
+
2562
+ #endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/
2563
+
2564
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
2565
+
2566
+ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) {
2567
+ __half r;
2568
+ asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
2569
+ : "=h"(__HALF_TO_US(r))
2570
+ : __PTR(address), "h"(__HALF_TO_CUS(val))
2571
+ : "memory");
2572
+ return r;
2573
+ }
2574
+
2575
+ #endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/
2576
+
2577
+ #undef __PTR
2578
+
2579
+ #undef __CUDA_FP16_DECL__
2580
+ #endif /* defined(__CUDACC__) */
2581
+ #endif /* defined(__cplusplus) */
2582
+
2583
+ #undef __TERNARY_OP_HALF2_MACRO
2584
+ #undef __TERNARY_OP_HALF_MACRO
2585
+ #undef __BINARY_OP_HALF2_MACRO
2586
+ #undef __BINARY_OP_HALF_MACRO
2587
+
2588
+ #undef __CUDA_HOSTDEVICE_FP16_DECL__
2589
+ #undef __CUDA_FP16_DECL__
2590
+
2591
+ #undef __HALF_TO_US
2592
+ #undef __HALF_TO_CUS
2593
+ #undef __HALF2_TO_UI
2594
+ #undef __HALF2_TO_CUI
2595
+
2596
+ /* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
2597
+ /* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
2598
+ #if defined(__cplusplus) && !defined(CUDA_NO_HALF)
2599
+ typedef __half half;
2600
+ typedef __half2 half2;
2601
+ // for consistency with __nv_bfloat16
2602
+ typedef __half __nv_half;
2603
+ typedef __half2 __nv_half2;
2604
+ typedef __half_raw __nv_half_raw;
2605
+ typedef __half2_raw __nv_half2_raw;
2606
+ typedef __half nv_half;
2607
+ typedef __half2 nv_half2;
2608
+ #endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */
2609
+
2610
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
2611
+ #undef __CPP_VERSION_AT_LEAST_11_FP16
2612
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
2613
+
2614
+ #endif /* end of include guard: __CUDA_FP16_HPP__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_fp8.h ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef __CUDA_FP8_H__
51
+ #define __CUDA_FP8_H__
52
+
53
+ /* Set up function decorations */
54
+ #if defined(__CUDACC__)
55
+ #define __CUDA_FP8_DECL__ static __device__ __inline__
56
+ #define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
57
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
58
+ #else /* !defined(__CUDACC__) */
59
+ #if defined(__GNUC__)
60
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
61
+ #else
62
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static
63
+ #endif /* defined(__GNUC__) */
64
+ #define __CUDA_HOSTDEVICE_FP8__
65
+ #endif /* defined(__CUDACC_) */
66
+
67
+ #if !defined(_MSC_VER) && __cplusplus >= 201103L
68
+ #define __CPP_VERSION_AT_LEAST_11_FP8
69
+ #elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
70
+ #define __CPP_VERSION_AT_LEAST_11_FP8
71
+ #endif
72
+
73
+ /* bring in __half_raw data type */
74
+ #include "cuda_fp16.h"
75
+ /* bring in __nv_bfloat16_raw data type */
76
+ #include "cuda_bf16.h"
77
+ /* bring in float2, double4, etc vector types */
78
+ #include "vector_types.h"
79
+
80
+ /**
81
+ * \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
82
+ * This section describes fp8 intrinsic functions.
83
+ * To use these functions, include the header file \p cuda_fp8.h in your
84
+ * program.
85
+ */
86
+
87
+ /**
88
+ * \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
89
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
90
+ * To use these functions, include the header file \p cuda_fp8.h in your
91
+ * program.
92
+ */
93
+
94
+ /**
95
+ * \ingroup CUDA_MATH_FP8_MISC
96
+ * \brief 8-bit \p unsigned \p integer
97
+ * type abstraction used to for \p fp8 floating-point
98
+ * numbers storage.
99
+ */
100
+ typedef unsigned char __nv_fp8_storage_t;
101
+
102
+ /**
103
+ * \ingroup CUDA_MATH_FP8_MISC
104
+ * \brief 16-bit \p unsigned \p integer
105
+ * type abstraction used to for storage of pairs of
106
+ * \p fp8 floating-point numbers.
107
+ */
108
+ typedef unsigned short int __nv_fp8x2_storage_t;
109
+
110
+ /**
111
+ * \ingroup CUDA_MATH_FP8_MISC
112
+ * \brief 32-bit \p unsigned \p integer
113
+ * type abstraction used to for storage of tetrads of
114
+ * \p fp8 floating-point numbers.
115
+ */
116
+ typedef unsigned int __nv_fp8x4_storage_t;
117
+
118
+ /**
119
+ * \ingroup CUDA_MATH_FP8_MISC
120
+ * \brief Enumerates the modes applicable when
121
+ * performing a narrowing conversion to \p fp8 destination types.
122
+ */
123
+ typedef enum __nv_saturation_t {
124
+ /**
125
+ * Means no saturation to finite is performed when conversion
126
+ * results in rounding values outside the range of destination
127
+ * type.
128
+ * NOTE: for fp8 type of e4m3 kind, the results that are larger
129
+ * than the maximum representable finite number of the target
130
+ * format become NaN.
131
+ */
132
+ __NV_NOSAT,
133
+ /**
134
+ * Means input larger than the maximum representable
135
+ * finite number MAXNORM of the target format round to the
136
+ * MAXNORM of the same sign as input.
137
+ */
138
+ __NV_SATFINITE,
139
+ } __nv_saturation_t;
140
+
141
+ /**
142
+ * \ingroup CUDA_MATH_FP8_MISC
143
+ * \brief Enumerates the possible
144
+ * interpretations of the 8-bit values when referring to them as
145
+ * \p fp8 types.
146
+ */
147
+ typedef enum __nv_fp8_interpretation_t {
148
+ __NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
149
+ __NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
150
+ } __nv_fp8_interpretation_t;
151
+
152
+ /* Forward-declaration of C-style APIs */
153
+
154
+ /**
155
+ * \ingroup CUDA_MATH_FP8_MISC
156
+ * \brief Converts input \p double precision \p x to \p fp8 type of the
157
+ * requested kind using round-to-nearest-even rounding and requested saturation
158
+ * mode.
159
+ *
160
+ * \details Converts input \p x to \p fp8 type of the kind specified by
161
+ * \p fp8_interpretation parameter,
162
+ * using round-to-nearest-even rounding and
163
+ * saturation mode specified by \p saturate parameter.
164
+ *
165
+ * \returns
166
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
167
+ */
168
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
169
+ __nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
170
+ const __nv_fp8_interpretation_t fp8_interpretation);
171
+
172
+ /**
173
+ * \ingroup CUDA_MATH_FP8_MISC
174
+ * \brief Converts input vector of two \p double precision numbers packed
175
+ * in \p double2 \p x into a vector of two values of \p fp8 type of
176
+ * the requested kind using round-to-nearest-even rounding and requested
177
+ * saturation mode.
178
+ *
179
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
180
+ * kind specified by \p fp8_interpretation parameter, using
181
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
182
+ * parameter.
183
+ *
184
+ * \returns
185
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
186
+ */
187
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
188
+ __nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
189
+ const __nv_fp8_interpretation_t fp8_interpretation);
190
+
191
+ /**
192
+ * \ingroup CUDA_MATH_FP8_MISC
193
+ * \brief Converts input \p single precision \p x to \p fp8 type of the
194
+ * requested kind using round-to-nearest-even rounding and requested saturation
195
+ * mode.
196
+ *
197
+ * \details Converts input \p x to \p fp8 type of the kind specified by
198
+ * \p fp8_interpretation parameter,
199
+ * using round-to-nearest-even rounding and
200
+ * saturation mode specified by \p saturate parameter.
201
+ *
202
+ * \returns
203
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
204
+ */
205
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
206
+ __nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
207
+ const __nv_fp8_interpretation_t fp8_interpretation);
208
+
209
+ /**
210
+ * \ingroup CUDA_MATH_FP8_MISC
211
+ * \brief Converts input vector of two \p single precision numbers packed
212
+ * in \p float2 \p x into a vector of two values of \p fp8 type of
213
+ * the requested kind using round-to-nearest-even rounding and requested
214
+ * saturation mode.
215
+ *
216
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
217
+ * kind specified by \p fp8_interpretation parameter, using
218
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
219
+ * parameter.
220
+ *
221
+ * \returns
222
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
223
+ */
224
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
225
+ __nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
226
+ const __nv_fp8_interpretation_t fp8_interpretation);
227
+
228
+ /**
229
+ * \ingroup CUDA_MATH_FP8_MISC
230
+ * \brief Converts input \p half precision \p x to \p fp8 type of the requested
231
+ * kind using round-to-nearest-even rounding and requested saturation mode.
232
+ *
233
+ * \details Converts input \p x to \p fp8 type of the kind specified by
234
+ * \p fp8_interpretation parameter,
235
+ * using round-to-nearest-even rounding and
236
+ * saturation mode specified by \p saturate parameter.
237
+ *
238
+ * \returns
239
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
240
+ */
241
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
242
+ __nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
243
+ const __nv_fp8_interpretation_t fp8_interpretation);
244
+
245
+ /**
246
+ * \ingroup CUDA_MATH_FP8_MISC
247
+ * \brief Converts input vector of two \p half precision numbers packed
248
+ * in \p __half2_raw \p x into a vector of two values of \p fp8 type of
249
+ * the requested kind using round-to-nearest-even rounding and requested
250
+ * saturation mode.
251
+ *
252
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
253
+ * kind specified by \p fp8_interpretation parameter, using
254
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
255
+ * parameter.
256
+ *
257
+ * \returns
258
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
259
+ */
260
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
261
+ const __half2_raw x, const __nv_saturation_t saturate,
262
+ const __nv_fp8_interpretation_t fp8_interpretation);
263
+
264
+ /**
265
+ * \ingroup CUDA_MATH_FP8_MISC
266
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
267
+ * requested kind using round-to-nearest-even rounding and requested saturation
268
+ * mode.
269
+ *
270
+ * \details Converts input \p x to \p fp8 type of the kind specified by
271
+ * \p fp8_interpretation parameter,
272
+ * using round-to-nearest-even rounding and
273
+ * saturation mode specified by \p saturate parameter.
274
+ *
275
+ * \returns
276
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
277
+ */
278
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
279
+ const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
280
+ const __nv_fp8_interpretation_t fp8_interpretation);
281
+
282
+ /**
283
+ * \ingroup CUDA_MATH_FP8_MISC
284
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
285
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
286
+ * the requested kind using round-to-nearest-even rounding and requested
287
+ * saturation mode.
288
+ *
289
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
290
+ * kind specified by \p fp8_interpretation parameter, using
291
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
292
+ * parameter.
293
+ *
294
+ * \returns
295
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
296
+ */
297
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
298
+ __nv_cvt_bfloat16raw2_to_fp8x2(
299
+ const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
300
+ const __nv_fp8_interpretation_t fp8_interpretation);
301
+
302
+ /**
303
+ * \ingroup CUDA_MATH_FP8_MISC
304
+ * \brief Converts input \p fp8 \p x of the specified kind
305
+ * to \p half precision.
306
+ *
307
+ * \details Converts input \p x of \p fp8 type of the kind specified by
308
+ * \p fp8_interpretation parameter
309
+ * to \p half precision.
310
+ *
311
+ * \returns
312
+ * - The \p __half_raw value holds the result of conversion.
313
+ */
314
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
315
+ __nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
316
+ const __nv_fp8_interpretation_t fp8_interpretation);
317
+ /**
318
+ * \ingroup CUDA_MATH_FP8_MISC
319
+ * \brief Converts input vector of two \p fp8 values of the specified kind
320
+ * to a vector of two \p half precision values packed in \p __half2_raw
321
+ * structure.
322
+ *
323
+ * \details Converts input vector \p x of \p fp8 type of the kind specified by
324
+ * \p fp8_interpretation parameter
325
+ * to a vector of two \p half precision values and returns as \p __half2_raw
326
+ * structure.
327
+ *
328
+ * \returns
329
+ * - The \p __half2_raw value holds the result of conversion.
330
+ */
331
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
332
+ __nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
333
+ const __nv_fp8_interpretation_t fp8_interpretation);
334
+
335
+ #if defined(__cplusplus)
336
+
337
+ #define __CUDA_FP8_TYPES_EXIST__
338
+
339
+ /* Forward-declaration of structures defined in "cuda_fp8.hpp" */
340
+ struct __nv_fp8_e5m2;
341
+ struct __nv_fp8x2_e5m2;
342
+ struct __nv_fp8x4_e5m2;
343
+
344
+ struct __nv_fp8_e4m3;
345
+ struct __nv_fp8x2_e4m3;
346
+ struct __nv_fp8x4_e4m3;
347
+
348
+ #endif /* defined(__cplusplus) */
349
+
350
+ #include "cuda_fp8.hpp"
351
+
352
+ #undef __CUDA_FP8_DECL__
353
+ #undef __CUDA_HOSTDEVICE_FP8__
354
+ #undef __CUDA_HOSTDEVICE_FP8_DECL__
355
+
356
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
357
+ #undef __CPP_VERSION_AT_LEAST_11_FP8
358
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
359
+
360
+ #endif /* end of include guard: __CUDA_FP8_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_gl_interop.h ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_GL_INTEROP_H__)
51
+ #define __CUDA_GL_INTEROP_H__
52
+
53
+ #include "cuda_runtime_api.h"
54
+
55
+ #if defined(__APPLE__)
56
+
57
+ #include <OpenGL/gl.h>
58
+
59
+ #else /* __APPLE__ */
60
+
61
+ #if defined(__arm__) || defined(__aarch64__)
62
+ #ifndef GL_VERSION
63
+ #error Please include the appropriate gl headers before including cuda_gl_interop.h
64
+ #endif
65
+ #else
66
+ #include <GL/gl.h>
67
+ #endif
68
+
69
+ #endif /* __APPLE__ */
70
+
71
+ /** \cond impl_private */
72
+ #if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
73
+ #define __CUDA_DEPRECATED
74
+ #elif defined(_MSC_VER)
75
+ #define __CUDA_DEPRECATED __declspec(deprecated)
76
+ #elif defined(__GNUC__)
77
+ #define __CUDA_DEPRECATED __attribute__((deprecated))
78
+ #else
79
+ #define __CUDA_DEPRECATED
80
+ #endif
81
+ /** \endcond impl_private */
82
+
83
+ #if defined(__cplusplus)
84
+ extern "C" {
85
+ #endif /* __cplusplus */
86
+
87
+ /**
88
+ * \addtogroup CUDART_OPENGL OpenGL Interoperability
89
+ * This section describes the OpenGL interoperability functions of the CUDA
90
+ * runtime application programming interface. Note that mapping of OpenGL
91
+ * resources is performed with the graphics API agnostic, resource mapping
92
+ * interface described in \ref CUDART_INTEROP "Graphics Interopability".
93
+ *
94
+ * @{
95
+ */
96
+
97
+ /**
98
+ * CUDA devices corresponding to the current OpenGL context
99
+ */
100
+ enum cudaGLDeviceList
101
+ {
102
+ cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
103
+ cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
104
+ cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
105
+ };
106
+
107
+ /**
108
+ * \brief Gets the CUDA devices associated with the current OpenGL context
109
+ *
110
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
111
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
112
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
113
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
114
+ * context are not CUDA capable then the call will return ::cudaErrorNoDevice.
115
+ *
116
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the
117
+ * current OpenGL context
118
+ * \param pCudaDevices - Returned CUDA devices corresponding to the current
119
+ * OpenGL context
120
+ * \param cudaDeviceCount - The size of the output device array \p pCudaDevices
121
+ * \param deviceList - The set of devices to return. This set may be
122
+ * ::cudaGLDeviceListAll for all devices,
123
+ * ::cudaGLDeviceListCurrentFrame for the devices used to
124
+ * render the current frame (in SLI), or
125
+ * ::cudaGLDeviceListNextFrame for the devices used to
126
+ * render the next frame (in SLI).
127
+ *
128
+ * \return
129
+ * ::cudaSuccess,
130
+ * ::cudaErrorNoDevice,
131
+ * ::cudaErrorInvalidGraphicsContext,
132
+ * ::cudaErrorUnknown
133
+ *
134
+ * \note This function is not supported on Mac OS X.
135
+ * \notefnerr
136
+ *
137
+ * \sa
138
+ * ::cudaGraphicsUnregisterResource,
139
+ * ::cudaGraphicsMapResources,
140
+ * ::cudaGraphicsSubResourceGetMappedArray,
141
+ * ::cudaGraphicsResourceGetMappedPointer,
142
+ * ::cuGLGetDevices
143
+ */
144
+ extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
145
+
146
+ /**
147
+ * \brief Register an OpenGL texture or renderbuffer object
148
+ *
149
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
150
+ * A handle to the registered object is returned as \p resource.
151
+ *
152
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
153
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
154
+ * or ::GL_RENDERBUFFER.
155
+ *
156
+ * The register flags \p flags specify the intended usage, as follows:
157
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
158
+ * resource will be used. It is therefore assumed that this resource will be
159
+ * read from and written to by CUDA. This is the default value.
160
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
161
+ * will not write to this resource.
162
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
163
+ * CUDA will not read from this resource and will write over the
164
+ * entire contents of the resource, so none of the data previously
165
+ * stored in the resource will be preserved.
166
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
167
+ * bind this resource to a surface reference.
168
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
169
+ * texture gather operations on this resource.
170
+ *
171
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
172
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
173
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
174
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
175
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
176
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
177
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
178
+ *
179
+ * The following image classes are currently disallowed:
180
+ * - Textures with borders
181
+ * - Multisampled renderbuffers
182
+ *
183
+ * \param resource - Pointer to the returned object handle
184
+ * \param image - name of texture or renderbuffer object to be registered
185
+ * \param target - Identifies the type of object specified by \p image
186
+ * \param flags - Register flags
187
+ *
188
+ * \return
189
+ * ::cudaSuccess,
190
+ * ::cudaErrorInvalidDevice,
191
+ * ::cudaErrorInvalidValue,
192
+ * ::cudaErrorInvalidResourceHandle,
193
+ * ::cudaErrorUnknown
194
+ * \notefnerr
195
+ *
196
+ * \sa
197
+ * ::cudaGraphicsUnregisterResource,
198
+ * ::cudaGraphicsMapResources,
199
+ * ::cudaGraphicsSubResourceGetMappedArray,
200
+ * ::cuGraphicsGLRegisterImage
201
+ */
202
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
203
+
204
+ /**
205
+ * \brief Registers an OpenGL buffer object
206
+ *
207
+ * Registers the buffer object specified by \p buffer for access by
208
+ * CUDA. A handle to the registered object is returned as \p
209
+ * resource. The register flags \p flags specify the intended usage,
210
+ * as follows:
211
+ *
212
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
213
+ * resource will be used. It is therefore assumed that this resource will be
214
+ * read from and written to by CUDA. This is the default value.
215
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
216
+ * will not write to this resource.
217
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
218
+ * CUDA will not read from this resource and will write over the
219
+ * entire contents of the resource, so none of the data previously
220
+ * stored in the resource will be preserved.
221
+ *
222
+ * \param resource - Pointer to the returned object handle
223
+ * \param buffer - name of buffer object to be registered
224
+ * \param flags - Register flags
225
+ *
226
+ * \return
227
+ * ::cudaSuccess,
228
+ * ::cudaErrorInvalidDevice,
229
+ * ::cudaErrorInvalidValue,
230
+ * ::cudaErrorInvalidResourceHandle,
231
+ * ::cudaErrorUnknown
232
+ * \notefnerr
233
+ *
234
+ * \sa
235
+ * ::cudaGraphicsUnregisterResource,
236
+ * ::cudaGraphicsMapResources,
237
+ * ::cudaGraphicsResourceGetMappedPointer,
238
+ * ::cuGraphicsGLRegisterBuffer
239
+ */
240
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
241
+
242
+ #ifdef _WIN32
243
+ #ifndef WGL_NV_gpu_affinity
244
+ typedef void* HGPUNV;
245
+ #endif
246
+
247
+ /**
248
+ * \brief Gets the CUDA device associated with hGpu
249
+ *
250
+ * Returns the CUDA device associated with a hGpu, if applicable.
251
+ *
252
+ * \param device - Returns the device associated with hGpu, or -1 if hGpu is
253
+ * not a compute device.
254
+ * \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity
255
+ *
256
+ * \return
257
+ * ::cudaSuccess
258
+ * \notefnerr
259
+ *
260
+ * \sa
261
+ * ::WGL_NV_gpu_affinity,
262
+ * ::cuWGLGetDevice
263
+ */
264
+ extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
265
+ #endif
266
+
267
+ /** @} */ /* END CUDART_OPENGL */
268
+
269
+ /**
270
+ * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
271
+ * This section describes deprecated OpenGL interoperability functionality.
272
+ *
273
+ * @{
274
+ */
275
+
276
+ /**
277
+ * CUDA GL Map Flags
278
+ */
279
+ enum cudaGLMapFlags
280
+ {
281
+ cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */
282
+ cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */
283
+ cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */
284
+ };
285
+
286
+ /**
287
+ * \brief Sets a CUDA device to use OpenGL interoperability
288
+ *
289
+ * \deprecated This function is deprecated as of CUDA 5.0.
290
+ *
291
+ * This function is deprecated and should no longer be used. It is
292
+ * no longer necessary to associate a CUDA device with an OpenGL
293
+ * context in order to achieve maximum interoperability performance.
294
+ *
295
+ * \param device - Device to use for OpenGL interoperability
296
+ *
297
+ * \return
298
+ * ::cudaSuccess,
299
+ * ::cudaErrorInvalidDevice,
300
+ * ::cudaErrorSetOnActiveProcess
301
+ * \notefnerr
302
+ *
303
+ * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
304
+ */
305
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
306
+
307
+ /**
308
+ * \brief Registers a buffer object for access by CUDA
309
+ *
310
+ * \deprecated This function is deprecated as of CUDA 3.0.
311
+ *
312
+ * Registers the buffer object of ID \p bufObj for access by
313
+ * CUDA. This function must be called before CUDA can map the buffer
314
+ * object. The OpenGL context used to create the buffer, or another
315
+ * context from the same share group, must be bound to the current
316
+ * thread when this is called.
317
+ *
318
+ * \param bufObj - Buffer object ID to register
319
+ *
320
+ * \return
321
+ * ::cudaSuccess,
322
+ * ::cudaErrorInitializationError
323
+ * \notefnerr
324
+ *
325
+ * \sa ::cudaGraphicsGLRegisterBuffer
326
+ */
327
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
328
+
329
+ /**
330
+ * \brief Maps a buffer object for access by CUDA
331
+ *
332
+ * \deprecated This function is deprecated as of CUDA 3.0.
333
+ *
334
+ * Maps the buffer object of ID \p bufObj into the address space of
335
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
336
+ * mapping. The buffer must have previously been registered by
337
+ * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
338
+ * by CUDA, any OpenGL operation which references the buffer will
339
+ * result in undefined behavior. The OpenGL context used to create
340
+ * the buffer, or another context from the same share group, must be
341
+ * bound to the current thread when this is called.
342
+ *
343
+ * All streams in the current thread are synchronized with the current
344
+ * GL context.
345
+ *
346
+ * \param devPtr - Returned device pointer to CUDA object
347
+ * \param bufObj - Buffer object ID to map
348
+ *
349
+ * \return
350
+ * ::cudaSuccess,
351
+ * ::cudaErrorMapBufferObjectFailed
352
+ * \notefnerr
353
+ *
354
+ * \sa ::cudaGraphicsMapResources
355
+ */
356
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
357
+
358
+ /**
359
+ * \brief Unmaps a buffer object for access by CUDA
360
+ *
361
+ * \deprecated This function is deprecated as of CUDA 3.0.
362
+ *
363
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA. When
364
+ * a buffer is unmapped, the base address returned by
365
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
366
+ * the address result in undefined behavior. The OpenGL context used
367
+ * to create the buffer, or another context from the same share group,
368
+ * must be bound to the current thread when this is called.
369
+ *
370
+ * All streams in the current thread are synchronized with the current
371
+ * GL context.
372
+ *
373
+ * \param bufObj - Buffer object to unmap
374
+ *
375
+ * \return
376
+ * ::cudaSuccess,
377
+ * ::cudaErrorUnmapBufferObjectFailed
378
+ * \notefnerr
379
+ *
380
+ * \sa ::cudaGraphicsUnmapResources
381
+ */
382
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
383
+
384
+ /**
385
+ * \brief Unregisters a buffer object for access by CUDA
386
+ *
387
+ * \deprecated This function is deprecated as of CUDA 3.0.
388
+ *
389
+ * Unregisters the buffer object of ID \p bufObj for access by CUDA
390
+ * and releases any CUDA resources associated with the buffer. Once a
391
+ * buffer is unregistered, it may no longer be mapped by CUDA. The GL
392
+ * context used to create the buffer, or another context from the
393
+ * same share group, must be bound to the current thread when this is
394
+ * called.
395
+ *
396
+ * \param bufObj - Buffer object to unregister
397
+ *
398
+ * \return
399
+ * ::cudaSuccess
400
+ * \notefnerr
401
+ *
402
+ * \sa ::cudaGraphicsUnregisterResource
403
+ */
404
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
405
+
406
+ /**
407
+ * \brief Set usage flags for mapping an OpenGL buffer
408
+ *
409
+ * \deprecated This function is deprecated as of CUDA 3.0.
410
+ *
411
+ * Set flags for mapping the OpenGL buffer \p bufObj
412
+ *
413
+ * Changes to flags will take effect the next time \p bufObj is mapped.
414
+ * The \p flags argument may be any of the following:
415
+ *
416
+ * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
417
+ * be used. It is therefore assumed that this buffer will be read from and
418
+ * written to by CUDA kernels. This is the default value.
419
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
420
+ * buffer will not write to the buffer.
421
+ * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
422
+ * this buffer will not read from the buffer and will write over the
423
+ * entire contents of the buffer, so none of the data previously stored in
424
+ * the buffer will be preserved.
425
+ *
426
+ * If \p bufObj has not been registered for use with CUDA, then
427
+ * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
428
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
429
+ *
430
+ * \param bufObj - Registered buffer object to set flags for
431
+ * \param flags - Parameters for buffer mapping
432
+ *
433
+ * \return
434
+ * ::cudaSuccess,
435
+ * ::cudaErrorInvalidValue,
436
+ * ::cudaErrorInvalidResourceHandle,
437
+ * ::cudaErrorUnknown
438
+ * \notefnerr
439
+ *
440
+ * \sa ::cudaGraphicsResourceSetMapFlags
441
+ */
442
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
443
+
444
+ /**
445
+ * \brief Maps a buffer object for access by CUDA
446
+ *
447
+ * \deprecated This function is deprecated as of CUDA 3.0.
448
+ *
449
+ * Maps the buffer object of ID \p bufObj into the address space of
450
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
451
+ * mapping. The buffer must have previously been registered by
452
+ * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped
453
+ * by CUDA, any OpenGL operation which references the buffer will
454
+ * result in undefined behavior. The OpenGL context used to create
455
+ * the buffer, or another context from the same share group, must be
456
+ * bound to the current thread when this is called.
457
+ *
458
+ * Stream /p stream is synchronized with the current GL context.
459
+ *
460
+ * \param devPtr - Returned device pointer to CUDA object
461
+ * \param bufObj - Buffer object ID to map
462
+ * \param stream - Stream to synchronize
463
+ *
464
+ * \return
465
+ * ::cudaSuccess,
466
+ * ::cudaErrorMapBufferObjectFailed
467
+ * \notefnerr
468
+ *
469
+ * \sa ::cudaGraphicsMapResources
470
+ */
471
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
472
+
473
+ /**
474
+ * \brief Unmaps a buffer object for access by CUDA
475
+ *
476
+ * \deprecated This function is deprecated as of CUDA 3.0.
477
+ *
478
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA. When
479
+ * a buffer is unmapped, the base address returned by
480
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
481
+ * the address result in undefined behavior. The OpenGL context used
482
+ * to create the buffer, or another context from the same share group,
483
+ * must be bound to the current thread when this is called.
484
+ *
485
+ * Stream /p stream is synchronized with the current GL context.
486
+ *
487
+ * \param bufObj - Buffer object to unmap
488
+ * \param stream - Stream to synchronize
489
+ *
490
+ * \return
491
+ * ::cudaSuccess,
492
+ * ::cudaErrorUnmapBufferObjectFailed
493
+ * \notefnerr
494
+ *
495
+ * \sa ::cudaGraphicsUnmapResources
496
+ */
497
+ extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
498
+
499
+ /** @} */ /* END CUDART_OPENGL_DEPRECATED */
500
+
501
+ #if defined(__cplusplus)
502
+ }
503
+ #endif /* __cplusplus */
504
+
505
+ #undef __CUDA_DEPRECATED
506
+
507
+ #endif /* __CUDA_GL_INTEROP_H__ */
508
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime_api.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_vdpau_interop.h ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_VDPAU_INTEROP_H__)
51
+ #define __CUDA_VDPAU_INTEROP_H__
52
+
53
+ #include "cuda_runtime_api.h"
54
+
55
+ #include <vdpau/vdpau.h>
56
+
57
+ #if defined(__cplusplus)
58
+ extern "C" {
59
+ #endif /* __cplusplus */
60
+
61
+ /**
62
+ * \addtogroup CUDART_VDPAU VDPAU Interoperability
63
+ * This section describes the VDPAU interoperability functions of the CUDA
64
+ * runtime application programming interface.
65
+ *
66
+ * @{
67
+ */
68
+
69
+ /**
70
+ * \brief Gets the CUDA device associated with a VdpDevice.
71
+ *
72
+ * Returns the CUDA device associated with a VdpDevice, if applicable.
73
+ *
74
+ * \param device - Returns the device associated with vdpDevice, or -1 if
75
+ * the device associated with vdpDevice is not a compute device.
76
+ * \param vdpDevice - A VdpDevice handle
77
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
78
+ *
79
+ * \return
80
+ * ::cudaSuccess
81
+ * \notefnerr
82
+ *
83
+ * \sa
84
+ * ::cudaVDPAUSetVDPAUDevice,
85
+ * ::cuVDPAUGetDevice
86
+ */
87
+ extern __host__ cudaError_t CUDARTAPI cudaVDPAUGetDevice(int *device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
88
+
89
+ /**
90
+ * \brief Sets a CUDA device to use VDPAU interoperability
91
+ *
92
+ * Records \p vdpDevice as the VdpDevice for VDPAU interoperability
93
+ * with the CUDA device \p device and sets \p device as the current
94
+ * device for the calling host thread.
95
+ *
96
+ * If \p device has already been initialized then this call will fail
97
+ * with the error ::cudaErrorSetOnActiveProcess. In this case it is
98
+ * necessary to reset \p device using ::cudaDeviceReset() before
99
+ * VDPAU interoperability on \p device may be enabled.
100
+ *
101
+ * \param device - Device to use for VDPAU interoperability
102
+ * \param vdpDevice - The VdpDevice to interoperate with
103
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
104
+ *
105
+ * \return
106
+ * ::cudaSuccess,
107
+ * ::cudaErrorInvalidDevice,
108
+ * ::cudaErrorSetOnActiveProcess
109
+ * \notefnerr
110
+ *
111
+ * \sa ::cudaGraphicsVDPAURegisterVideoSurface,
112
+ * ::cudaGraphicsVDPAURegisterOutputSurface,
113
+ * ::cudaDeviceReset
114
+ */
115
+ extern __host__ cudaError_t CUDARTAPI cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
116
+
117
+ /**
118
+ * \brief Register a VdpVideoSurface object
119
+ *
120
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by CUDA.
121
+ * A handle to the registered object is returned as \p resource.
122
+ * The surface's intended usage is specified using \p flags, as follows:
123
+ *
124
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
125
+ * resource will be used. It is therefore assumed that this resource will be
126
+ * read from and written to by CUDA. This is the default value.
127
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
128
+ * will not write to this resource.
129
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
130
+ * CUDA will not read from this resource and will write over the
131
+ * entire contents of the resource, so none of the data previously
132
+ * stored in the resource will be preserved.
133
+ *
134
+ * \param resource - Pointer to the returned object handle
135
+ * \param vdpSurface - VDPAU object to be registered
136
+ * \param flags - Map flags
137
+ *
138
+ * \return
139
+ * ::cudaSuccess,
140
+ * ::cudaErrorInvalidDevice,
141
+ * ::cudaErrorInvalidValue,
142
+ * ::cudaErrorInvalidResourceHandle,
143
+ * ::cudaErrorUnknown
144
+ * \notefnerr
145
+ *
146
+ * \sa
147
+ * ::cudaVDPAUSetVDPAUDevice,
148
+ * ::cudaGraphicsUnregisterResource,
149
+ * ::cudaGraphicsSubResourceGetMappedArray,
150
+ * ::cuGraphicsVDPAURegisterVideoSurface
151
+ */
152
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterVideoSurface(struct cudaGraphicsResource **resource, VdpVideoSurface vdpSurface, unsigned int flags);
153
+
154
+ /**
155
+ * \brief Register a VdpOutputSurface object
156
+ *
157
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by CUDA.
158
+ * A handle to the registered object is returned as \p resource.
159
+ * The surface's intended usage is specified using \p flags, as follows:
160
+ *
161
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
162
+ * resource will be used. It is therefore assumed that this resource will be
163
+ * read from and written to by CUDA. This is the default value.
164
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
165
+ * will not write to this resource.
166
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
167
+ * CUDA will not read from this resource and will write over the
168
+ * entire contents of the resource, so none of the data previously
169
+ * stored in the resource will be preserved.
170
+ *
171
+ * \param resource - Pointer to the returned object handle
172
+ * \param vdpSurface - VDPAU object to be registered
173
+ * \param flags - Map flags
174
+ *
175
+ * \return
176
+ * ::cudaSuccess,
177
+ * ::cudaErrorInvalidDevice,
178
+ * ::cudaErrorInvalidValue,
179
+ * ::cudaErrorInvalidResourceHandle,
180
+ * ::cudaErrorUnknown
181
+ * \notefnerr
182
+ *
183
+ * \sa
184
+ * ::cudaVDPAUSetVDPAUDevice,
185
+ * ::cudaGraphicsUnregisterResource,
186
+ * ::cudaGraphicsSubResourceGetMappedArray,
187
+ * ::cuGraphicsVDPAURegisterOutputSurface
188
+ */
189
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterOutputSurface(struct cudaGraphicsResource **resource, VdpOutputSurface vdpSurface, unsigned int flags);
190
+
191
+ /** @} */ /* END CUDART_VDPAU */
192
+
193
+ #if defined(__cplusplus)
194
+ }
195
+ #endif /* __cplusplus */
196
+
197
+ #endif /* __CUDA_VDPAU_INTEROP_H__ */
198
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/device_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_constants.h ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__MATH_CONSTANTS_H__)
51
+ #define __MATH_CONSTANTS_H__
52
+
53
+ /* single precision constants */
54
+ #define CUDART_INF_F __int_as_float(0x7f800000U)
55
+ #define CUDART_NAN_F __int_as_float(0x7fffffffU)
56
+ #define CUDART_MIN_DENORM_F __int_as_float(0x00000001U)
57
+ #define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
58
+ #define CUDART_NEG_ZERO_F __int_as_float(0x80000000U)
59
+ #define CUDART_ZERO_F 0.0F
60
+ #define CUDART_ONE_F 1.0F
61
+ #define CUDART_SQRT_HALF_F 0.707106781F
62
+ #define CUDART_SQRT_HALF_HI_F 0.707106781F
63
+ #define CUDART_SQRT_HALF_LO_F 1.210161749e-08F
64
+ #define CUDART_SQRT_TWO_F 1.414213562F
65
+ #define CUDART_THIRD_F 0.333333333F
66
+ #define CUDART_PIO4_F 0.785398163F
67
+ #define CUDART_PIO2_F 1.570796327F
68
+ #define CUDART_3PIO4_F 2.356194490F
69
+ #define CUDART_2_OVER_PI_F 0.636619772F
70
+ #define CUDART_SQRT_2_OVER_PI_F 0.797884561F
71
+ #define CUDART_PI_F 3.141592654F
72
+ #define CUDART_L2E_F 1.442695041F
73
+ #define CUDART_L2T_F 3.321928094F
74
+ #define CUDART_LG2_F 0.301029996F
75
+ #define CUDART_LGE_F 0.434294482F
76
+ #define CUDART_LN2_F 0.693147181F
77
+ #define CUDART_LNT_F 2.302585093F
78
+ #define CUDART_LNPI_F 1.144729886F
79
+ #define CUDART_TWO_TO_M126_F 1.175494351e-38F
80
+ #define CUDART_TWO_TO_126_F 8.507059173e37F
81
+ #define CUDART_NORM_HUGE_F 3.402823466e38F
82
+ #define CUDART_TWO_TO_23_F 8388608.0F
83
+ #define CUDART_TWO_TO_24_F 16777216.0F
84
+ #define CUDART_TWO_TO_31_F 2147483648.0F
85
+ #define CUDART_TWO_TO_32_F 4294967296.0F
86
+ #define CUDART_REMQUO_BITS_F 3U
87
+ #define CUDART_REMQUO_MASK_F (~((~0U)<<CUDART_REMQUO_BITS_F))
88
+ #define CUDART_TRIG_PLOSS_F 105615.0F
89
+
90
+ /* double precision constants */
91
+ #define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL)
92
+ #define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL)
93
+ #define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
94
+ #define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
95
+ #define CUDART_ZERO 0.0
96
+ #define CUDART_ONE 1.0
97
+ #define CUDART_SQRT_TWO 1.4142135623730951e+0
98
+ #define CUDART_SQRT_HALF 7.0710678118654757e-1
99
+ #define CUDART_SQRT_HALF_HI 7.0710678118654757e-1
100
+ #define CUDART_SQRT_HALF_LO (-4.8336466567264567e-17)
101
+ #define CUDART_THIRD 3.3333333333333333e-1
102
+ #define CUDART_TWOTHIRD 6.6666666666666667e-1
103
+ #define CUDART_PIO4 7.8539816339744828e-1
104
+ #define CUDART_PIO4_HI 7.8539816339744828e-1
105
+ #define CUDART_PIO4_LO 3.0616169978683830e-17
106
+ #define CUDART_PIO2 1.5707963267948966e+0
107
+ #define CUDART_PIO2_HI 1.5707963267948966e+0
108
+ #define CUDART_PIO2_LO 6.1232339957367660e-17
109
+ #define CUDART_3PIO4 2.3561944901923448e+0
110
+ #define CUDART_2_OVER_PI 6.3661977236758138e-1
111
+ #define CUDART_PI 3.1415926535897931e+0
112
+ #define CUDART_PI_HI 3.1415926535897931e+0
113
+ #define CUDART_PI_LO 1.2246467991473532e-16
114
+ #define CUDART_SQRT_2PI 2.5066282746310007e+0
115
+ #define CUDART_SQRT_2PI_HI 2.5066282746310007e+0
116
+ #define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)
117
+ #define CUDART_SQRT_PIO2 1.2533141373155003e+0
118
+ #define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0
119
+ #define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17)
120
+ #define CUDART_SQRT_2OPI 7.9788456080286536e-1
121
+ #define CUDART_L2E 1.4426950408889634e+0
122
+ #define CUDART_L2E_HI 1.4426950408889634e+0
123
+ #define CUDART_L2E_LO 2.0355273740931033e-17
124
+ #define CUDART_L2T 3.3219280948873622e+0
125
+ #define CUDART_LG2 3.0102999566398120e-1
126
+ #define CUDART_LG2_HI 3.0102999566398120e-1
127
+ #define CUDART_LG2_LO (-2.8037281277851704e-18)
128
+ #define CUDART_LGE 4.3429448190325182e-1
129
+ #define CUDART_LGE_HI 4.3429448190325182e-1
130
+ #define CUDART_LGE_LO 1.09831965021676510e-17
131
+ #define CUDART_LN2 6.9314718055994529e-1
132
+ #define CUDART_LN2_HI 6.9314718055994529e-1
133
+ #define CUDART_LN2_LO 2.3190468138462996e-17
134
+ #define CUDART_LNT 2.3025850929940459e+0
135
+ #define CUDART_LNT_HI 2.3025850929940459e+0
136
+ #define CUDART_LNT_LO (-2.1707562233822494e-16)
137
+ #define CUDART_LNPI 1.1447298858494002e+0
138
+ #define CUDART_LN2_X_1024 7.0978271289338397e+2
139
+ #define CUDART_LN2_X_1025 7.1047586007394398e+2
140
+ #define CUDART_LN2_X_1075 7.4513321910194122e+2
141
+ #define CUDART_LG2_X_1024 3.0825471555991675e+2
142
+ #define CUDART_LG2_X_1075 3.2360724533877976e+2
143
+ #define CUDART_TWO_TO_23 8388608.0
144
+ #define CUDART_TWO_TO_52 4503599627370496.0
145
+ #define CUDART_TWO_TO_53 9007199254740992.0
146
+ #define CUDART_TWO_TO_54 18014398509481984.0
147
+ #define CUDART_TWO_TO_M54 5.5511151231257827e-17
148
+ #define CUDART_TWO_TO_M1022 2.22507385850720140e-308
149
+ #define CUDART_TRIG_PLOSS 2147483648.0
150
+ #define CUDART_DBL2INT_CVT 6755399441055744.0
151
+
152
+ #endif /* !__MATH_CONSTANTS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.h ADDED
@@ -0,0 +1,1551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_INTRINSICS_H__)
51
+ #define __SM_20_INTRINSICS_H__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_20_INTRINSICS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ #ifndef __CUDA_ARCH__
70
+ #define __DEF_IF_HOST { }
71
+ #else /* !__CUDA_ARCH__ */
72
+ #define __DEF_IF_HOST ;
73
+ #endif /* __CUDA_ARCH__ */
74
+
75
+ #if defined(_WIN32)
76
+ # define __DEPRECATED__(msg) __declspec(deprecated(msg))
77
+ #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
78
+ # define __DEPRECATED__(msg) __attribute__((deprecated))
79
+ #else
80
+ # define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
81
+ #endif
82
+
83
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
84
+ #define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
85
+ "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
86
+ #else
87
+ #define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
88
+ #endif
89
+
90
+ extern "C"
91
+ {
92
+ extern __device__ __device_builtin__ void __threadfence_system(void);
93
+ /**
94
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
95
+ * \brief Divide two floating-point values in round-to-nearest-even mode.
96
+ *
97
+ * Divides two floating-point values \p x by \p y in round-to-nearest-even mode.
98
+ *
99
+ * \return Returns \p x / \p y.
100
+ *
101
+ * \note_accuracy_double
102
+ * \note_requires_fermi
103
+ */
104
+ extern __device__ __device_builtin__ double __ddiv_rn(double x, double y);
105
+ /**
106
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
107
+ * \brief Divide two floating-point values in round-towards-zero mode.
108
+ *
109
+ * Divides two floating-point values \p x by \p y in round-towards-zero mode.
110
+ *
111
+ * \return Returns \p x / \p y.
112
+ *
113
+ * \note_accuracy_double
114
+ * \note_requires_fermi
115
+ */
116
+ extern __device__ __device_builtin__ double __ddiv_rz(double x, double y);
117
+ /**
118
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
119
+ * \brief Divide two floating-point values in round-up mode.
120
+ *
121
+ * Divides two floating-point values \p x by \p y in round-up (to positive infinity) mode.
122
+ *
123
+ * \return Returns \p x / \p y.
124
+ *
125
+ * \note_accuracy_double
126
+ * \note_requires_fermi
127
+ */
128
+ extern __device__ __device_builtin__ double __ddiv_ru(double x, double y);
129
+ /**
130
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
131
+ * \brief Divide two floating-point values in round-down mode.
132
+ *
133
+ * Divides two floating-point values \p x by \p y in round-down (to negative infinity) mode.
134
+ *
135
+ * \return Returns \p x / \p y.
136
+ *
137
+ * \note_accuracy_double
138
+ * \note_requires_fermi
139
+ */
140
+ extern __device__ __device_builtin__ double __ddiv_rd(double x, double y);
141
+ /**
142
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
143
+ * \brief Compute
144
+ * \latexonly $\frac{1}{x}$ \endlatexonly
145
+ * \xmlonly
146
+ * <d4p_MathML outputclass="xmlonly">
147
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
148
+ * <m:mfrac>
149
+ * <m:mn>1</m:mn>
150
+ * <m:mi>x</m:mi>
151
+ * </m:mfrac>
152
+ * </m:math>
153
+ * </d4p_MathML>
154
+ * \endxmlonly
155
+ * in round-to-nearest-even mode.
156
+ *
157
+ * Compute the reciprocal of \p x in round-to-nearest-even mode.
158
+ *
159
+ * \return Returns
160
+ * \latexonly $\frac{1}{x}$ \endlatexonly
161
+ * \xmlonly
162
+ * <d4p_MathML outputclass="xmlonly">
163
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
164
+ * <m:mfrac>
165
+ * <m:mn>1</m:mn>
166
+ * <m:mi>x</m:mi>
167
+ * </m:mfrac>
168
+ * </m:math>
169
+ * </d4p_MathML>\endxmlonly.
170
+ *
171
+ * \note_accuracy_double
172
+ * \note_requires_fermi
173
+ */
174
+ extern __device__ __device_builtin__ double __drcp_rn(double x);
175
+ /**
176
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
177
+ * \brief Compute
178
+ * \latexonly $\frac{1}{x}$ \endlatexonly
179
+ * \xmlonly
180
+ * <d4p_MathML outputclass="xmlonly">
181
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
182
+ * <m:mfrac>
183
+ * <m:mn>1</m:mn>
184
+ * <m:mi>x</m:mi>
185
+ * </m:mfrac>
186
+ * </m:math>
187
+ * </d4p_MathML>
188
+ * \endxmlonly
189
+ * in round-towards-zero mode.
190
+ *
191
+ * Compute the reciprocal of \p x in round-towards-zero mode.
192
+ *
193
+ * \return Returns
194
+ * \latexonly $\frac{1}{x}$ \endlatexonly
195
+ * \xmlonly
196
+ * <d4p_MathML outputclass="xmlonly">
197
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
198
+ * <m:mfrac>
199
+ * <m:mn>1</m:mn>
200
+ * <m:mi>x</m:mi>
201
+ * </m:mfrac>
202
+ * </m:math>
203
+ * </d4p_MathML>\endxmlonly.
204
+ *
205
+ * \note_accuracy_double
206
+ * \note_requires_fermi
207
+ */
208
+ extern __device__ __device_builtin__ double __drcp_rz(double x);
209
+ /**
210
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
211
+ * \brief Compute
212
+ * \latexonly $\frac{1}{x}$ \endlatexonly
213
+ * \xmlonly
214
+ * <d4p_MathML outputclass="xmlonly">
215
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
216
+ * <m:mfrac>
217
+ * <m:mn>1</m:mn>
218
+ * <m:mi>x</m:mi>
219
+ * </m:mfrac>
220
+ * </m:math>
221
+ * </d4p_MathML>
222
+ * \endxmlonly
223
+ * in round-up mode.
224
+ *
225
+ * Compute the reciprocal of \p x in round-up (to positive infinity) mode.
226
+ *
227
+ * \return Returns
228
+ * \latexonly $\frac{1}{x}$ \endlatexonly
229
+ * \xmlonly
230
+ * <d4p_MathML outputclass="xmlonly">
231
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
232
+ * <m:mfrac>
233
+ * <m:mn>1</m:mn>
234
+ * <m:mi>x</m:mi>
235
+ * </m:mfrac>
236
+ * </m:math>
237
+ * </d4p_MathML>\endxmlonly.
238
+ *
239
+ * \note_accuracy_double
240
+ * \note_requires_fermi
241
+ */
242
+ extern __device__ __device_builtin__ double __drcp_ru(double x);
243
+ /**
244
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
245
+ * \brief Compute
246
+ * \latexonly $\frac{1}{x}$ \endlatexonly
247
+ * \xmlonly
248
+ * <d4p_MathML outputclass="xmlonly">
249
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
250
+ * <m:mfrac>
251
+ * <m:mn>1</m:mn>
252
+ * <m:mi>x</m:mi>
253
+ * </m:mfrac>
254
+ * </m:math>
255
+ * </d4p_MathML>
256
+ * \endxmlonly
257
+ * in round-down mode.
258
+ *
259
+ * Compute the reciprocal of \p x in round-down (to negative infinity) mode.
260
+ *
261
+ * \return Returns
262
+ * \latexonly $\frac{1}{x}$ \endlatexonly
263
+ * \xmlonly
264
+ * <d4p_MathML outputclass="xmlonly">
265
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
266
+ * <m:mfrac>
267
+ * <m:mn>1</m:mn>
268
+ * <m:mi>x</m:mi>
269
+ * </m:mfrac>
270
+ * </m:math>
271
+ * </d4p_MathML>\endxmlonly.
272
+ *
273
+ * \note_accuracy_double
274
+ * \note_requires_fermi
275
+ */
276
+ extern __device__ __device_builtin__ double __drcp_rd(double x);
277
+ /**
278
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
279
+ * \brief Compute
280
+ * \latexonly $\sqrt{x}$ \endlatexonly
281
+ * \xmlonly
282
+ * <d4p_MathML outputclass="xmlonly">
283
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
284
+ * <m:msqrt>
285
+ * <m:mi>x</m:mi>
286
+ * </m:msqrt>
287
+ * </m:math>
288
+ * </d4p_MathML>
289
+ * \endxmlonly
290
+ * in round-to-nearest-even mode.
291
+ *
292
+ * Compute the square root of \p x in round-to-nearest-even mode.
293
+ *
294
+ * \return Returns
295
+ * \latexonly $\sqrt{x}$ \endlatexonly
296
+ * \xmlonly
297
+ * <d4p_MathML outputclass="xmlonly">
298
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
299
+ * <m:msqrt>
300
+ * <m:mi>x</m:mi>
301
+ * </m:msqrt>
302
+ * </m:math>
303
+ * </d4p_MathML>\endxmlonly.
304
+ *
305
+ * \note_accuracy_double
306
+ * \note_requires_fermi
307
+ */
308
+ extern __device__ __device_builtin__ double __dsqrt_rn(double x);
309
+ /**
310
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
311
+ * \brief Compute
312
+ * \latexonly $\sqrt{x}$ \endlatexonly
313
+ * \xmlonly
314
+ * <d4p_MathML outputclass="xmlonly">
315
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
316
+ * <m:msqrt>
317
+ * <m:mi>x</m:mi>
318
+ * </m:msqrt>
319
+ * </m:math>
320
+ * </d4p_MathML>
321
+ * \endxmlonly
322
+ * in round-towards-zero mode.
323
+ *
324
+ * Compute the square root of \p x in round-towards-zero mode.
325
+ *
326
+ * \return Returns
327
+ * \latexonly $\sqrt{x}$ \endlatexonly
328
+ * \xmlonly
329
+ * <d4p_MathML outputclass="xmlonly">
330
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
331
+ * <m:msqrt>
332
+ * <m:mi>x</m:mi>
333
+ * </m:msqrt>
334
+ * </m:math>
335
+ * </d4p_MathML>\endxmlonly.
336
+ *
337
+ * \note_accuracy_double
338
+ * \note_requires_fermi
339
+ */
340
+ extern __device__ __device_builtin__ double __dsqrt_rz(double x);
341
+ /**
342
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
343
+ * \brief Compute
344
+ * \latexonly $\sqrt{x}$ \endlatexonly
345
+ * \xmlonly
346
+ * <d4p_MathML outputclass="xmlonly">
347
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
348
+ * <m:msqrt>
349
+ * <m:mi>x</m:mi>
350
+ * </m:msqrt>
351
+ * </m:math>
352
+ * </d4p_MathML>
353
+ * \endxmlonly
354
+ * in round-up mode.
355
+ *
356
+ * Compute the square root of \p x in round-up (to positive infinity) mode.
357
+ *
358
+ * \return Returns
359
+ * \latexonly $\sqrt{x}$ \endlatexonly
360
+ * \xmlonly
361
+ * <d4p_MathML outputclass="xmlonly">
362
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
363
+ * <m:msqrt>
364
+ * <m:mi>x</m:mi>
365
+ * </m:msqrt>
366
+ * </m:math>
367
+ * </d4p_MathML>\endxmlonly.
368
+ *
369
+ * \note_accuracy_double
370
+ * \note_requires_fermi
371
+ */
372
+ extern __device__ __device_builtin__ double __dsqrt_ru(double x);
373
+ /**
374
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
375
+ * \brief Compute
376
+ * \latexonly $\sqrt{x}$ \endlatexonly
377
+ * \xmlonly
378
+ * <d4p_MathML outputclass="xmlonly">
379
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
380
+ * <m:msqrt>
381
+ * <m:mi>x</m:mi>
382
+ * </m:msqrt>
383
+ * </m:math>
384
+ * </d4p_MathML>
385
+ * \endxmlonly
386
+ * in round-down mode.
387
+ *
388
+ * Compute the square root of \p x in round-down (to negative infinity) mode.
389
+ *
390
+ * \return Returns
391
+ * \latexonly $\sqrt{x}$ \endlatexonly
392
+ * \xmlonly
393
+ * <d4p_MathML outputclass="xmlonly">
394
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
395
+ * <m:msqrt>
396
+ * <m:mi>x</m:mi>
397
+ * </m:msqrt>
398
+ * </m:math>
399
+ * </d4p_MathML>\endxmlonly.
400
+ *
401
+ * \note_accuracy_double
402
+ * \note_requires_fermi
403
+ */
404
+ extern __device__ __device_builtin__ double __dsqrt_rd(double x);
405
+ extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int __ballot(int);
406
+ extern __device__ __device_builtin__ int __syncthreads_count(int);
407
+ extern __device__ __device_builtin__ int __syncthreads_and(int);
408
+ extern __device__ __device_builtin__ int __syncthreads_or(int);
409
+ extern __device__ __device_builtin__ long long int clock64(void);
410
+
411
+
412
+ /**
413
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
414
+ * \brief Compute fused multiply-add operation in round-to-nearest-even mode, ignore \p -ftz=true compiler flag
415
+ *
416
+ * Behavior is the same as ::__fmaf_rn(\p x, \p y, \p z), the difference is in
417
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
418
+ */
419
+ extern __device__ __device_builtin__ float __fmaf_ieee_rn(float x, float y, float z);
420
+
421
+ /**
422
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
423
+ * \brief Compute fused multiply-add operation in round-down mode, ignore \p -ftz=true compiler flag
424
+ *
425
+ * Behavior is the same as ::__fmaf_rd(\p x, \p y, \p z), the difference is in
426
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
427
+ */
428
+ extern __device__ __device_builtin__ float __fmaf_ieee_rd(float x, float y, float z);
429
+
430
+ /**
431
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
432
+ * \brief Compute fused multiply-add operation in round-up mode, ignore \p -ftz=true compiler flag
433
+ *
434
+ * Behavior is the same as ::__fmaf_ru(\p x, \p y, \p z), the difference is in
435
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
436
+ */
437
+ extern __device__ __device_builtin__ float __fmaf_ieee_ru(float x, float y, float z);
438
+
439
+ /**
440
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
441
+ * \brief Compute fused multiply-add operation in round-towards-zero mode, ignore \p -ftz=true compiler flag
442
+ *
443
+ * Behavior is the same as ::__fmaf_rz(\p x, \p y, \p z), the difference is in
444
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
445
+ */
446
+ extern __device__ __device_builtin__ float __fmaf_ieee_rz(float x, float y, float z);
447
+
448
+
449
+ // SM_13 intrinsics
450
+
451
+ /**
452
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
453
+ * \brief Reinterpret bits in a double as a 64-bit signed integer.
454
+ *
455
+ * Reinterpret the bits in the double-precision floating-point value \p x
456
+ * as a signed 64-bit integer.
457
+ * \return Returns reinterpreted value.
458
+ */
459
+ extern __device__ __device_builtin__ long long int __double_as_longlong(double x);
460
+ /**
461
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
462
+ * \brief Reinterpret bits in a 64-bit signed integer as a double.
463
+ *
464
+ * Reinterpret the bits in the 64-bit signed integer value \p x as
465
+ * a double-precision floating-point value.
466
+ * \return Returns reinterpreted value.
467
+ */
468
+ extern __device__ __device_builtin__ double __longlong_as_double(long long int x);
469
+ /**
470
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
471
+ * \brief Compute
472
+ * \latexonly $x \times y + z$ \endlatexonly
473
+ * \xmlonly
474
+ * <d4p_MathML outputclass="xmlonly">
475
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
476
+ * <m:mi>x</m:mi>
477
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
478
+ * <m:mi>y</m:mi>
479
+ * <m:mo>+</m:mo>
480
+ * <m:mi>z</m:mi>
481
+ * </m:math>
482
+ * </d4p_MathML>
483
+ * \endxmlonly
484
+ * as a single operation in round-to-nearest-even mode.
485
+ *
486
+ * Computes the value of
487
+ * \latexonly $x \times y + z$ \endlatexonly
488
+ * \xmlonly
489
+ * <d4p_MathML outputclass="xmlonly">
490
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
491
+ * <m:mi>x</m:mi>
492
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
493
+ * <m:mi>y</m:mi>
494
+ * <m:mo>+</m:mo>
495
+ * <m:mi>z</m:mi>
496
+ * </m:math>
497
+ * </d4p_MathML>
498
+ * \endxmlonly
499
+ * as a single ternary operation, rounding the
500
+ * result once in round-to-nearest-even mode.
501
+ *
502
+ * \return Returns the rounded value of
503
+ * \latexonly $x \times y + z$ \endlatexonly
504
+ * \xmlonly
505
+ * <d4p_MathML outputclass="xmlonly">
506
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
507
+ * <m:mi>x</m:mi>
508
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
509
+ * <m:mi>y</m:mi>
510
+ * <m:mo>+</m:mo>
511
+ * <m:mi>z</m:mi>
512
+ * </m:math>
513
+ * </d4p_MathML>
514
+ * \endxmlonly
515
+ * as a single operation.
516
+ * - fmaf(
517
+ * \latexonly $\pm \infty$ \endlatexonly
518
+ * \xmlonly
519
+ * <d4p_MathML outputclass="xmlonly">
520
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
521
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
522
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
523
+ * </m:math>
524
+ * </d4p_MathML>
525
+ * \endxmlonly
526
+ * ,
527
+ * \latexonly $\pm 0$ \endlatexonly
528
+ * \xmlonly
529
+ * <d4p_MathML outputclass="xmlonly">
530
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
531
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
532
+ * <m:mn>0</m:mn>
533
+ * </m:math>
534
+ * </d4p_MathML>
535
+ * \endxmlonly
536
+ * , \p z) returns NaN.
537
+ * - fmaf(
538
+ * \latexonly $\pm 0$ \endlatexonly
539
+ * \xmlonly
540
+ * <d4p_MathML outputclass="xmlonly">
541
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
542
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
543
+ * <m:mn>0</m:mn>
544
+ * </m:math>
545
+ * </d4p_MathML>
546
+ * \endxmlonly
547
+ * ,
548
+ * \latexonly $\pm \infty$ \endlatexonly
549
+ * \xmlonly
550
+ * <d4p_MathML outputclass="xmlonly">
551
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
552
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
553
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
554
+ * </m:math>
555
+ * </d4p_MathML>
556
+ * \endxmlonly
557
+ * , \p z) returns NaN.
558
+ * - fmaf(\p x, \p y,
559
+ * \latexonly $-\infty$ \endlatexonly
560
+ * \xmlonly
561
+ * <d4p_MathML outputclass="xmlonly">
562
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
563
+ * <m:mo>-</m:mo>
564
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
565
+ * </m:math>
566
+ * </d4p_MathML>
567
+ * \endxmlonly
568
+ * ) returns NaN if
569
+ * \latexonly $x \times y$ \endlatexonly
570
+ * \xmlonly
571
+ * <d4p_MathML outputclass="xmlonly">
572
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
573
+ * <m:mi>x</m:mi>
574
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
575
+ * <m:mi>y</m:mi>
576
+ * </m:math>
577
+ * </d4p_MathML>
578
+ * \endxmlonly
579
+ * is an exact
580
+ * \latexonly $+\infty$ \endlatexonly
581
+ * \xmlonly
582
+ * <d4p_MathML outputclass="xmlonly">
583
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
584
+ * <m:mo>+</m:mo>
585
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
586
+ * </m:math>
587
+ * </d4p_MathML>
588
+ * \endxmlonly
589
+ * .
590
+ * - fmaf(\p x, \p y,
591
+ * \latexonly $+\infty$ \endlatexonly
592
+ * \xmlonly
593
+ * <d4p_MathML outputclass="xmlonly">
594
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
595
+ * <m:mo>+</m:mo>
596
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
597
+ * </m:math>
598
+ * </d4p_MathML>
599
+ * \endxmlonly
600
+ * ) returns NaN if
601
+ * \latexonly $x \times y$ \endlatexonly
602
+ * \xmlonly
603
+ * <d4p_MathML outputclass="xmlonly">
604
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
605
+ * <m:mi>x</m:mi>
606
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
607
+ * <m:mi>y</m:mi>
608
+ * </m:math>
609
+ * </d4p_MathML>
610
+ * \endxmlonly
611
+ * is an exact
612
+ * \latexonly $-\infty$ \endlatexonly
613
+ * \xmlonly
614
+ * <d4p_MathML outputclass="xmlonly">
615
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
616
+ * <m:mo>-</m:mo>
617
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
618
+ * </m:math>
619
+ * </d4p_MathML>
620
+ * \endxmlonly
621
+ * .
622
+ *
623
+ * \note_accuracy_double
624
+ */
625
+ extern __device__ __device_builtin__ double __fma_rn(double x, double y, double z);
626
+ /**
627
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
628
+ * \brief Compute
629
+ * \latexonly $x \times y + z$ \endlatexonly
630
+ * \xmlonly
631
+ * <d4p_MathML outputclass="xmlonly">
632
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
633
+ * <m:mi>x</m:mi>
634
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
635
+ * <m:mi>y</m:mi>
636
+ * <m:mo>+</m:mo>
637
+ * <m:mi>z</m:mi>
638
+ * </m:math>
639
+ * </d4p_MathML>
640
+ * \endxmlonly
641
+ * as a single operation in round-towards-zero mode.
642
+ *
643
+ * Computes the value of
644
+ * \latexonly $x \times y + z$ \endlatexonly
645
+ * \xmlonly
646
+ * <d4p_MathML outputclass="xmlonly">
647
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
648
+ * <m:mi>x</m:mi>
649
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
650
+ * <m:mi>y</m:mi>
651
+ * <m:mo>+</m:mo>
652
+ * <m:mi>z</m:mi>
653
+ * </m:math>
654
+ * </d4p_MathML>
655
+ * \endxmlonly
656
+ * as a single ternary operation, rounding the
657
+ * result once in round-towards-zero mode.
658
+ *
659
+ * \return Returns the rounded value of
660
+ * \latexonly $x \times y + z$ \endlatexonly
661
+ * \xmlonly
662
+ * <d4p_MathML outputclass="xmlonly">
663
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
664
+ * <m:mi>x</m:mi>
665
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
666
+ * <m:mi>y</m:mi>
667
+ * <m:mo>+</m:mo>
668
+ * <m:mi>z</m:mi>
669
+ * </m:math>
670
+ * </d4p_MathML>
671
+ * \endxmlonly
672
+ * as a single operation.
673
+ * - fmaf(
674
+ * \latexonly $\pm \infty$ \endlatexonly
675
+ * \xmlonly
676
+ * <d4p_MathML outputclass="xmlonly">
677
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
678
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
679
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
680
+ * </m:math>
681
+ * </d4p_MathML>
682
+ * \endxmlonly
683
+ * ,
684
+ * \latexonly $\pm 0$ \endlatexonly
685
+ * \xmlonly
686
+ * <d4p_MathML outputclass="xmlonly">
687
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
688
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
689
+ * <m:mn>0</m:mn>
690
+ * </m:math>
691
+ * </d4p_MathML>
692
+ * \endxmlonly
693
+ * , \p z) returns NaN.
694
+ * - fmaf(
695
+ * \latexonly $\pm 0$ \endlatexonly
696
+ * \xmlonly
697
+ * <d4p_MathML outputclass="xmlonly">
698
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
699
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
700
+ * <m:mn>0</m:mn>
701
+ * </m:math>
702
+ * </d4p_MathML>
703
+ * \endxmlonly
704
+ * ,
705
+ * \latexonly $\pm \infty$ \endlatexonly
706
+ * \xmlonly
707
+ * <d4p_MathML outputclass="xmlonly">
708
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
709
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
710
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
711
+ * </m:math>
712
+ * </d4p_MathML>
713
+ * \endxmlonly
714
+ * , \p z) returns NaN.
715
+ * - fmaf(\p x, \p y,
716
+ * \latexonly $-\infty$ \endlatexonly
717
+ * \xmlonly
718
+ * <d4p_MathML outputclass="xmlonly">
719
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
720
+ * <m:mo>-</m:mo>
721
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
722
+ * </m:math>
723
+ * </d4p_MathML>
724
+ * \endxmlonly
725
+ * ) returns NaN if
726
+ * \latexonly $x \times y$ \endlatexonly
727
+ * \xmlonly
728
+ * <d4p_MathML outputclass="xmlonly">
729
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
730
+ * <m:mi>x</m:mi>
731
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
732
+ * <m:mi>y</m:mi>
733
+ * </m:math>
734
+ * </d4p_MathML>
735
+ * \endxmlonly
736
+ * is an exact
737
+ * \latexonly $+\infty$ \endlatexonly
738
+ * \xmlonly
739
+ * <d4p_MathML outputclass="xmlonly">
740
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
741
+ * <m:mo>+</m:mo>
742
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
743
+ * </m:math>
744
+ * </d4p_MathML>
745
+ * \endxmlonly
746
+ * .
747
+ * - fmaf(\p x, \p y,
748
+ * \latexonly $+\infty$ \endlatexonly
749
+ * \xmlonly
750
+ * <d4p_MathML outputclass="xmlonly">
751
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
752
+ * <m:mo>+</m:mo>
753
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
754
+ * </m:math>
755
+ * </d4p_MathML>
756
+ * \endxmlonly
757
+ * ) returns NaN if
758
+ * \latexonly $x \times y$ \endlatexonly
759
+ * \xmlonly
760
+ * <d4p_MathML outputclass="xmlonly">
761
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
762
+ * <m:mi>x</m:mi>
763
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
764
+ * <m:mi>y</m:mi>
765
+ * </m:math>
766
+ * </d4p_MathML>
767
+ * \endxmlonly
768
+ * is an exact
769
+ * \latexonly $-\infty$ \endlatexonly
770
+ * \xmlonly
771
+ * <d4p_MathML outputclass="xmlonly">
772
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
773
+ * <m:mo>-</m:mo>
774
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
775
+ * </m:math>
776
+ * </d4p_MathML>
777
+ * \endxmlonly
778
+ * .
779
+ *
780
+ * \note_accuracy_double
781
+ */
782
+ extern __device__ __device_builtin__ double __fma_rz(double x, double y, double z);
783
+ /**
784
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
785
+ * \brief Compute
786
+ * \latexonly $x \times y + z$ \endlatexonly
787
+ * \xmlonly
788
+ * <d4p_MathML outputclass="xmlonly">
789
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
790
+ * <m:mi>x</m:mi>
791
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
792
+ * <m:mi>y</m:mi>
793
+ * <m:mo>+</m:mo>
794
+ * <m:mi>z</m:mi>
795
+ * </m:math>
796
+ * </d4p_MathML>
797
+ * \endxmlonly
798
+ * as a single operation in round-up mode.
799
+ *
800
+ * Computes the value of
801
+ * \latexonly $x \times y + z$ \endlatexonly
802
+ * \xmlonly
803
+ * <d4p_MathML outputclass="xmlonly">
804
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
805
+ * <m:mi>x</m:mi>
806
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
807
+ * <m:mi>y</m:mi>
808
+ * <m:mo>+</m:mo>
809
+ * <m:mi>z</m:mi>
810
+ * </m:math>
811
+ * </d4p_MathML>
812
+ * \endxmlonly
813
+ * as a single ternary operation, rounding the
814
+ * result once in round-up (to positive infinity) mode.
815
+ *
816
+ * \return Returns the rounded value of
817
+ * \latexonly $x \times y + z$ \endlatexonly
818
+ * \xmlonly
819
+ * <d4p_MathML outputclass="xmlonly">
820
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
821
+ * <m:mi>x</m:mi>
822
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
823
+ * <m:mi>y</m:mi>
824
+ * <m:mo>+</m:mo>
825
+ * <m:mi>z</m:mi>
826
+ * </m:math>
827
+ * </d4p_MathML>
828
+ * \endxmlonly
829
+ * as a single operation.
830
+ * - fmaf(
831
+ * \latexonly $\pm \infty$ \endlatexonly
832
+ * \xmlonly
833
+ * <d4p_MathML outputclass="xmlonly">
834
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
835
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
836
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
837
+ * </m:math>
838
+ * </d4p_MathML>
839
+ * \endxmlonly
840
+ * ,
841
+ * \latexonly $\pm 0$ \endlatexonly
842
+ * \xmlonly
843
+ * <d4p_MathML outputclass="xmlonly">
844
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
845
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
846
+ * <m:mn>0</m:mn>
847
+ * </m:math>
848
+ * </d4p_MathML>
849
+ * \endxmlonly
850
+ * , \p z) returns NaN.
851
+ * - fmaf(
852
+ * \latexonly $\pm 0$ \endlatexonly
853
+ * \xmlonly
854
+ * <d4p_MathML outputclass="xmlonly">
855
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
856
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
857
+ * <m:mn>0</m:mn>
858
+ * </m:math>
859
+ * </d4p_MathML>
860
+ * \endxmlonly
861
+ * ,
862
+ * \latexonly $\pm \infty$ \endlatexonly
863
+ * \xmlonly
864
+ * <d4p_MathML outputclass="xmlonly">
865
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
866
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
867
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
868
+ * </m:math>
869
+ * </d4p_MathML>
870
+ * \endxmlonly
871
+ * , \p z) returns NaN.
872
+ * - fmaf(\p x, \p y,
873
+ * \latexonly $-\infty$ \endlatexonly
874
+ * \xmlonly
875
+ * <d4p_MathML outputclass="xmlonly">
876
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
877
+ * <m:mo>-</m:mo>
878
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
879
+ * </m:math>
880
+ * </d4p_MathML>
881
+ * \endxmlonly
882
+ * ) returns NaN if
883
+ * \latexonly $x \times y$ \endlatexonly
884
+ * \xmlonly
885
+ * <d4p_MathML outputclass="xmlonly">
886
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
887
+ * <m:mi>x</m:mi>
888
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
889
+ * <m:mi>y</m:mi>
890
+ * </m:math>
891
+ * </d4p_MathML>
892
+ * \endxmlonly
893
+ * is an exact
894
+ * \latexonly $+\infty$ \endlatexonly
895
+ * \xmlonly
896
+ * <d4p_MathML outputclass="xmlonly">
897
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
898
+ * <m:mo>+</m:mo>
899
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
900
+ * </m:math>
901
+ * </d4p_MathML>
902
+ * \endxmlonly
903
+ * .
904
+ * - fmaf(\p x, \p y,
905
+ * \latexonly $+\infty$ \endlatexonly
906
+ * \xmlonly
907
+ * <d4p_MathML outputclass="xmlonly">
908
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
909
+ * <m:mo>+</m:mo>
910
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
911
+ * </m:math>
912
+ * </d4p_MathML>
913
+ * \endxmlonly
914
+ * ) returns NaN if
915
+ * \latexonly $x \times y$ \endlatexonly
916
+ * \xmlonly
917
+ * <d4p_MathML outputclass="xmlonly">
918
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
919
+ * <m:mi>x</m:mi>
920
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
921
+ * <m:mi>y</m:mi>
922
+ * </m:math>
923
+ * </d4p_MathML>
924
+ * \endxmlonly
925
+ * is an exact
926
+ * \latexonly $-\infty$ \endlatexonly
927
+ * \xmlonly
928
+ * <d4p_MathML outputclass="xmlonly">
929
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
930
+ * <m:mo>-</m:mo>
931
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
932
+ * </m:math>
933
+ * </d4p_MathML>
934
+ * \endxmlonly
935
+ * .
936
+ *
937
+ * \note_accuracy_double
938
+ */
939
+ extern __device__ __device_builtin__ double __fma_ru(double x, double y, double z);
940
+ /**
941
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
942
+ * \brief Compute
943
+ * \latexonly $x \times y + z$ \endlatexonly
944
+ * \xmlonly
945
+ * <d4p_MathML outputclass="xmlonly">
946
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
947
+ * <m:mi>x</m:mi>
948
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
949
+ * <m:mi>y</m:mi>
950
+ * <m:mo>+</m:mo>
951
+ * <m:mi>z</m:mi>
952
+ * </m:math>
953
+ * </d4p_MathML>
954
+ * \endxmlonly
955
+ * as a single operation in round-down mode.
956
+ *
957
+ * Computes the value of
958
+ * \latexonly $x \times y + z$ \endlatexonly
959
+ * \xmlonly
960
+ * <d4p_MathML outputclass="xmlonly">
961
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
962
+ * <m:mi>x</m:mi>
963
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
964
+ * <m:mi>y</m:mi>
965
+ * <m:mo>+</m:mo>
966
+ * <m:mi>z</m:mi>
967
+ * </m:math>
968
+ * </d4p_MathML>
969
+ * \endxmlonly
970
+ * as a single ternary operation, rounding the
971
+ * result once in round-down (to negative infinity) mode.
972
+ *
973
+ * \return Returns the rounded value of
974
+ * \latexonly $x \times y + z$ \endlatexonly
975
+ * \xmlonly
976
+ * <d4p_MathML outputclass="xmlonly">
977
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
978
+ * <m:mi>x</m:mi>
979
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
980
+ * <m:mi>y</m:mi>
981
+ * <m:mo>+</m:mo>
982
+ * <m:mi>z</m:mi>
983
+ * </m:math>
984
+ * </d4p_MathML>
985
+ * \endxmlonly
986
+ * as a single operation.
987
+ * - fmaf(
988
+ * \latexonly $\pm \infty$ \endlatexonly
989
+ * \xmlonly
990
+ * <d4p_MathML outputclass="xmlonly">
991
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
992
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
993
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
994
+ * </m:math>
995
+ * </d4p_MathML>
996
+ * \endxmlonly
997
+ * ,
998
+ * \latexonly $\pm 0$ \endlatexonly
999
+ * \xmlonly
1000
+ * <d4p_MathML outputclass="xmlonly">
1001
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1002
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
1003
+ * <m:mn>0</m:mn>
1004
+ * </m:math>
1005
+ * </d4p_MathML>
1006
+ * \endxmlonly
1007
+ * , \p z) returns NaN.
1008
+ * - fmaf(
1009
+ * \latexonly $\pm 0$ \endlatexonly
1010
+ * \xmlonly
1011
+ * <d4p_MathML outputclass="xmlonly">
1012
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1013
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
1014
+ * <m:mn>0</m:mn>
1015
+ * </m:math>
1016
+ * </d4p_MathML>
1017
+ * \endxmlonly
1018
+ * ,
1019
+ * \latexonly $\pm \infty$ \endlatexonly
1020
+ * \xmlonly
1021
+ * <d4p_MathML outputclass="xmlonly">
1022
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1023
+ * <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
1024
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1025
+ * </m:math>
1026
+ * </d4p_MathML>
1027
+ * \endxmlonly
1028
+ * , \p z) returns NaN.
1029
+ * - fmaf(\p x, \p y,
1030
+ * \latexonly $-\infty$ \endlatexonly
1031
+ * \xmlonly
1032
+ * <d4p_MathML outputclass="xmlonly">
1033
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1034
+ * <m:mo>-</m:mo>
1035
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1036
+ * </m:math>
1037
+ * </d4p_MathML>
1038
+ * \endxmlonly
1039
+ * ) returns NaN if
1040
+ * \latexonly $x \times y$ \endlatexonly
1041
+ * \xmlonly
1042
+ * <d4p_MathML outputclass="xmlonly">
1043
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1044
+ * <m:mi>x</m:mi>
1045
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
1046
+ * <m:mi>y</m:mi>
1047
+ * </m:math>
1048
+ * </d4p_MathML>
1049
+ * \endxmlonly
1050
+ * is an exact
1051
+ * \latexonly $+\infty$ \endlatexonly
1052
+ * \xmlonly
1053
+ * <d4p_MathML outputclass="xmlonly">
1054
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1055
+ * <m:mo>+</m:mo>
1056
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1057
+ * </m:math>
1058
+ * </d4p_MathML>
1059
+ * \endxmlonly
1060
+ * .
1061
+ * - fmaf(\p x, \p y,
1062
+ * \latexonly $+\infty$ \endlatexonly
1063
+ * \xmlonly
1064
+ * <d4p_MathML outputclass="xmlonly">
1065
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1066
+ * <m:mo>+</m:mo>
1067
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1068
+ * </m:math>
1069
+ * </d4p_MathML>
1070
+ * \endxmlonly
1071
+ * ) returns NaN if
1072
+ * \latexonly $x \times y$ \endlatexonly
1073
+ * \xmlonly
1074
+ * <d4p_MathML outputclass="xmlonly">
1075
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1076
+ * <m:mi>x</m:mi>
1077
+ * <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
1078
+ * <m:mi>y</m:mi>
1079
+ * </m:math>
1080
+ * </d4p_MathML>
1081
+ * \endxmlonly
1082
+ * is an exact
1083
+ * \latexonly $-\infty$ \endlatexonly
1084
+ * \xmlonly
1085
+ * <d4p_MathML outputclass="xmlonly">
1086
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
1087
+ * <m:mo>-</m:mo>
1088
+ * <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
1089
+ * </m:math>
1090
+ * </d4p_MathML>
1091
+ * \endxmlonly
1092
+ * .
1093
+ *
1094
+ * \note_accuracy_double
1095
+ */
1096
+ extern __device__ __device_builtin__ double __fma_rd(double x, double y, double z);
1097
+ /**
1098
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1099
+ * \brief Add two floating-point values in round-to-nearest-even mode.
1100
+ *
1101
+ * Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
1102
+ *
1103
+ * \return Returns \p x + \p y.
1104
+ *
1105
+ * \note_accuracy_double
1106
+ * \note_nofma
1107
+ */
1108
+ extern __device__ __device_builtin__ double __dadd_rn(double x, double y);
1109
+ /**
1110
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1111
+ * \brief Add two floating-point values in round-towards-zero mode.
1112
+ *
1113
+ * Adds two floating-point values \p x and \p y in round-towards-zero mode.
1114
+ *
1115
+ * \return Returns \p x + \p y.
1116
+ *
1117
+ * \note_accuracy_double
1118
+ * \note_nofma
1119
+ */
1120
+ extern __device__ __device_builtin__ double __dadd_rz(double x, double y);
1121
+ /**
1122
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1123
+ * \brief Add two floating-point values in round-up mode.
1124
+ *
1125
+ * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
1126
+ *
1127
+ * \return Returns \p x + \p y.
1128
+ *
1129
+ * \note_accuracy_double
1130
+ * \note_nofma
1131
+ */
1132
+ extern __device__ __device_builtin__ double __dadd_ru(double x, double y);
1133
+ /**
1134
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1135
+ * \brief Add two floating-point values in round-down mode.
1136
+ *
1137
+ * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
1138
+ *
1139
+ * \return Returns \p x + \p y.
1140
+ *
1141
+ * \note_accuracy_double
1142
+ * \note_nofma
1143
+ */
1144
+ extern __device__ __device_builtin__ double __dadd_rd(double x, double y);
1145
+ /**
1146
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1147
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
1148
+ *
1149
+ * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
1150
+ *
1151
+ * \return Returns \p x - \p y.
1152
+ *
1153
+ * \note_accuracy_double
1154
+ * \note_nofma
1155
+ */
1156
+ extern __device__ __device_builtin__ double __dsub_rn(double x, double y);
1157
+ /**
1158
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1159
+ * \brief Subtract two floating-point values in round-towards-zero mode.
1160
+ *
1161
+ * Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
1162
+ *
1163
+ * \return Returns \p x - \p y.
1164
+ *
1165
+ * \note_accuracy_double
1166
+ * \note_nofma
1167
+ */
1168
+ extern __device__ __device_builtin__ double __dsub_rz(double x, double y);
1169
+ /**
1170
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1171
+ * \brief Subtract two floating-point values in round-up mode.
1172
+ *
1173
+ * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
1174
+ *
1175
+ * \return Returns \p x - \p y.
1176
+ *
1177
+ * \note_accuracy_double
1178
+ * \note_nofma
1179
+ */
1180
+ extern __device__ __device_builtin__ double __dsub_ru(double x, double y);
1181
+ /**
1182
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1183
+ * \brief Subtract two floating-point values in round-down mode.
1184
+ *
1185
+ * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
1186
+ *
1187
+ * \return Returns \p x - \p y.
1188
+ *
1189
+ * \note_accuracy_double
1190
+ * \note_nofma
1191
+ */
1192
+ extern __device__ __device_builtin__ double __dsub_rd(double x, double y);
1193
+ /**
1194
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1195
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
1196
+ *
1197
+ * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
1198
+ *
1199
+ * \return Returns \p x * \p y.
1200
+ *
1201
+ * \note_accuracy_double
1202
+ * \note_nofma
1203
+ */
1204
+ extern __device__ __device_builtin__ double __dmul_rn(double x, double y);
1205
+ /**
1206
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1207
+ * \brief Multiply two floating-point values in round-towards-zero mode.
1208
+ *
1209
+ * Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
1210
+ *
1211
+ * \return Returns \p x * \p y.
1212
+ *
1213
+ * \note_accuracy_double
1214
+ * \note_nofma
1215
+ */
1216
+ extern __device__ __device_builtin__ double __dmul_rz(double x, double y);
1217
+ /**
1218
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1219
+ * \brief Multiply two floating-point values in round-up mode.
1220
+ *
1221
+ * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
1222
+ *
1223
+ * \return Returns \p x * \p y.
1224
+ *
1225
+ * \note_accuracy_double
1226
+ * \note_nofma
1227
+ */
1228
+ extern __device__ __device_builtin__ double __dmul_ru(double x, double y);
1229
+ /**
1230
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
1231
+ * \brief Multiply two floating-point values in round-down mode.
1232
+ *
1233
+ * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
1234
+ *
1235
+ * \return Returns \p x * \p y.
1236
+ *
1237
+ * \note_accuracy_double
1238
+ * \note_nofma
1239
+ */
1240
+ extern __device__ __device_builtin__ double __dmul_rd(double x, double y);
1241
+ /**
1242
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1243
+ * \brief Convert a double to a float in round-to-nearest-even mode.
1244
+ *
1245
+ * Convert the double-precision floating-point value \p x to a single-precision
1246
+ * floating-point value in round-to-nearest-even mode.
1247
+ * \return Returns converted value.
1248
+ */
1249
+ extern __device__ __device_builtin__ float __double2float_rn(double x);
1250
+ /**
1251
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1252
+ * \brief Convert a double to a float in round-towards-zero mode.
1253
+ *
1254
+ * Convert the double-precision floating-point value \p x to a single-precision
1255
+ * floating-point value in round-towards-zero mode.
1256
+ * \return Returns converted value.
1257
+ */
1258
+ extern __device__ __device_builtin__ float __double2float_rz(double x);
1259
+ /**
1260
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1261
+ * \brief Convert a double to a float in round-up mode.
1262
+ *
1263
+ * Convert the double-precision floating-point value \p x to a single-precision
1264
+ * floating-point value in round-up (to positive infinity) mode.
1265
+ * \return Returns converted value.
1266
+ */
1267
+ extern __device__ __device_builtin__ float __double2float_ru(double x);
1268
+ /**
1269
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1270
+ * \brief Convert a double to a float in round-down mode.
1271
+ *
1272
+ * Convert the double-precision floating-point value \p x to a single-precision
1273
+ * floating-point value in round-down (to negative infinity) mode.
1274
+ * \return Returns converted value.
1275
+ */
1276
+ extern __device__ __device_builtin__ float __double2float_rd(double x);
1277
+ /**
1278
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1279
+ * \brief Convert a double to a signed int in round-to-nearest-even mode.
1280
+ *
1281
+ * Convert the double-precision floating-point value \p x to a
1282
+ * signed integer value in round-to-nearest-even mode.
1283
+ * \return Returns converted value.
1284
+ */
1285
+ extern __device__ __device_builtin__ int __double2int_rn(double x);
1286
+ /**
1287
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1288
+ * \brief Convert a double to a signed int in round-up mode.
1289
+ *
1290
+ * Convert the double-precision floating-point value \p x to a
1291
+ * signed integer value in round-up (to positive infinity) mode.
1292
+ * \return Returns converted value.
1293
+ */
1294
+ extern __device__ __device_builtin__ int __double2int_ru(double x);
1295
+ /**
1296
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1297
+ * \brief Convert a double to a signed int in round-down mode.
1298
+ *
1299
+ * Convert the double-precision floating-point value \p x to a
1300
+ * signed integer value in round-down (to negative infinity) mode.
1301
+ * \return Returns converted value.
1302
+ */
1303
+ extern __device__ __device_builtin__ int __double2int_rd(double x);
1304
+ /**
1305
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1306
+ * \brief Convert a double to an unsigned int in round-to-nearest-even mode.
1307
+ *
1308
+ * Convert the double-precision floating-point value \p x to an
1309
+ * unsigned integer value in round-to-nearest-even mode.
1310
+ * \return Returns converted value.
1311
+ */
1312
+ extern __device__ __device_builtin__ unsigned int __double2uint_rn(double x);
1313
+ /**
1314
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1315
+ * \brief Convert a double to an unsigned int in round-up mode.
1316
+ *
1317
+ * Convert the double-precision floating-point value \p x to an
1318
+ * unsigned integer value in round-up (to positive infinity) mode.
1319
+ * \return Returns converted value.
1320
+ */
1321
+ extern __device__ __device_builtin__ unsigned int __double2uint_ru(double x);
1322
+ /**
1323
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1324
+ * \brief Convert a double to an unsigned int in round-down mode.
1325
+ *
1326
+ * Convert the double-precision floating-point value \p x to an
1327
+ * unsigned integer value in round-down (to negative infinity) mode.
1328
+ * \return Returns converted value.
1329
+ */
1330
+ extern __device__ __device_builtin__ unsigned int __double2uint_rd(double x);
1331
+ /**
1332
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1333
+ * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
1334
+ *
1335
+ * Convert the double-precision floating-point value \p x to a
1336
+ * signed 64-bit integer value in round-to-nearest-even mode.
1337
+ * \return Returns converted value.
1338
+ */
1339
+ extern __device__ __device_builtin__ long long int __double2ll_rn(double x);
1340
+ /**
1341
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1342
+ * \brief Convert a double to a signed 64-bit int in round-up mode.
1343
+ *
1344
+ * Convert the double-precision floating-point value \p x to a
1345
+ * signed 64-bit integer value in round-up (to positive infinity) mode.
1346
+ * \return Returns converted value.
1347
+ */
1348
+ extern __device__ __device_builtin__ long long int __double2ll_ru(double x);
1349
+ /**
1350
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1351
+ * \brief Convert a double to a signed 64-bit int in round-down mode.
1352
+ *
1353
+ * Convert the double-precision floating-point value \p x to a
1354
+ * signed 64-bit integer value in round-down (to negative infinity) mode.
1355
+ * \return Returns converted value.
1356
+ */
1357
+ extern __device__ __device_builtin__ long long int __double2ll_rd(double x);
1358
+ /**
1359
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1360
+ * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
1361
+ *
1362
+ * Convert the double-precision floating-point value \p x to an
1363
+ * unsigned 64-bit integer value in round-to-nearest-even mode.
1364
+ * \return Returns converted value.
1365
+ */
1366
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
1367
+ /**
1368
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1369
+ * \brief Convert a double to an unsigned 64-bit int in round-up mode.
1370
+ *
1371
+ * Convert the double-precision floating-point value \p x to an
1372
+ * unsigned 64-bit integer value in round-up (to positive infinity) mode.
1373
+ * \return Returns converted value.
1374
+ */
1375
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
1376
+ /**
1377
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1378
+ * \brief Convert a double to an unsigned 64-bit int in round-down mode.
1379
+ *
1380
+ * Convert the double-precision floating-point value \p x to an
1381
+ * unsigned 64-bit integer value in round-down (to negative infinity) mode.
1382
+ * \return Returns converted value.
1383
+ */
1384
+ extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
1385
+ /**
1386
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1387
+ * \brief Convert a signed int to a double.
1388
+ *
1389
+ * Convert the signed integer value \p x to a double-precision floating-point value.
1390
+ * \return Returns converted value.
1391
+ */
1392
+ extern __device__ __device_builtin__ double __int2double_rn(int x);
1393
+ /**
1394
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1395
+ * \brief Convert an unsigned int to a double.
1396
+ *
1397
+ * Convert the unsigned integer value \p x to a double-precision floating-point value.
1398
+ * \return Returns converted value.
1399
+ */
1400
+ extern __device__ __device_builtin__ double __uint2double_rn(unsigned int x);
1401
+ /**
1402
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1403
+ * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
1404
+ *
1405
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1406
+ * value in round-to-nearest-even mode.
1407
+ * \return Returns converted value.
1408
+ */
1409
+ extern __device__ __device_builtin__ double __ll2double_rn(long long int x);
1410
+ /**
1411
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1412
+ * \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
1413
+ *
1414
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1415
+ * value in round-towards-zero mode.
1416
+ * \return Returns converted value.
1417
+ */
1418
+ extern __device__ __device_builtin__ double __ll2double_rz(long long int x);
1419
+ /**
1420
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1421
+ * \brief Convert a signed 64-bit int to a double in round-up mode.
1422
+ *
1423
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1424
+ * value in round-up (to positive infinity) mode.
1425
+ * \return Returns converted value.
1426
+ */
1427
+ extern __device__ __device_builtin__ double __ll2double_ru(long long int x);
1428
+ /**
1429
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1430
+ * \brief Convert a signed 64-bit int to a double in round-down mode.
1431
+ *
1432
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
1433
+ * value in round-down (to negative infinity) mode.
1434
+ * \return Returns converted value.
1435
+ */
1436
+ extern __device__ __device_builtin__ double __ll2double_rd(long long int x);
1437
+ /**
1438
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1439
+ * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
1440
+ *
1441
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1442
+ * value in round-to-nearest-even mode.
1443
+ * \return Returns converted value.
1444
+ */
1445
+ extern __device__ __device_builtin__ double __ull2double_rn(unsigned long long int x);
1446
+ /**
1447
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1448
+ * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
1449
+ *
1450
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1451
+ * value in round-towards-zero mode.
1452
+ * \return Returns converted value.
1453
+ */
1454
+ extern __device__ __device_builtin__ double __ull2double_rz(unsigned long long int x);
1455
+ /**
1456
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1457
+ * \brief Convert an unsigned 64-bit int to a double in round-up mode.
1458
+ *
1459
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1460
+ * value in round-up (to positive infinity) mode.
1461
+ * \return Returns converted value.
1462
+ */
1463
+ extern __device__ __device_builtin__ double __ull2double_ru(unsigned long long int x);
1464
+ /**
1465
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1466
+ * \brief Convert an unsigned 64-bit int to a double in round-down mode.
1467
+ *
1468
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
1469
+ * value in round-down (to negative infinity) mode.
1470
+ * \return Returns converted value.
1471
+ */
1472
+ extern __device__ __device_builtin__ double __ull2double_rd(unsigned long long int x);
1473
+ /**
1474
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1475
+ * \brief Reinterpret high 32 bits in a double as a signed integer.
1476
+ *
1477
+ * Reinterpret the high 32 bits in the double-precision floating-point value \p x
1478
+ * as a signed integer.
1479
+ * \return Returns reinterpreted value.
1480
+ */
1481
+ extern __device__ __device_builtin__ int __double2hiint(double x);
1482
+ /**
1483
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1484
+ * \brief Reinterpret low 32 bits in a double as a signed integer.
1485
+ *
1486
+ * Reinterpret the low 32 bits in the double-precision floating-point value \p x
1487
+ * as a signed integer.
1488
+ * \return Returns reinterpreted value.
1489
+ */
1490
+ extern __device__ __device_builtin__ int __double2loint(double x);
1491
+ /**
1492
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
1493
+ * \brief Reinterpret high and low 32-bit integer values as a double.
1494
+ *
1495
+ * Reinterpret the integer value of \p hi as the high 32 bits of a
1496
+ * double-precision floating-point value and the integer value of \p lo
1497
+ * as the low 32 bits of the same double-precision floating-point value.
1498
+ * \return Returns reinterpreted value.
1499
+ */
1500
+ extern __device__ __device_builtin__ double __hiloint2double(int hi, int lo);
1501
+
1502
+
1503
+ }
1504
+
1505
+ /*******************************************************************************
1506
+ * *
1507
+ * *
1508
+ * *
1509
+ *******************************************************************************/
1510
+ __SM_20_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int ballot(bool pred) __DEF_IF_HOST
1511
+
1512
+ __SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) __DEF_IF_HOST
1513
+
1514
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) __DEF_IF_HOST
1515
+
1516
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) __DEF_IF_HOST
1517
+
1518
+ #undef __DEPRECATED__
1519
+ #undef __WSB_DEPRECATION_MESSAGE
1520
+
1521
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) __DEF_IF_HOST
1522
+ __SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) __DEF_IF_HOST
1523
+ __SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) __DEF_IF_HOST
1524
+ __SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) __DEF_IF_HOST
1525
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
1526
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) __DEF_IF_HOST
1527
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
1528
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *ptr) __DEF_IF_HOST
1529
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *ptr) __DEF_IF_HOST
1530
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *ptr) __DEF_IF_HOST
1531
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *ptr) __DEF_IF_HOST
1532
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
1533
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) __DEF_IF_HOST
1534
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
1535
+
1536
+ __SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) __DEF_IF_HOST
1537
+ __SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) __DEF_IF_HOST
1538
+ __SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) __DEF_IF_HOST
1539
+ __SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) __DEF_IF_HOST
1540
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
1541
+ __SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) __DEF_IF_HOST
1542
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
1543
+ #endif /* __cplusplus && __CUDACC__ */
1544
+
1545
+ #undef __DEF_IF_HOST
1546
+ #undef __SM_20_INTRINSICS_DECL__
1547
+
1548
+ #if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
1549
+ #include "sm_20_intrinsics.hpp"
1550
+ #endif /* !__CUDACC_RTC__ */
1551
+ #endif /* !__SM_20_INTRINSICS_H__ && defined(__CUDA_ARCH__) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_intrinsics.hpp ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_INTRINSICS_HPP__)
51
+ #define __SM_20_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_20_INTRINSICS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ __SM_20_INTRINSICS_DECL__ unsigned int ballot(bool pred)
76
+ {
77
+ return __ballot((int)pred);
78
+ }
79
+
80
+ __SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred)
81
+ {
82
+ return __syncthreads_count((int)pred);
83
+ }
84
+
85
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred)
86
+ {
87
+ return (bool)__syncthreads_and((int)pred);
88
+ }
89
+
90
+ __SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred)
91
+ {
92
+ return (bool)__syncthreads_or((int)pred);
93
+ }
94
+
95
+
96
+ extern "C" {
97
+ __device__ unsigned __nv_isGlobal_impl(const void *);
98
+ __device__ unsigned __nv_isShared_impl(const void *);
99
+ __device__ unsigned __nv_isConstant_impl(const void *);
100
+ __device__ unsigned __nv_isLocal_impl(const void *);
101
+ __device__ unsigned __nv_isGridConstant_impl(const void *);
102
+ }
103
+
104
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr)
105
+ {
106
+ return __nv_isGlobal_impl(ptr);
107
+ }
108
+
109
+ __SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr)
110
+ {
111
+ return __nv_isShared_impl(ptr);
112
+ }
113
+
114
+ __SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr)
115
+ {
116
+ return __nv_isConstant_impl(ptr);
117
+ }
118
+
119
+ __SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr)
120
+ {
121
+ return __nv_isLocal_impl(ptr);
122
+ }
123
+
124
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
125
+ __SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr)
126
+ {
127
+ return __nv_isGridConstant_impl(ptr);
128
+ }
129
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
130
+
131
+ extern "C" {
132
+ __device__ size_t __nv_cvta_generic_to_global_impl(const void *);
133
+ __device__ size_t __nv_cvta_generic_to_shared_impl(const void *);
134
+ __device__ size_t __nv_cvta_generic_to_constant_impl(const void *);
135
+ __device__ size_t __nv_cvta_generic_to_local_impl(const void *);
136
+ __device__ void * __nv_cvta_global_to_generic_impl(size_t);
137
+ __device__ void * __nv_cvta_shared_to_generic_impl(size_t);
138
+ __device__ void * __nv_cvta_constant_to_generic_impl(size_t);
139
+ __device__ void * __nv_cvta_local_to_generic_impl(size_t);
140
+ }
141
+
142
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *p)
143
+ {
144
+ return __nv_cvta_generic_to_global_impl(p);
145
+ }
146
+
147
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *p)
148
+ {
149
+ return __nv_cvta_generic_to_shared_impl(p);
150
+ }
151
+
152
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *p)
153
+ {
154
+ return __nv_cvta_generic_to_constant_impl(p);
155
+ }
156
+
157
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *p)
158
+ {
159
+ return __nv_cvta_generic_to_local_impl(p);
160
+ }
161
+
162
+ __SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits)
163
+ {
164
+ return __nv_cvta_global_to_generic_impl(rawbits);
165
+ }
166
+
167
+ __SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits)
168
+ {
169
+ return __nv_cvta_shared_to_generic_impl(rawbits);
170
+ }
171
+
172
+ __SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits)
173
+ {
174
+ return __nv_cvta_constant_to_generic_impl(rawbits);
175
+ }
176
+
177
+ __SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits)
178
+ {
179
+ return __nv_cvta_local_to_generic_impl(rawbits);
180
+ }
181
+
182
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
183
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
184
+ #define __CVTA_PTR_64 1
185
+ #endif
186
+
187
+ __SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr)
188
+ {
189
+ #if __CVTA_PTR_64
190
+ unsigned long long ret;
191
+ asm("cvta.to.param.u64 %0, %1;" : "=l"(ret) : "l"(ptr));
192
+ #else /* !__CVTA_PTR_64 */
193
+ unsigned ret;
194
+ asm("cvta.to.param.u32 %0, %1;" : "=r"(ret) : "r"(ptr));
195
+ #endif /* __CVTA_PTR_64 */
196
+ return (size_t)ret;
197
+
198
+ }
199
+
200
+ __SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits)
201
+ {
202
+ void *ret;
203
+ #if __CVTA_PTR_64
204
+ unsigned long long in = rawbits;
205
+ asm("cvta.param.u64 %0, %1;" : "=l"(ret) : "l"(in));
206
+ #else /* !__CVTA_PTR_64 */
207
+ unsigned in = rawbits;
208
+ asm("cvta.param.u32 %0, %1;" : "=r"(ret) : "r"(in));
209
+ #endif /* __CVTA_PTR_64 */
210
+ return ret;
211
+ }
212
+ #undef __CVTA_PTR_64
213
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
214
+
215
+
216
+ #endif /* __cplusplus && __CUDACC__ */
217
+
218
+ #undef __SM_20_INTRINSICS_DECL__
219
+
220
+ #endif /* !__SM_20_INTRINSICS_HPP__ */
221
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.h ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_30_INTRINSICS_H__)
51
+ #define __SM_30_INTRINSICS_H__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_30_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ #ifndef __CUDA_ARCH__
72
+ #define __DEF_IF_HOST { }
73
+ #else /* !__CUDA_ARCH__ */
74
+ #define __DEF_IF_HOST ;
75
+ #endif /* __CUDA_ARCH__ */
76
+
77
+
78
+ /*******************************************************************************
79
+ * *
80
+ * Below are declarations of SM-3.0 intrinsics which are included as *
81
+ * source (instead of being built in to the compiler) *
82
+ * *
83
+ *******************************************************************************/
84
+
85
+ #if !defined warpSize && !defined __local_warpSize
86
+ #define warpSize 32
87
+ #define __local_warpSize
88
+ #endif
89
+
90
+ #if defined(_WIN32)
91
+ # define __DEPRECATED__(msg) __declspec(deprecated(msg))
92
+ #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
93
+ # define __DEPRECATED__(msg) __attribute__((deprecated))
94
+ #else
95
+ # define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
96
+ #endif
97
+
98
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
99
+ #define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
100
+ #endif
101
+
102
+ __SM_30_INTRINSICS_DECL__ unsigned __fns(unsigned mask, unsigned base, int offset) __DEF_IF_HOST
103
+ __SM_30_INTRINSICS_DECL__ void __barrier_sync(unsigned id) __DEF_IF_HOST
104
+ __SM_30_INTRINSICS_DECL__ void __barrier_sync_count(unsigned id, unsigned cnt) __DEF_IF_HOST
105
+ __SM_30_INTRINSICS_DECL__ void __syncwarp(unsigned mask=0xFFFFFFFF) __DEF_IF_HOST
106
+ __SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) __DEF_IF_HOST
107
+ __SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) __DEF_IF_HOST
108
+ __SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) __DEF_IF_HOST
109
+ __SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) __DEF_IF_HOST
110
+ __SM_30_INTRINSICS_DECL__ unsigned __activemask() __DEF_IF_HOST
111
+
112
+ // Warp register exchange (shuffle) intrinsics.
113
+ // Notes:
114
+ // a) Warp size is hardcoded to 32 here, because the compiler does not know
115
+ // the "warpSize" constant at this time
116
+ // b) we cannot map the float __shfl to the int __shfl because it'll mess with
117
+ // the register number (especially if you're doing two shfls to move a double).
118
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
119
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) int __shfl(int var, int srcLane, int width=warpSize) __DEF_IF_HOST
120
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned int __shfl(unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
121
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) int __shfl_up(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
122
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned int __shfl_up(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
123
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) int __shfl_down(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
124
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned int __shfl_down(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
125
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) int __shfl_xor(int var, int laneMask, int width=warpSize) __DEF_IF_HOST
126
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned int __shfl_xor(unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
127
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) float __shfl(float var, int srcLane, int width=warpSize) __DEF_IF_HOST
128
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) float __shfl_up(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
129
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) float __shfl_down(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
130
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) float __shfl_xor(float var, int laneMask, int width=warpSize) __DEF_IF_HOST
131
+ #endif
132
+
133
+ __SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width=warpSize) __DEF_IF_HOST
134
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
135
+ __SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
136
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
137
+ __SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
138
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
139
+ __SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width=warpSize) __DEF_IF_HOST
140
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
141
+ __SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width=warpSize) __DEF_IF_HOST
142
+ __SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
143
+ __SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
144
+ __SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width=warpSize) __DEF_IF_HOST
145
+
146
+ // 64-bits SHFL
147
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
148
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long long __shfl(unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
149
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long long __shfl(long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
150
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long long __shfl_up(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
151
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
152
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long long __shfl_down(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
153
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
154
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long long __shfl_xor(long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
155
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
156
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) double __shfl(double var, int srcLane, int width=warpSize) __DEF_IF_HOST
157
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) double __shfl_up(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
158
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) double __shfl_down(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
159
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) double __shfl_xor(double var, int laneMask, int width=warpSize) __DEF_IF_HOST
160
+ #endif
161
+
162
+ __SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
163
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
164
+ __SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
165
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
166
+ __SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
167
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
168
+ __SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
169
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
170
+ __SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width=warpSize) __DEF_IF_HOST
171
+ __SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
172
+ __SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
173
+ __SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width=warpSize) __DEF_IF_HOST
174
+
175
+ // long needs some help to choose between 32-bits and 64-bits
176
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
177
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long __shfl(long var, int srcLane, int width=warpSize) __DEF_IF_HOST
178
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long __shfl(unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
179
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long __shfl_up(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
180
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long __shfl_up(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
181
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long __shfl_down(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
182
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long __shfl_down(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
183
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long __shfl_xor(long var, int laneMask, int width=warpSize) __DEF_IF_HOST
184
+ __SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long __shfl_xor(unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
185
+ #endif
186
+
187
+ __SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width=warpSize) __DEF_IF_HOST
188
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
189
+ __SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
190
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
191
+ __SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
192
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
193
+ __SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width=warpSize) __DEF_IF_HOST
194
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
195
+
196
+ #undef __DEPRECATED__
197
+ #undef __WSB_DEPRECATION_MESSAGE
198
+
199
+ #if defined(__local_warpSize)
200
+ #undef warpSize
201
+ #undef __local_warpSize
202
+ #endif
203
+
204
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
205
+
206
+ #endif /* __cplusplus && __CUDACC__ */
207
+
208
+ #undef __DEF_IF_HOST
209
+ #undef __SM_30_INTRINSICS_DECL__
210
+
211
+ #if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
212
+ #include "sm_30_intrinsics.hpp"
213
+ #endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
214
+
215
+ #endif /* !__SM_30_INTRINSICS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_30_intrinsics.hpp ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_30_INTRINSICS_HPP__)
51
+ #define __SM_30_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_30_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ // In here are intrinsics which are built in to the compiler. These may be
72
+ // referenced by intrinsic implementations from this file.
73
+ extern "C"
74
+ {
75
+ }
76
+
77
+ /*******************************************************************************
78
+ * *
79
+ * Below are implementations of SM-3.0 intrinsics which are included as *
80
+ * source (instead of being built in to the compiler) *
81
+ * *
82
+ *******************************************************************************/
83
+
84
+ #if !defined warpSize && !defined __local_warpSize
85
+ #define warpSize 32
86
+ #define __local_warpSize
87
+ #endif
88
+
89
+ __SM_30_INTRINSICS_DECL__
90
+ unsigned __fns(unsigned mask, unsigned base, int offset) {
91
+ extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset);
92
+ return __nvvm_fns(mask, base, offset);
93
+ }
94
+
95
+ __SM_30_INTRINSICS_DECL__
96
+ void __barrier_sync(unsigned id) {
97
+ extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id);
98
+ return __nvvm_barrier_sync(id);
99
+ }
100
+
101
+ __SM_30_INTRINSICS_DECL__
102
+ void __barrier_sync_count(unsigned id, unsigned cnt) {
103
+ extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt);
104
+ return __nvvm_barrier_sync_cnt(id, cnt);
105
+ }
106
+
107
+ __SM_30_INTRINSICS_DECL__
108
+ void __syncwarp(unsigned mask) {
109
+ extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask);
110
+ return __nvvm_bar_warp_sync(mask);
111
+ }
112
+
113
+ __SM_30_INTRINSICS_DECL__
114
+ int __all_sync(unsigned mask, int pred) {
115
+ extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred);
116
+ return __nvvm_vote_all_sync(mask, pred);
117
+ }
118
+
119
+ __SM_30_INTRINSICS_DECL__
120
+ int __any_sync(unsigned mask, int pred) {
121
+ extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred);
122
+ return __nvvm_vote_any_sync(mask, pred);
123
+ }
124
+
125
+ __SM_30_INTRINSICS_DECL__
126
+ int __uni_sync(unsigned mask, int pred) {
127
+ extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred);
128
+ return __nvvm_vote_uni_sync(mask, pred);
129
+ }
130
+
131
+ __SM_30_INTRINSICS_DECL__
132
+ unsigned __ballot_sync(unsigned mask, int pred) {
133
+ extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred);
134
+ return __nvvm_vote_ballot_sync(mask, pred);
135
+ }
136
+
137
+ __SM_30_INTRINSICS_DECL__
138
+ unsigned __activemask() {
139
+ unsigned ret;
140
+ asm volatile ("activemask.b32 %0;" : "=r"(ret));
141
+ return ret;
142
+ }
143
+
144
+ // These are removed starting with compute_70 and onwards
145
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
146
+
147
+ __SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) {
148
+ int ret;
149
+ int c = ((warpSize-width) << 8) | 0x1f;
150
+ asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(srcLane), "r"(c));
151
+ return ret;
152
+ }
153
+
154
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) {
155
+ return (unsigned int) __shfl((int)var, srcLane, width);
156
+ }
157
+
158
+ __SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) {
159
+ int ret;
160
+ int c = (warpSize-width) << 8;
161
+ asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
162
+ return ret;
163
+ }
164
+
165
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) {
166
+ return (unsigned int) __shfl_up((int)var, delta, width);
167
+ }
168
+
169
+ __SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) {
170
+ int ret;
171
+ int c = ((warpSize-width) << 8) | 0x1f;
172
+ asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
173
+ return ret;
174
+ }
175
+
176
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) {
177
+ return (unsigned int) __shfl_down((int)var, delta, width);
178
+ }
179
+
180
+ __SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) {
181
+ int ret;
182
+ int c = ((warpSize-width) << 8) | 0x1f;
183
+ asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(laneMask), "r"(c));
184
+ return ret;
185
+ }
186
+
187
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) {
188
+ return (unsigned int) __shfl_xor((int)var, laneMask, width);
189
+ }
190
+
191
+ __SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) {
192
+ float ret;
193
+ int c;
194
+ c = ((warpSize-width) << 8) | 0x1f;
195
+ asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
196
+ return ret;
197
+ }
198
+
199
+ __SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) {
200
+ float ret;
201
+ int c;
202
+ c = (warpSize-width) << 8;
203
+ asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
204
+ return ret;
205
+ }
206
+
207
+ __SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) {
208
+ float ret;
209
+ int c;
210
+ c = ((warpSize-width) << 8) | 0x1f;
211
+ asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
212
+ return ret;
213
+ }
214
+
215
+ __SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) {
216
+ float ret;
217
+ int c;
218
+ c = ((warpSize-width) << 8) | 0x1f;
219
+ asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
220
+ return ret;
221
+ }
222
+
223
+ // 64-bits SHFL
224
+
225
+ __SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) {
226
+ int lo, hi;
227
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
228
+ hi = __shfl(hi, srcLane, width);
229
+ lo = __shfl(lo, srcLane, width);
230
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
231
+ return var;
232
+ }
233
+
234
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) {
235
+ return (unsigned long long) __shfl((long long) var, srcLane, width);
236
+ }
237
+
238
+ __SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) {
239
+ int lo, hi;
240
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
241
+ hi = __shfl_up(hi, delta, width);
242
+ lo = __shfl_up(lo, delta, width);
243
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
244
+ return var;
245
+ }
246
+
247
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) {
248
+ return (unsigned long long) __shfl_up((long long) var, delta, width);
249
+ }
250
+
251
+ __SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) {
252
+ int lo, hi;
253
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
254
+ hi = __shfl_down(hi, delta, width);
255
+ lo = __shfl_down(lo, delta, width);
256
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
257
+ return var;
258
+ }
259
+
260
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) {
261
+ return (unsigned long long) __shfl_down((long long) var, delta, width);
262
+ }
263
+
264
+ __SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) {
265
+ int lo, hi;
266
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
267
+ hi = __shfl_xor(hi, laneMask, width);
268
+ lo = __shfl_xor(lo, laneMask, width);
269
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
270
+ return var;
271
+ }
272
+
273
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) {
274
+ return (unsigned long long) __shfl_xor((long long) var, laneMask, width);
275
+ }
276
+
277
+ __SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) {
278
+ unsigned lo, hi;
279
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
280
+ hi = __shfl(hi, srcLane, width);
281
+ lo = __shfl(lo, srcLane, width);
282
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
283
+ return var;
284
+ }
285
+
286
+ __SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) {
287
+ unsigned lo, hi;
288
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
289
+ hi = __shfl_up(hi, delta, width);
290
+ lo = __shfl_up(lo, delta, width);
291
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
292
+ return var;
293
+ }
294
+
295
+ __SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) {
296
+ unsigned lo, hi;
297
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
298
+ hi = __shfl_down(hi, delta, width);
299
+ lo = __shfl_down(lo, delta, width);
300
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
301
+ return var;
302
+ }
303
+
304
+ __SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) {
305
+ unsigned lo, hi;
306
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
307
+ hi = __shfl_xor(hi, laneMask, width);
308
+ lo = __shfl_xor(lo, laneMask, width);
309
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
310
+ return var;
311
+ }
312
+
313
+ __SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) {
314
+ return (sizeof(long) == sizeof(long long)) ?
315
+ __shfl((long long) var, srcLane, width) :
316
+ __shfl((int) var, srcLane, width);
317
+ }
318
+
319
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) {
320
+ return (sizeof(long) == sizeof(long long)) ?
321
+ __shfl((unsigned long long) var, srcLane, width) :
322
+ __shfl((unsigned int) var, srcLane, width);
323
+ }
324
+
325
+ __SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) {
326
+ return (sizeof(long) == sizeof(long long)) ?
327
+ __shfl_up((long long) var, delta, width) :
328
+ __shfl_up((int) var, delta, width);
329
+ }
330
+
331
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) {
332
+ return (sizeof(long) == sizeof(long long)) ?
333
+ __shfl_up((unsigned long long) var, delta, width) :
334
+ __shfl_up((unsigned int) var, delta, width);
335
+ }
336
+
337
+ __SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) {
338
+ return (sizeof(long) == sizeof(long long)) ?
339
+ __shfl_down((long long) var, delta, width) :
340
+ __shfl_down((int) var, delta, width);
341
+ }
342
+
343
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) {
344
+ return (sizeof(long) == sizeof(long long)) ?
345
+ __shfl_down((unsigned long long) var, delta, width) :
346
+ __shfl_down((unsigned int) var, delta, width);
347
+ }
348
+
349
+ __SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) {
350
+ return (sizeof(long) == sizeof(long long)) ?
351
+ __shfl_xor((long long) var, laneMask, width) :
352
+ __shfl_xor((int) var, laneMask, width);
353
+ }
354
+
355
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) {
356
+ return (sizeof(long) == sizeof(long long)) ?
357
+ __shfl_xor((unsigned long long) var, laneMask, width) :
358
+ __shfl_xor((unsigned int) var, laneMask, width);
359
+ }
360
+
361
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
362
+
363
+ // Warp register exchange (shuffle) intrinsics.
364
+ // Notes:
365
+ // a) Warp size is hardcoded to 32 here, because the compiler does not know
366
+ // the "warpSize" constant at this time
367
+ // b) we cannot map the float __shfl to the int __shfl because it'll mess with
368
+ // the register number (especially if you're doing two shfls to move a double).
369
+ __SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) {
370
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
371
+ int ret;
372
+ int c = ((warpSize-width) << 8) | 0x1f;
373
+ ret = __nvvm_shfl_idx_sync(mask, var, srcLane, c);
374
+ return ret;
375
+ }
376
+
377
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) {
378
+ return (unsigned int) __shfl_sync(mask, (int)var, srcLane, width);
379
+ }
380
+
381
+ __SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) {
382
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
383
+ int ret;
384
+ int c = (warpSize-width) << 8;
385
+ ret = __nvvm_shfl_up_sync(mask, var, delta, c);
386
+ return ret;
387
+ }
388
+
389
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
390
+ return (unsigned int) __shfl_up_sync(mask, (int)var, delta, width);
391
+ }
392
+
393
+ __SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) {
394
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
395
+ int ret;
396
+ int c = ((warpSize-width) << 8) | 0x1f;
397
+ ret = __nvvm_shfl_down_sync(mask, var, delta, c);
398
+ return ret;
399
+ }
400
+
401
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
402
+ return (unsigned int) __shfl_down_sync(mask, (int)var, delta, width);
403
+ }
404
+
405
+ __SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
406
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
407
+ int ret;
408
+ int c = ((warpSize-width) << 8) | 0x1f;
409
+ ret = __nvvm_shfl_bfly_sync(mask, var, laneMask, c);
410
+ return ret;
411
+ }
412
+
413
+ __SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) {
414
+ return (unsigned int) __shfl_xor_sync(mask, (int)var, laneMask, width);
415
+ }
416
+
417
+ __SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) {
418
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
419
+ int ret;
420
+ int c;
421
+ c = ((warpSize-width) << 8) | 0x1f;
422
+ ret = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, c);
423
+ return __int_as_float(ret);
424
+ }
425
+
426
+ __SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) {
427
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
428
+ int ret;
429
+ int c;
430
+ c = (warpSize-width) << 8;
431
+ ret = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, c);
432
+ return __int_as_float(ret);
433
+ }
434
+
435
+ __SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) {
436
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
437
+ int ret;
438
+ int c;
439
+ c = ((warpSize-width) << 8) | 0x1f;
440
+ ret = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, c);
441
+ return __int_as_float(ret);
442
+ }
443
+
444
+ __SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
445
+ extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
446
+ int ret;
447
+ int c;
448
+ c = ((warpSize-width) << 8) | 0x1f;
449
+ ret = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, c);
450
+ return __int_as_float(ret);
451
+ }
452
+
453
+ // 64-bits SHFL
454
+ __SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) {
455
+ int lo, hi;
456
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
457
+ hi = __shfl_sync(mask, hi, srcLane, width);
458
+ lo = __shfl_sync(mask, lo, srcLane, width);
459
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
460
+ return var;
461
+ }
462
+
463
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) {
464
+ return (unsigned long long) __shfl_sync(mask, (long long) var, srcLane, width);
465
+ }
466
+
467
+ __SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) {
468
+ int lo, hi;
469
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
470
+ hi = __shfl_up_sync(mask, hi, delta, width);
471
+ lo = __shfl_up_sync(mask, lo, delta, width);
472
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
473
+ return var;
474
+ }
475
+
476
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
477
+ return (unsigned long long) __shfl_up_sync(mask, (long long) var, delta, width);
478
+ }
479
+
480
+ __SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) {
481
+ int lo, hi;
482
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
483
+ hi = __shfl_down_sync(mask, hi, delta, width);
484
+ lo = __shfl_down_sync(mask, lo, delta, width);
485
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
486
+ return var;
487
+ }
488
+
489
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
490
+ return (unsigned long long) __shfl_down_sync(mask, (long long) var, delta, width);
491
+ }
492
+
493
+ __SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) {
494
+ int lo, hi;
495
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
496
+ hi = __shfl_xor_sync(mask, hi, laneMask, width);
497
+ lo = __shfl_xor_sync(mask, lo, laneMask, width);
498
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
499
+ return var;
500
+ }
501
+
502
+ __SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) {
503
+ return (unsigned long long) __shfl_xor_sync(mask, (long long) var, laneMask, width);
504
+ }
505
+
506
+ __SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) {
507
+ unsigned lo, hi;
508
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
509
+ hi = __shfl_sync(mask, hi, srcLane, width);
510
+ lo = __shfl_sync(mask, lo, srcLane, width);
511
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
512
+ return var;
513
+ }
514
+
515
+ __SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) {
516
+ unsigned lo, hi;
517
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
518
+ hi = __shfl_up_sync(mask, hi, delta, width);
519
+ lo = __shfl_up_sync(mask, lo, delta, width);
520
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
521
+ return var;
522
+ }
523
+
524
+ __SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) {
525
+ unsigned lo, hi;
526
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
527
+ hi = __shfl_down_sync(mask, hi, delta, width);
528
+ lo = __shfl_down_sync(mask, lo, delta, width);
529
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
530
+ return var;
531
+ }
532
+
533
+ __SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) {
534
+ unsigned lo, hi;
535
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
536
+ hi = __shfl_xor_sync(mask, hi, laneMask, width);
537
+ lo = __shfl_xor_sync(mask, lo, laneMask, width);
538
+ asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
539
+ return var;
540
+ }
541
+
542
+ // long needs some help to choose between 32-bits and 64-bits
543
+
544
+ __SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) {
545
+ return (sizeof(long) == sizeof(long long)) ?
546
+ __shfl_sync(mask, (long long) var, srcLane, width) :
547
+ __shfl_sync(mask, (int) var, srcLane, width);
548
+ }
549
+
550
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) {
551
+ return (sizeof(long) == sizeof(long long)) ?
552
+ __shfl_sync(mask, (unsigned long long) var, srcLane, width) :
553
+ __shfl_sync(mask, (unsigned int) var, srcLane, width);
554
+ }
555
+
556
+ __SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) {
557
+ return (sizeof(long) == sizeof(long long)) ?
558
+ __shfl_up_sync(mask, (long long) var, delta, width) :
559
+ __shfl_up_sync(mask, (int) var, delta, width);
560
+ }
561
+
562
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
563
+ return (sizeof(long) == sizeof(long long)) ?
564
+ __shfl_up_sync(mask, (unsigned long long) var, delta, width) :
565
+ __shfl_up_sync(mask, (unsigned int) var, delta, width);
566
+ }
567
+
568
+ __SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) {
569
+ return (sizeof(long) == sizeof(long long)) ?
570
+ __shfl_down_sync(mask, (long long) var, delta, width) :
571
+ __shfl_down_sync(mask, (int) var, delta, width);
572
+ }
573
+
574
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
575
+ return (sizeof(long) == sizeof(long long)) ?
576
+ __shfl_down_sync(mask, (unsigned long long) var, delta, width) :
577
+ __shfl_down_sync(mask, (unsigned int) var, delta, width);
578
+ }
579
+
580
+ __SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) {
581
+ return (sizeof(long) == sizeof(long long)) ?
582
+ __shfl_xor_sync(mask, (long long) var, laneMask, width) :
583
+ __shfl_xor_sync(mask, (int) var, laneMask, width);
584
+ }
585
+
586
+ __SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) {
587
+ return (sizeof(long) == sizeof(long long)) ?
588
+ __shfl_xor_sync(mask, (unsigned long long) var, laneMask, width) :
589
+ __shfl_xor_sync(mask, (unsigned int) var, laneMask, width);
590
+ }
591
+
592
+ #if defined(__local_warpSize)
593
+ #undef warpSize
594
+ #undef __local_warpSize
595
+ #endif
596
+
597
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
598
+
599
+ #endif /* __cplusplus && __CUDACC__ */
600
+
601
+ #undef __SM_30_INTRINSICS_DECL__
602
+
603
+ #endif /* !__SM_30_INTRINSICS_HPP__ */
604
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_atomic_functions.h ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.35.235 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_35_ATOMIC_FUNCTIONS_H__)
51
+ #define __SM_35_ATOMIC_FUNCTIONS_H__
52
+
53
+ /*******************************************************************************
54
+ * All sm_35 atomics are supported by sm_32 so simply include its header file *
55
+ *******************************************************************************/
56
+ #include "sm_32_atomic_functions.h"
57
+
58
+ #endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_61_intrinsics.hpp ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_61_INTRINSICS_HPP__)
51
+ #define __SM_61_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_61_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * Below are implementations of SM-6.1 intrinsics which are included as *
74
+ * source (instead of being built in to the compiler) *
75
+ * *
76
+ *******************************************************************************/
77
+
78
+ // 4a
79
+ __SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) {
80
+ int ret;
81
+ asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
82
+ return ret;
83
+ }
84
+
85
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) {
86
+ unsigned int ret;
87
+ asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
88
+ return ret;
89
+ }
90
+
91
+ __SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) {
92
+ int ret;
93
+ asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
94
+ return ret;
95
+ }
96
+
97
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) {
98
+ unsigned int ret;
99
+ asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
100
+ return ret;
101
+ }
102
+
103
+ // 2a.lo
104
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) {
105
+ int ret;
106
+ asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
107
+ return ret;
108
+ }
109
+
110
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) {
111
+ unsigned int ret;
112
+ asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
113
+ return ret;
114
+ }
115
+
116
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) {
117
+ int ret;
118
+ asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
119
+ return ret;
120
+ }
121
+
122
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) {
123
+ unsigned int ret;
124
+ asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
125
+ return ret;
126
+ }
127
+
128
+ // 2a.hi
129
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) {
130
+ int ret;
131
+ asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
132
+ return ret;
133
+ }
134
+
135
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) {
136
+ unsigned int ret;
137
+ asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
138
+ return ret;
139
+ }
140
+
141
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) {
142
+ int ret;
143
+ asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
144
+ return ret;
145
+ }
146
+
147
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) {
148
+ unsigned int ret;
149
+ asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
150
+ return ret;
151
+ }
152
+
153
+
154
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
155
+
156
+ #endif /* __cplusplus && __CUDACC__ */
157
+
158
+ #undef __SM_61_INTRINSICS_DECL__
159
+
160
+ #endif /* !__SM_61_INTRINSICS_HPP__ */
161
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_indirect_functions.h ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #ifndef __SURFACE_INDIRECT_FUNCTIONS_H__
52
+ #define __SURFACE_INDIRECT_FUNCTIONS_H__
53
+
54
+
55
+ #if defined(__cplusplus) && defined(__CUDACC__)
56
+
57
+ #include "cuda_runtime_api.h"
58
+
59
+ template<typename T> struct __nv_isurf_trait { };
60
+ template<> struct __nv_isurf_trait<char> { typedef void type; };
61
+ template<> struct __nv_isurf_trait<signed char> { typedef void type; };
62
+ template<> struct __nv_isurf_trait<char1> { typedef void type; };
63
+ template<> struct __nv_isurf_trait<unsigned char> { typedef void type; };
64
+ template<> struct __nv_isurf_trait<uchar1> { typedef void type; };
65
+ template<> struct __nv_isurf_trait<short> { typedef void type; };
66
+ template<> struct __nv_isurf_trait<short1> { typedef void type; };
67
+ template<> struct __nv_isurf_trait<unsigned short> { typedef void type; };
68
+ template<> struct __nv_isurf_trait<ushort1> { typedef void type; };
69
+ template<> struct __nv_isurf_trait<int> { typedef void type; };
70
+ template<> struct __nv_isurf_trait<int1> { typedef void type; };
71
+ template<> struct __nv_isurf_trait<unsigned int> { typedef void type; };
72
+ template<> struct __nv_isurf_trait<uint1> { typedef void type; };
73
+ template<> struct __nv_isurf_trait<long long> { typedef void type; };
74
+ template<> struct __nv_isurf_trait<longlong1> { typedef void type; };
75
+ template<> struct __nv_isurf_trait<unsigned long long> { typedef void type; };
76
+ template<> struct __nv_isurf_trait<ulonglong1> { typedef void type; };
77
+ template<> struct __nv_isurf_trait<float> { typedef void type; };
78
+ template<> struct __nv_isurf_trait<float1> { typedef void type; };
79
+
80
+ template<> struct __nv_isurf_trait<char2> { typedef void type; };
81
+ template<> struct __nv_isurf_trait<uchar2> { typedef void type; };
82
+ template<> struct __nv_isurf_trait<short2> { typedef void type; };
83
+ template<> struct __nv_isurf_trait<ushort2> { typedef void type; };
84
+ template<> struct __nv_isurf_trait<int2> { typedef void type; };
85
+ template<> struct __nv_isurf_trait<uint2> { typedef void type; };
86
+ template<> struct __nv_isurf_trait<longlong2> { typedef void type; };
87
+ template<> struct __nv_isurf_trait<ulonglong2> { typedef void type; };
88
+ template<> struct __nv_isurf_trait<float2> { typedef void type; };
89
+
90
+ template<> struct __nv_isurf_trait<char4> { typedef void type; };
91
+ template<> struct __nv_isurf_trait<uchar4> { typedef void type; };
92
+ template<> struct __nv_isurf_trait<short4> { typedef void type; };
93
+ template<> struct __nv_isurf_trait<ushort4> { typedef void type; };
94
+ template<> struct __nv_isurf_trait<int4> { typedef void type; };
95
+ template<> struct __nv_isurf_trait<uint4> { typedef void type; };
96
+ template<> struct __nv_isurf_trait<float4> { typedef void type; };
97
+
98
+
99
+ template <typename T>
100
+ static __device__ typename __nv_isurf_trait<T>::type surf1Dread(T *ptr, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
101
+ {
102
+ #ifdef __CUDA_ARCH__
103
+ __nv_tex_surf_handler("__isurf1Dread", ptr, obj, x, mode);
104
+ #endif /* __CUDA_ARCH__ */
105
+ }
106
+
107
+ template <class T>
108
+ static __device__ T surf1Dread(cudaSurfaceObject_t surfObject, int x, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
109
+ {
110
+ #ifdef __CUDA_ARCH__
111
+ T ret;
112
+ surf1Dread(&ret, surfObject, x, boundaryMode);
113
+ return ret;
114
+ #endif /* __CUDA_ARCH__ */
115
+ }
116
+
117
+ template <typename T>
118
+ static __device__ typename __nv_isurf_trait<T>::type surf2Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
119
+ {
120
+ #ifdef __CUDA_ARCH__
121
+ __nv_tex_surf_handler("__isurf2Dread", ptr, obj, x, y, mode);
122
+ #endif /* __CUDA_ARCH__ */
123
+ }
124
+
125
+ template <class T>
126
+ static __device__ T surf2Dread(cudaSurfaceObject_t surfObject, int x, int y, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
127
+ {
128
+ #ifdef __CUDA_ARCH__
129
+ T ret;
130
+ surf2Dread(&ret, surfObject, x, y, boundaryMode);
131
+ return ret;
132
+ #endif /* __CUDA_ARCH__ */
133
+ }
134
+
135
+
136
+ template <typename T>
137
+ static __device__ typename __nv_isurf_trait<T>::type surf3Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
138
+ {
139
+ #ifdef __CUDA_ARCH__
140
+ __nv_tex_surf_handler("__isurf3Dread", ptr, obj, x, y, z, mode);
141
+ #endif /* __CUDA_ARCH__ */
142
+ }
143
+
144
+ template <class T>
145
+ static __device__ T surf3Dread(cudaSurfaceObject_t surfObject, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
146
+ {
147
+ #ifdef __CUDA_ARCH__
148
+ T ret;
149
+ surf3Dread(&ret, surfObject, x, y, z, boundaryMode);
150
+ return ret;
151
+ #endif /* __CUDA_ARCH__ */
152
+ }
153
+
154
+ template <typename T>
155
+ static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
156
+ {
157
+ #ifdef __CUDA_ARCH__
158
+ __nv_tex_surf_handler("__isurf1DLayeredread", ptr, obj, x, layer, mode);
159
+ #endif /* __CUDA_ARCH__ */
160
+ }
161
+
162
+ template <class T>
163
+ static __device__ T surf1DLayeredread(cudaSurfaceObject_t surfObject, int x, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
164
+ {
165
+ #ifdef __CUDA_ARCH__
166
+ T ret;
167
+ surf1DLayeredread(&ret, surfObject, x, layer, boundaryMode);
168
+ return ret;
169
+ #endif /* __CUDA_ARCH__ */
170
+ }
171
+
172
+ template <typename T>
173
+ static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
174
+ {
175
+ #ifdef __CUDA_ARCH__
176
+ __nv_tex_surf_handler("__isurf2DLayeredread", ptr, obj, x, y, layer, mode);
177
+ #endif /* __CUDA_ARCH__ */
178
+ }
179
+
180
+ template <class T>
181
+ static __device__ T surf2DLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
182
+ {
183
+ #ifdef __CUDA_ARCH__
184
+ T ret;
185
+ surf2DLayeredread(&ret, surfObject, x, y, layer, boundaryMode);
186
+ return ret;
187
+ #endif /* __CUDA_ARCH__ */
188
+ }
189
+
190
+ template <typename T>
191
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
192
+ {
193
+ #ifdef __CUDA_ARCH__
194
+ __nv_tex_surf_handler("__isurfCubemapread", ptr, obj, x, y, face, mode);
195
+ #endif /* __CUDA_ARCH__ */
196
+ }
197
+
198
+ template <class T>
199
+ static __device__ T surfCubemapread(cudaSurfaceObject_t surfObject, int x, int y, int face, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
200
+ {
201
+ #ifdef __CUDA_ARCH__
202
+ T ret;
203
+ surfCubemapread(&ret, surfObject, x, y, face, boundaryMode);
204
+ return ret;
205
+ #endif /* __CUDA_ARCH__ */
206
+ }
207
+
208
+ template <typename T>
209
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
210
+ {
211
+ #ifdef __CUDA_ARCH__
212
+ __nv_tex_surf_handler("__isurfCubemapLayeredread", ptr, obj, x, y, layerface, mode);
213
+ #endif /* __CUDA_ARCH__ */
214
+ }
215
+
216
+ template <class T>
217
+ static __device__ T surfCubemapLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layerface, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
218
+ {
219
+ #ifdef __CUDA_ARCH__
220
+ T ret;
221
+ surfCubemapLayeredread(&ret, surfObject, x, y, layerface, boundaryMode);
222
+ return ret;
223
+ #endif /* __CUDA_ARCH__ */
224
+ }
225
+
226
+ template <typename T>
227
+ static __device__ typename __nv_isurf_trait<T>::type surf1Dwrite(T val, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
228
+ {
229
+ #ifdef __CUDA_ARCH__
230
+ __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, obj, x, mode);
231
+ #endif /* __CUDA_ARCH__ */
232
+ }
233
+
234
+ template <typename T>
235
+ static __device__ typename __nv_isurf_trait<T>::type surf2Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
236
+ {
237
+ #ifdef __CUDA_ARCH__
238
+ __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, obj, x, y, mode);
239
+ #endif /* __CUDA_ARCH__ */
240
+ }
241
+
242
+ template <typename T>
243
+ static __device__ typename __nv_isurf_trait<T>::type surf3Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
244
+ {
245
+ #ifdef __CUDA_ARCH__
246
+ __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, obj, x, y, z, mode);
247
+ #endif /* __CUDA_ARCH__ */
248
+ }
249
+
250
+ template <typename T>
251
+ static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
252
+ {
253
+ #ifdef __CUDA_ARCH__
254
+ __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, obj, x, layer, mode);
255
+ #endif /* __CUDA_ARCH__ */
256
+ }
257
+
258
+ template <typename T>
259
+ static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
260
+ {
261
+ #ifdef __CUDA_ARCH__
262
+ __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, obj, x, y, layer, mode);
263
+ #endif /* __CUDA_ARCH__ */
264
+ }
265
+
266
+ template <typename T>
267
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapwrite(T val, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
268
+ {
269
+ #ifdef __CUDA_ARCH__
270
+ __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, obj, x, y, face, mode);
271
+ #endif /* __CUDA_ARCH__ */
272
+ }
273
+
274
+ template <typename T>
275
+ static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
276
+ {
277
+ #ifdef __CUDA_ARCH__
278
+ __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, obj, x, y, layerface, mode);
279
+ #endif /* __CUDA_ARCH__ */
280
+ }
281
+
282
+ #endif // __cplusplus && __CUDACC__
283
+
284
+ #endif // __SURFACE_INDIRECT_FUNCTIONS_H__
285
+
286
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_types.h ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SURFACE_TYPES_H__)
51
+ #define __SURFACE_TYPES_H__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "driver_types.h"
60
+
61
+ /**
62
+ * \addtogroup CUDART_TYPES
63
+ *
64
+ * @{
65
+ */
66
+
67
+ /*******************************************************************************
68
+ * *
69
+ * *
70
+ * *
71
+ *******************************************************************************/
72
+
73
+ #define cudaSurfaceType1D 0x01
74
+ #define cudaSurfaceType2D 0x02
75
+ #define cudaSurfaceType3D 0x03
76
+ #define cudaSurfaceTypeCubemap 0x0C
77
+ #define cudaSurfaceType1DLayered 0xF1
78
+ #define cudaSurfaceType2DLayered 0xF2
79
+ #define cudaSurfaceTypeCubemapLayered 0xFC
80
+
81
+ /**
82
+ * CUDA Surface boundary modes
83
+ */
84
+ enum __device_builtin__ cudaSurfaceBoundaryMode
85
+ {
86
+ cudaBoundaryModeZero = 0, /**< Zero boundary mode */
87
+ cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
88
+ cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
89
+ };
90
+
91
+ /**
92
+ * CUDA Surface format modes
93
+ */
94
+ enum __device_builtin__ cudaSurfaceFormatMode
95
+ {
96
+ cudaFormatModeForced = 0, /**< Forced format mode */
97
+ cudaFormatModeAuto = 1 /**< Auto format mode */
98
+ };
99
+
100
+ /**
101
+ * CUDA Surface reference
102
+ */
103
+ struct __device_builtin__ surfaceReference
104
+ {
105
+ /**
106
+ * Channel descriptor for surface reference
107
+ */
108
+ struct cudaChannelFormatDesc channelDesc;
109
+ };
110
+
111
+ /**
112
+ * An opaque value that represents a CUDA Surface object
113
+ */
114
+ typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
115
+
116
+ /** @} */
117
+ /** @} */ /* END CUDART_TYPES */
118
+
119
+ #endif /* !__SURFACE_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_indirect_functions.h ADDED
@@ -0,0 +1,771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__
52
+ #define __TEXTURE_INDIRECT_FUNCTIONS_H__
53
+
54
+
55
+ #if defined(__cplusplus) && defined(__CUDACC__)
56
+
57
+ #include "cuda_runtime_api.h"
58
+
59
+
60
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
61
+ #define __NV_TEX_SPARSE 1
62
+ #endif /* endif */
63
+
64
+ template <typename T> struct __nv_itex_trait { };
65
+ template<> struct __nv_itex_trait<char> { typedef void type; };
66
+ template<> struct __nv_itex_trait<signed char> { typedef void type; };
67
+ template<> struct __nv_itex_trait<char1> { typedef void type; };
68
+ template<> struct __nv_itex_trait<char2> { typedef void type; };
69
+ template<> struct __nv_itex_trait<char4> { typedef void type; };
70
+ template<> struct __nv_itex_trait<unsigned char> { typedef void type; };
71
+ template<> struct __nv_itex_trait<uchar1> { typedef void type; };
72
+ template<> struct __nv_itex_trait<uchar2> { typedef void type; };
73
+ template<> struct __nv_itex_trait<uchar4> { typedef void type; };
74
+ template<> struct __nv_itex_trait<short> { typedef void type; };
75
+ template<> struct __nv_itex_trait<short1> { typedef void type; };
76
+ template<> struct __nv_itex_trait<short2> { typedef void type; };
77
+ template<> struct __nv_itex_trait<short4> { typedef void type; };
78
+ template<> struct __nv_itex_trait<unsigned short> { typedef void type; };
79
+ template<> struct __nv_itex_trait<ushort1> { typedef void type; };
80
+ template<> struct __nv_itex_trait<ushort2> { typedef void type; };
81
+ template<> struct __nv_itex_trait<ushort4> { typedef void type; };
82
+ template<> struct __nv_itex_trait<int> { typedef void type; };
83
+ template<> struct __nv_itex_trait<int1> { typedef void type; };
84
+ template<> struct __nv_itex_trait<int2> { typedef void type; };
85
+ template<> struct __nv_itex_trait<int4> { typedef void type; };
86
+ template<> struct __nv_itex_trait<unsigned int> { typedef void type; };
87
+ template<> struct __nv_itex_trait<uint1> { typedef void type; };
88
+ template<> struct __nv_itex_trait<uint2> { typedef void type; };
89
+ template<> struct __nv_itex_trait<uint4> { typedef void type; };
90
+ #if !defined(__LP64__)
91
+ template<> struct __nv_itex_trait<long> { typedef void type; };
92
+ template<> struct __nv_itex_trait<long1> { typedef void type; };
93
+ template<> struct __nv_itex_trait<long2> { typedef void type; };
94
+ template<> struct __nv_itex_trait<long4> { typedef void type; };
95
+ template<> struct __nv_itex_trait<unsigned long> { typedef void type; };
96
+ template<> struct __nv_itex_trait<ulong1> { typedef void type; };
97
+ template<> struct __nv_itex_trait<ulong2> { typedef void type; };
98
+ template<> struct __nv_itex_trait<ulong4> { typedef void type; };
99
+ #endif /* !__LP64__ */
100
+ template<> struct __nv_itex_trait<float> { typedef void type; };
101
+ template<> struct __nv_itex_trait<float1> { typedef void type; };
102
+ template<> struct __nv_itex_trait<float2> { typedef void type; };
103
+ template<> struct __nv_itex_trait<float4> { typedef void type; };
104
+
105
+
106
+
107
+ template <typename T>
108
+ static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x)
109
+ {
110
+ #ifdef __CUDA_ARCH__
111
+ __nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x);
112
+ #endif
113
+ }
114
+
115
+ template <class T>
116
+ static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x)
117
+ {
118
+ #ifdef __CUDA_ARCH__
119
+ T ret;
120
+ tex1Dfetch(&ret, texObject, x);
121
+ return ret;
122
+ #endif
123
+ }
124
+
125
+ template <typename T>
126
+ static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x)
127
+ {
128
+ #ifdef __CUDA_ARCH__
129
+ __nv_tex_surf_handler("__itex1D", ptr, obj, x);
130
+ #endif
131
+ }
132
+
133
+
134
+ template <class T>
135
+ static __device__ T tex1D(cudaTextureObject_t texObject, float x)
136
+ {
137
+ #ifdef __CUDA_ARCH__
138
+ T ret;
139
+ tex1D(&ret, texObject, x);
140
+ return ret;
141
+ #endif
142
+ }
143
+
144
+
145
+ template <typename T>
146
+ static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y)
147
+ {
148
+ #ifdef __CUDA_ARCH__
149
+ __nv_tex_surf_handler("__itex2D", ptr, obj, x, y);
150
+ #endif
151
+ }
152
+
153
+ template <class T>
154
+ static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
155
+ {
156
+ #ifdef __CUDA_ARCH__
157
+ T ret;
158
+ tex2D(&ret, texObject, x, y);
159
+ return ret;
160
+ #endif
161
+ }
162
+
163
+ #if __NV_TEX_SPARSE
164
+ template <typename T>
165
+ static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y,
166
+ bool* isResident)
167
+ {
168
+ #ifdef __CUDA_ARCH__
169
+ unsigned char res;
170
+ __nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &res);
171
+ *isResident = (res != 0);
172
+ #endif
173
+ }
174
+
175
+ template <class T>
176
+ static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y, bool* isResident)
177
+ {
178
+ #ifdef __CUDA_ARCH__
179
+ T ret;
180
+ tex2D(&ret, texObject, x, y, isResident);
181
+ return ret;
182
+ #endif
183
+ }
184
+
185
+ #endif /* __NV_TEX_SPARSE */
186
+
187
+
188
+ template <typename T>
189
+ static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
190
+ {
191
+ #ifdef __CUDA_ARCH__
192
+ __nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z);
193
+ #endif
194
+ }
195
+
196
+ template <class T>
197
+ static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z)
198
+ {
199
+ #ifdef __CUDA_ARCH__
200
+ T ret;
201
+ tex3D(&ret, texObject, x, y, z);
202
+ return ret;
203
+ #endif
204
+ }
205
+
206
+ #if __NV_TEX_SPARSE
207
+ template <typename T>
208
+ static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z,
209
+ bool* isResident)
210
+ {
211
+ #ifdef __CUDA_ARCH__
212
+ unsigned char res;
213
+ __nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &res);
214
+ *isResident = (res != 0);
215
+ #endif
216
+ }
217
+
218
+ template <class T>
219
+ static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool* isResident)
220
+ {
221
+ #ifdef __CUDA_ARCH__
222
+ T ret;
223
+ tex3D(&ret, texObject, x, y, z, isResident);
224
+ return ret;
225
+ #endif
226
+ }
227
+ #endif /* __NV_TEX_SPARSE */
228
+
229
+
230
+ template <typename T>
231
+ static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer)
232
+ {
233
+ #ifdef __CUDA_ARCH__
234
+ __nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer);
235
+ #endif
236
+ }
237
+
238
+ template <class T>
239
+ static __device__ T tex1DLayered(cudaTextureObject_t texObject, float x, int layer)
240
+ {
241
+ #ifdef __CUDA_ARCH__
242
+ T ret;
243
+ tex1DLayered(&ret, texObject, x, layer);
244
+ return ret;
245
+ #endif
246
+ }
247
+
248
+ template <typename T>
249
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer)
250
+ {
251
+ #ifdef __CUDA_ARCH__
252
+ __nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer);
253
+ #endif
254
+ }
255
+
256
+ template <class T>
257
+ static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer)
258
+ {
259
+ #ifdef __CUDA_ARCH__
260
+ T ret;
261
+ tex2DLayered(&ret, texObject, x, y, layer);
262
+ return ret;
263
+ #endif
264
+ }
265
+
266
+ #if __NV_TEX_SPARSE
267
+ template <typename T>
268
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool* isResident)
269
+ {
270
+ #ifdef __CUDA_ARCH__
271
+ unsigned char res;
272
+ __nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &res);
273
+ *isResident = (res != 0);
274
+ #endif
275
+ }
276
+
277
+ template <class T>
278
+ static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool* isResident)
279
+ {
280
+ #ifdef __CUDA_ARCH__
281
+ T ret;
282
+ tex2DLayered(&ret, texObject, x, y, layer, isResident);
283
+ return ret;
284
+ #endif
285
+ }
286
+ #endif /* __NV_TEX_SPARSE */
287
+
288
+
289
+ template <typename T>
290
+ static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
291
+ {
292
+ #ifdef __CUDA_ARCH__
293
+ __nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z);
294
+ #endif
295
+ }
296
+
297
+
298
+ template <class T>
299
+ static __device__ T texCubemap(cudaTextureObject_t texObject, float x, float y, float z)
300
+ {
301
+ #ifdef __CUDA_ARCH__
302
+ T ret;
303
+ texCubemap(&ret, texObject, x, y, z);
304
+ return ret;
305
+ #endif
306
+ }
307
+
308
+
309
+ template <typename T>
310
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer)
311
+ {
312
+ #ifdef __CUDA_ARCH__
313
+ __nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer);
314
+ #endif
315
+ }
316
+
317
+ template <class T>
318
+ static __device__ T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer)
319
+ {
320
+ #ifdef __CUDA_ARCH__
321
+ T ret;
322
+ texCubemapLayered(&ret, texObject, x, y, z, layer);
323
+ return ret;
324
+ #endif
325
+ }
326
+
327
+ template <typename T>
328
+ static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0)
329
+ {
330
+ #ifdef __CUDA_ARCH__
331
+ __nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp);
332
+ #endif
333
+ }
334
+
335
+ template <class T>
336
+ static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0)
337
+ {
338
+ #ifdef __CUDA_ARCH__
339
+ T ret;
340
+ tex2Dgather(&ret, to, x, y, comp);
341
+ return ret;
342
+ #endif
343
+ }
344
+
345
+ #if __NV_TEX_SPARSE
346
+ template <typename T>
347
+ static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool* isResident, int comp = 0)
348
+ {
349
+ #ifdef __CUDA_ARCH__
350
+ unsigned char res;
351
+ __nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp, &res);
352
+ *isResident = (res != 0);
353
+ #endif
354
+ }
355
+
356
+ template <class T>
357
+ static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, bool* isResident, int comp = 0)
358
+ {
359
+ #ifdef __CUDA_ARCH__
360
+ T ret;
361
+ tex2Dgather(&ret, to, x, y, isResident, comp);
362
+ return ret;
363
+ #endif
364
+ }
365
+
366
+ #endif /* __NV_TEX_SPARSE */
367
+
368
+ template <typename T>
369
+ static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level)
370
+ {
371
+ #ifdef __CUDA_ARCH__
372
+ __nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level);
373
+ #endif
374
+ }
375
+
376
+ template <class T>
377
+ static __device__ T tex1DLod(cudaTextureObject_t texObject, float x, float level)
378
+ {
379
+ #ifdef __CUDA_ARCH__
380
+ T ret;
381
+ tex1DLod(&ret, texObject, x, level);
382
+ return ret;
383
+ #endif
384
+ }
385
+
386
+
387
+ template <typename T>
388
+ static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level)
389
+ {
390
+ #ifdef __CUDA_ARCH__
391
+ __nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level);
392
+ #endif
393
+ }
394
+
395
+ template <class T>
396
+ static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level)
397
+ {
398
+ #ifdef __CUDA_ARCH__
399
+ T ret;
400
+ tex2DLod(&ret, texObject, x, y, level);
401
+ return ret;
402
+ #endif
403
+ }
404
+
405
+ #if __NV_TEX_SPARSE
406
+
407
+ template <typename T>
408
+ static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool* isResident)
409
+ {
410
+ #ifdef __CUDA_ARCH__
411
+ unsigned char res;
412
+ __nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &res);
413
+ *isResident = (res != 0);
414
+ #endif
415
+ }
416
+
417
+ template <class T>
418
+ static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool* isResident)
419
+ {
420
+ #ifdef __CUDA_ARCH__
421
+ T ret;
422
+ tex2DLod(&ret, texObject, x, y, level, isResident);
423
+ return ret;
424
+ #endif
425
+ }
426
+
427
+ #endif /* __NV_TEX_SPARSE */
428
+
429
+
430
+ template <typename T>
431
+ static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
432
+ {
433
+ #ifdef __CUDA_ARCH__
434
+ __nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level);
435
+ #endif
436
+ }
437
+
438
+ template <class T>
439
+ static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
440
+ {
441
+ #ifdef __CUDA_ARCH__
442
+ T ret;
443
+ tex3DLod(&ret, texObject, x, y, z, level);
444
+ return ret;
445
+ #endif
446
+ }
447
+
448
+ #if __NV_TEX_SPARSE
449
+ template <typename T>
450
+ static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool* isResident)
451
+ {
452
+ #ifdef __CUDA_ARCH__
453
+ unsigned char res;
454
+ __nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &res);
455
+ *isResident = (res != 0);
456
+ #endif
457
+ }
458
+
459
+ template <class T>
460
+ static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool* isResident)
461
+ {
462
+ #ifdef __CUDA_ARCH__
463
+ T ret;
464
+ tex3DLod(&ret, texObject, x, y, z, level, isResident);
465
+ return ret;
466
+ #endif
467
+ }
468
+
469
+ #endif /* __NV_TEX_SPARSE */
470
+
471
+
472
+ template <typename T>
473
+ static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level)
474
+ {
475
+ #ifdef __CUDA_ARCH__
476
+ __nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level);
477
+ #endif
478
+ }
479
+
480
+ template <class T>
481
+ static __device__ T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level)
482
+ {
483
+ #ifdef __CUDA_ARCH__
484
+ T ret;
485
+ tex1DLayeredLod(&ret, texObject, x, layer, level);
486
+ return ret;
487
+ #endif
488
+ }
489
+
490
+
491
+ template <typename T>
492
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level)
493
+ {
494
+ #ifdef __CUDA_ARCH__
495
+ __nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level);
496
+ #endif
497
+ }
498
+
499
+ template <class T>
500
+ static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level)
501
+ {
502
+ #ifdef __CUDA_ARCH__
503
+ T ret;
504
+ tex2DLayeredLod(&ret, texObject, x, y, layer, level);
505
+ return ret;
506
+ #endif
507
+ }
508
+
509
+ #if __NV_TEX_SPARSE
510
+ template <typename T>
511
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool* isResident)
512
+ {
513
+ #ifdef __CUDA_ARCH__
514
+ unsigned char res;
515
+ __nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &res);
516
+ *isResident = (res != 0);
517
+ #endif
518
+ }
519
+
520
+ template <class T>
521
+ static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool* isResident)
522
+ {
523
+ #ifdef __CUDA_ARCH__
524
+ T ret;
525
+ tex2DLayeredLod(&ret, texObject, x, y, layer, level, isResident);
526
+ return ret;
527
+ #endif
528
+ }
529
+ #endif /* __NV_TEX_SPARSE */
530
+
531
+ template <typename T>
532
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
533
+ {
534
+ #ifdef __CUDA_ARCH__
535
+ __nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level);
536
+ #endif
537
+ }
538
+
539
+ template <class T>
540
+ static __device__ T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
541
+ {
542
+ #ifdef __CUDA_ARCH__
543
+ T ret;
544
+ texCubemapLod(&ret, texObject, x, y, z, level);
545
+ return ret;
546
+ #endif
547
+ }
548
+
549
+
550
+ template <typename T>
551
+ static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
552
+ {
553
+ #ifdef __CUDA_ARCH__
554
+ __nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
555
+ #endif
556
+ }
557
+
558
+ template <class T>
559
+ static __device__ T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
560
+ {
561
+ #ifdef __CUDA_ARCH__
562
+ T ret;
563
+ texCubemapGrad(&ret, texObject, x, y, z, dPdx, dPdy);
564
+ return ret;
565
+ #endif
566
+ }
567
+
568
+ template <typename T>
569
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level)
570
+ {
571
+ #ifdef __CUDA_ARCH__
572
+ __nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level);
573
+ #endif
574
+ }
575
+
576
+ template <class T>
577
+ static __device__ T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level)
578
+ {
579
+ #ifdef __CUDA_ARCH__
580
+ T ret;
581
+ texCubemapLayeredLod(&ret, texObject, x, y, z, layer, level);
582
+ return ret;
583
+ #endif
584
+ }
585
+
586
+ template <typename T>
587
+ static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy)
588
+ {
589
+ #ifdef __CUDA_ARCH__
590
+ __nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy);
591
+ #endif
592
+ }
593
+
594
+ template <class T>
595
+ static __device__ T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy)
596
+ {
597
+ #ifdef __CUDA_ARCH__
598
+ T ret;
599
+ tex1DGrad(&ret, texObject, x, dPdx, dPdy);
600
+ return ret;
601
+ #endif
602
+ }
603
+
604
+
605
+ template <typename T>
606
+ static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy)
607
+ {
608
+ #ifdef __CUDA_ARCH__
609
+ __nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy);
610
+ #endif
611
+
612
+ }
613
+
614
+ template <class T>
615
+ static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy)
616
+ {
617
+ #ifdef __CUDA_ARCH__
618
+ T ret;
619
+ tex2DGrad(&ret, texObject, x, y, dPdx, dPdy);
620
+ return ret;
621
+ #endif
622
+ }
623
+
624
+ #if __NV_TEX_SPARSE
625
+ template <typename T>
626
+ static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
627
+ {
628
+ #ifdef __CUDA_ARCH__
629
+ unsigned char res;
630
+ __nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &res);
631
+ *isResident = (res != 0);
632
+ #endif
633
+
634
+ }
635
+
636
+ template <class T>
637
+ static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
638
+ {
639
+ #ifdef __CUDA_ARCH__
640
+ T ret;
641
+ tex2DGrad(&ret, texObject, x, y, dPdx, dPdy, isResident);
642
+ return ret;
643
+ #endif
644
+ }
645
+ #endif /* __NV_TEX_SPARSE */
646
+
647
+
648
+ template <typename T>
649
+ static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
650
+ {
651
+ #ifdef __CUDA_ARCH__
652
+ __nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
653
+ #endif
654
+ }
655
+
656
+ template <class T>
657
+ static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
658
+ {
659
+ #ifdef __CUDA_ARCH__
660
+ T ret;
661
+ tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy);
662
+ return ret;
663
+ #endif
664
+ }
665
+
666
+ #if __NV_TEX_SPARSE
667
+ template <typename T>
668
+ static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
669
+ {
670
+ #ifdef __CUDA_ARCH__
671
+ unsigned char res;
672
+ __nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &res);
673
+ *isResident = (res != 0);
674
+ #endif
675
+ }
676
+
677
+ template <class T>
678
+ static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
679
+ {
680
+ #ifdef __CUDA_ARCH__
681
+ T ret;
682
+ tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy, isResident);
683
+ return ret;
684
+ #endif
685
+ }
686
+
687
+ #endif /* __NV_TEX_SPARSE */
688
+
689
+
690
+ template <typename T>
691
+ static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy)
692
+ {
693
+ #ifdef __CUDA_ARCH__
694
+ __nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy);
695
+ #endif
696
+ }
697
+
698
+ template <class T>
699
+ static __device__ T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy)
700
+ {
701
+ #ifdef __CUDA_ARCH__
702
+ T ret;
703
+ tex1DLayeredGrad(&ret, texObject, x, layer, dPdx, dPdy);
704
+ return ret;
705
+ #endif
706
+ }
707
+
708
+
709
+ template <typename T>
710
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy)
711
+ {
712
+ #ifdef __CUDA_ARCH__
713
+ __nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy);
714
+ #endif
715
+ }
716
+
717
+ template <class T>
718
+ static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
719
+ {
720
+ #ifdef __CUDA_ARCH__
721
+ T ret;
722
+ tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy);
723
+ return ret;
724
+ #endif
725
+ }
726
+
727
+ #if __NV_TEX_SPARSE
728
+ template <typename T>
729
+ static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
730
+ {
731
+ #ifdef __CUDA_ARCH__
732
+ unsigned char res;
733
+ __nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &res);
734
+ *isResident = (res != 0);
735
+ #endif
736
+ }
737
+
738
+ template <class T>
739
+ static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
740
+ {
741
+ #ifdef __CUDA_ARCH__
742
+ T ret;
743
+ tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy, isResident);
744
+ return ret;
745
+ #endif
746
+ }
747
+ #endif /* __NV_TEX_SPARSE */
748
+
749
+
750
+ template <typename T>
751
+ static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
752
+ {
753
+ #ifdef __CUDA_ARCH__
754
+ __nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy);
755
+ #endif
756
+ }
757
+
758
+ template <class T>
759
+ static __device__ T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
760
+ {
761
+ #ifdef __CUDA_ARCH__
762
+ T ret;
763
+ texCubemapLayeredGrad(&ret, texObject, x, y, z, layer, dPdx, dPdy);
764
+ return ret;
765
+ #endif
766
+ }
767
+
768
+ #undef __NV_TEX_SPARSE
769
+
770
+ #endif // __cplusplus && __CUDACC__
771
+ #endif // __TEXTURE_INDIRECT_FUNCTIONS_H__