diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28ccc9c1b49076e097a138d931dbf9c1795d2853 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/caching.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8d78f5a46e6059d88fd22bdd7e832bbe47501e6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/fuse.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e61c04624f81b70756bdac78cb9808b5c4209317 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/dask.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..789026e06d3edf46565babd40e101ae334796579 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/webhdfs.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..7a52e194b265d32f61d47bd3081f4958755bff46 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h @@ -0,0 +1,116 @@ +// This file is generated. Any changes you make will be lost during the next clean build. + +// Dependent includes +#ifdef __APPLE__ +#include +#else +#include +#endif + +// CUDA public interface, for type definitions and cu* function prototypes +#include "cudaGL.h" + + +// ************************************************************************* +// Definitions of structs to hold parameters for each function +// ************************************************************************* + +typedef struct cuGraphicsGLRegisterBuffer_params_st { + CUgraphicsResource *pCudaResource; + GLuint buffer; + unsigned int Flags; +} cuGraphicsGLRegisterBuffer_params; + +typedef struct cuGraphicsGLRegisterImage_params_st { + CUgraphicsResource *pCudaResource; + GLuint image; + GLenum target; + unsigned int Flags; +} cuGraphicsGLRegisterImage_params; + +typedef struct cuGLGetDevices_v2_params_st { + unsigned int *pCudaDeviceCount; + CUdevice *pCudaDevices; + unsigned int cudaDeviceCount; + CUGLDeviceList deviceList; +} cuGLGetDevices_v2_params; + +typedef struct cuGLCtxCreate_v2_params_st { + CUcontext *pCtx; + unsigned int Flags; + CUdevice device; +} cuGLCtxCreate_v2_params; + +typedef struct cuGLRegisterBufferObject_params_st { + GLuint buffer; +} cuGLRegisterBufferObject_params; + +typedef struct cuGLMapBufferObject_v2_ptds_params_st { + CUdeviceptr *dptr; + size_t *size; + GLuint buffer; +} cuGLMapBufferObject_v2_ptds_params; + +typedef struct cuGLUnmapBufferObject_params_st { + GLuint buffer; +} cuGLUnmapBufferObject_params; + +typedef struct cuGLUnregisterBufferObject_params_st { + GLuint buffer; +} cuGLUnregisterBufferObject_params; + +typedef struct cuGLSetBufferObjectMapFlags_params_st { + GLuint buffer; + unsigned int Flags; +} cuGLSetBufferObjectMapFlags_params; + +typedef struct cuGLMapBufferObjectAsync_v2_ptsz_params_st { + CUdeviceptr *dptr; + size_t *size; + GLuint buffer; + CUstream hStream; +} cuGLMapBufferObjectAsync_v2_ptsz_params; + +typedef struct cuGLUnmapBufferObjectAsync_params_st { + GLuint buffer; + CUstream hStream; +} cuGLUnmapBufferObjectAsync_params; + +typedef struct cuGLGetDevices_params_st { + unsigned int *pCudaDeviceCount; + CUdevice *pCudaDevices; + unsigned int cudaDeviceCount; + CUGLDeviceList deviceList; +} cuGLGetDevices_params; + +typedef struct cuGLMapBufferObject_v2_params_st { + CUdeviceptr *dptr; + size_t *size; + GLuint buffer; +} cuGLMapBufferObject_v2_params; + +typedef struct cuGLMapBufferObjectAsync_v2_params_st { + CUdeviceptr *dptr; + size_t *size; + GLuint buffer; + CUstream hStream; +} cuGLMapBufferObjectAsync_v2_params; + +typedef struct cuGLCtxCreate_params_st { + CUcontext *pCtx; + unsigned int Flags; + CUdevice device; +} cuGLCtxCreate_params; + +typedef struct cuGLMapBufferObject_params_st { + CUdeviceptr_v1 *dptr; + unsigned int *size; + GLuint buffer; +} cuGLMapBufferObject_params; + +typedef struct cuGLMapBufferObjectAsync_params_st { + CUdeviceptr_v1 *dptr; + unsigned int *size; + GLuint buffer; + CUstream hStream; +} cuGLMapBufferObjectAsync_params; diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..abc603c8d9be21e012a9b1641330c2e203d623b2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h @@ -0,0 +1,46 @@ +// This file is generated. Any changes you make will be lost during the next clean build. + +// Dependent includes +#include + +// CUDA public interface, for type definitions and cu* function prototypes +#include "cudaVDPAU.h" + + +// ************************************************************************* +// Definitions of structs to hold parameters for each function +// ************************************************************************* + +typedef struct cuVDPAUGetDevice_params_st { + CUdevice *pDevice; + VdpDevice vdpDevice; + VdpGetProcAddress *vdpGetProcAddress; +} cuVDPAUGetDevice_params; + +typedef struct cuVDPAUCtxCreate_v2_params_st { + CUcontext *pCtx; + unsigned int flags; + CUdevice device; + VdpDevice vdpDevice; + VdpGetProcAddress *vdpGetProcAddress; +} cuVDPAUCtxCreate_v2_params; + +typedef struct cuGraphicsVDPAURegisterVideoSurface_params_st { + CUgraphicsResource *pCudaResource; + VdpVideoSurface vdpSurface; + unsigned int flags; +} cuGraphicsVDPAURegisterVideoSurface_params; + +typedef struct cuGraphicsVDPAURegisterOutputSurface_params_st { + CUgraphicsResource *pCudaResource; + VdpOutputSurface vdpSurface; + unsigned int flags; +} cuGraphicsVDPAURegisterOutputSurface_params; + +typedef struct cuVDPAUCtxCreate_params_st { + CUcontext *pCtx; + unsigned int flags; + CUdevice device; + VdpDevice vdpDevice; + VdpGetProcAddress *vdpGetProcAddress; +} cuVDPAUCtxCreate_params; diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..350d3ac02c78de2d516eca5a5e886c2cc013c8f8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h @@ -0,0 +1,55 @@ +// This file is generated. Any changes you make will be lost during the next clean build. + +// CUDA public interface, for type definitions and api function prototypes +#include "cudart_removed.h" + +// ************************************************************************* +// Definitions of structs to hold parameters for each function +// ************************************************************************* + +// Currently used parameter trace structures +typedef struct cudaStreamDestroy_v3020_params_st { + cudaStream_t stream; +} cudaStreamDestroy_v3020_params; + +typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params_st { + int *numBlocks; + const void *func; + size_t numDynamicSmemBytes; +} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params; + +typedef struct cudaConfigureCall_v3020_params_st { + dim3 gridDim; + dim3 blockDim; + size_t sharedMem __dv; + cudaStream_t stream __dv; +} cudaConfigureCall_v3020_params; + +typedef struct cudaSetupArgument_v3020_params_st { + const void *arg; + size_t size; + size_t offset; +} cudaSetupArgument_v3020_params; + +typedef struct cudaLaunch_v3020_params_st { + const void *func; +} cudaLaunch_v3020_params; + +typedef struct cudaLaunch_ptsz_v7000_params_st { + const void *func; +} cudaLaunch_ptsz_v7000_params; + +typedef struct cudaStreamSetFlags_v10200_params_st { + cudaStream_t hStream; + unsigned int flags; +} cudaStreamSetFlags_v10200_params; + +typedef struct cudaStreamSetFlags_ptsz_v10200_params_st { + cudaStream_t hStream; + unsigned int flags; +} cudaStreamSetFlags_ptsz_v10200_params; + +// Parameter trace structures for removed functions + + +// End of parameter trace structures diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h new file mode 100644 index 0000000000000000000000000000000000000000..4145af5bf3a0604ef4fa46e9295c82110b7b7002 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h @@ -0,0 +1,570 @@ +#ifndef NVPERF_TARGET_H +#define NVPERF_TARGET_H + +/* + * Copyright 2014-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. + * + * This software and the information contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions + * of a form of NVIDIA software license agreement. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +#include +#include +#include "nvperf_common.h" + +#if defined(__GNUC__) && defined(NVPA_SHARED_LIB) + #pragma GCC visibility push(default) + #if !defined(NVPW_LOCAL) + #define NVPW_LOCAL __attribute__ ((visibility ("hidden"))) + #endif +#else + #if !defined(NVPW_LOCAL) + #define NVPW_LOCAL + #endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file nvperf_target.h + */ + +#ifndef NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED +#define NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED + /// GPU architecture support level + typedef enum NVPW_GpuArchitectureSupportLevel + { + NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNKNOWN = 0, + NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNSUPPORTED, + NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_SUPPORTED + } NVPW_GpuArchitectureSupportLevel; +#endif //NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED + +#ifndef NVPW_SLI_SUPPORT_LEVEL_DEFINED +#define NVPW_SLI_SUPPORT_LEVEL_DEFINED + /// SLI configuration support level + typedef enum NVPW_SliSupportLevel + { + NVPW_SLI_SUPPORT_LEVEL_UNKNOWN = 0, + NVPW_SLI_SUPPORT_LEVEL_UNSUPPORTED, + /// Only Non-SLI configurations are supported. + NVPW_SLI_SUPPORT_LEVEL_SUPPORTED_NON_SLI_CONFIGURATION + } NVPW_SliSupportLevel; +#endif //NVPW_SLI_SUPPORT_LEVEL_DEFINED + +#ifndef NVPW_VGPU_SUPPORT_LEVEL_DEFINED +#define NVPW_VGPU_SUPPORT_LEVEL_DEFINED + /// Virtualized GPU configuration support level + typedef enum NVPW_VGpuSupportLevel + { + NVPW_VGPU_SUPPORT_LEVEL_UNKNOWN = 0, + NVPW_VGPU_SUPPORT_LEVEL_UNSUPPORTED, + /// Supported but not allowed by system admin. + NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_DISALLOWED, + NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_ALLOWED, + NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_NON_VGPU_CONFIGURATION + } NVPW_VGpuSupportLevel; +#endif //NVPW_VGPU_SUPPORT_LEVEL_DEFINED + +#ifndef NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED +#define NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED + /// Confidential Compute mode support level + typedef enum NVPW_ConfidentialComputeSupportLevel + { + NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNKNOWN = 0, + NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNSUPPORTED, + NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_NON_CONF_COMPUTE_CONFIGURATION + } NVPW_ConfidentialComputeSupportLevel; +#endif //NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED + +#ifndef NVPW_CMP_SUPPORT_LEVEL_DEFINED +#define NVPW_CMP_SUPPORT_LEVEL_DEFINED + /// CMP support level + typedef enum NVPW_CmpSupportLevel + { + NVPW_CMP_SUPPORT_LEVEL_UNKNOWN = 0, + NVPW_CMP_SUPPORT_LEVEL_UNSUPPORTED, + NVPW_CMP_SUPPORT_LEVEL_SUPPORTED_NON_CMP_CONFIGURATON + } NVPW_CmpSupportLevel; +#endif //NVPW_CMP_SUPPORT_LEVEL_DEFINED + +#ifndef NVPW_WSL_SUPPORT_LEVEL_DEFINED +#define NVPW_WSL_SUPPORT_LEVEL_DEFINED + /// WSL support level + typedef enum NVPW_WslSupportLevel + { + NVPW_WSL_SUPPORT_LEVEL_UNKNOWN = 0, + NVPW_WSL_SUPPORT_LEVEL_UNSUPPORTED_INSUFFICIENT_DRIVER_VERSION, + NVPW_WSL_SUPPORT_LEVEL_SUPPORTED, + NVPW_WSL_SUPPORT_LEVEL_SUPPORTED_NON_WSL_CONFIGURATION + } NVPW_WslSupportLevel; +#endif //NVPW_WSL_SUPPORT_LEVEL_DEFINED + + typedef struct NVPW_InitializeTarget_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + } NVPW_InitializeTarget_Params; +#define NVPW_InitializeTarget_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeTarget_Params, pPriv) + + /// Load the target library. + NVPA_Status NVPW_InitializeTarget(NVPW_InitializeTarget_Params* pParams); + + typedef struct NVPW_GetDeviceCount_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + size_t numDevices; + } NVPW_GetDeviceCount_Params; +#define NVPW_GetDeviceCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetDeviceCount_Params, numDevices) + + NVPA_Status NVPW_GetDeviceCount(NVPW_GetDeviceCount_Params* pParams); + + typedef struct NVPW_Device_GetNames_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + size_t deviceIndex; + const char* pDeviceName; + const char* pChipName; + } NVPW_Device_GetNames_Params; +#define NVPW_Device_GetNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetNames_Params, pChipName) + + NVPA_Status NVPW_Device_GetNames(NVPW_Device_GetNames_Params* pParams); + + typedef struct NVPW_PciBusId + { + /// The PCI domain on which the device bus resides. + uint32_t domain; + /// The bus on which the device resides. + uint16_t bus; + /// device ID. + uint16_t device; + } NVPW_PciBusId; +#define NVPW_PciBusId_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PciBusId, device) + + typedef struct NVPW_Device_GetPciBusIds_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] caller-allocated array of NVPW_PciBusId, indexed by NVPW deviceIndex + NVPW_PciBusId* pBusIds; + /// [in] size of the pBusIDs array; use result from NVPW_GetDeviceCount + size_t numDevices; + } NVPW_Device_GetPciBusIds_Params; +#define NVPW_Device_GetPciBusIds_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetPciBusIds_Params, numDevices) + + NVPA_Status NVPW_Device_GetPciBusIds(NVPW_Device_GetPciBusIds_Params* pParams); + + +#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_INVALID 0xFFFFFFFFu +#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_FULLCHIP 0xFFFFFFFEu + + + typedef struct NVPW_Device_GetMigAttributes_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + size_t deviceIndex; + /// [out] + NVPA_Bool isMigPartition; + /// [out] + uint32_t gpuInstanceId; + /// [out] + uint32_t computeInstanceId; + } NVPW_Device_GetMigAttributes_Params; +#define NVPW_Device_GetMigAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetMigAttributes_Params, computeInstanceId) + + NVPA_Status NVPW_Device_GetMigAttributes(NVPW_Device_GetMigAttributes_Params* pParams); + + typedef struct NVPW_Adapter_GetDeviceIndex_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + struct IDXGIAdapter* pAdapter; + /// [in] + size_t sliIndex; + /// [out] + size_t deviceIndex; + } NVPW_Adapter_GetDeviceIndex_Params; +#define NVPW_Adapter_GetDeviceIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Adapter_GetDeviceIndex_Params, deviceIndex) + + NVPA_Status NVPW_Adapter_GetDeviceIndex(NVPW_Adapter_GetDeviceIndex_Params* pParams); + + typedef struct NVPW_CounterData_GetNumRanges_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + const uint8_t* pCounterDataImage; + size_t numRanges; + } NVPW_CounterData_GetNumRanges_Params; +#define NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetNumRanges_Params, numRanges) + + NVPA_Status NVPW_CounterData_GetNumRanges(NVPW_CounterData_GetNumRanges_Params* pParams); + + typedef struct NVPW_CounterData_GetChipName_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + const uint8_t* pCounterDataImage; + /// [in] + size_t counterDataImageSize; + /// [out] + const char* pChipName; + } NVPW_CounterData_GetChipName_Params; +#define NVPW_CounterData_GetChipName_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetChipName_Params, pChipName) + + NVPA_Status NVPW_CounterData_GetChipName(NVPW_CounterData_GetChipName_Params* pParams); + + typedef struct NVPW_Config_GetNumPasses_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + const uint8_t* pConfig; + /// [out] + size_t numPipelinedPasses; + /// [out] + size_t numIsolatedPasses; + } NVPW_Config_GetNumPasses_Params; +#define NVPW_Config_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_Params, numIsolatedPasses) + + /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels + NVPA_Status NVPW_Config_GetNumPasses(NVPW_Config_GetNumPasses_Params* pParams); + + typedef struct NVPW_Config_GetNumPasses_V2_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + const uint8_t* pConfig; + /// [out] + size_t numPasses; + } NVPW_Config_GetNumPasses_V2_Params; +#define NVPW_Config_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_V2_Params, numPasses) + + /// Total num passes = numPasses * numNestingLevels + NVPA_Status NVPW_Config_GetNumPasses_V2(NVPW_Config_GetNumPasses_V2_Params* pParams); + +#define NVPW_API_SET_CUDA_PROFILER 0x18209d0775b2f89dULL + +#define NVPW_API_SET_D3D11_PROFILER 0xca55c6738445db2bULL + +#define NVPW_API_SET_D3D12_PROFILER 0xc0c2d46dd7c7ad78ULL + +#define NVPW_API_SET_EGL_PROFILER 0x3c3747dae1f9565cULL + +#define NVPW_API_SET_GPU_PERIODICSAMPLER 0x9f4c2571fc0b2e8aULL + +#define NVPW_API_SET_METRICSCONTEXT 0x7c8579f6f2144beaULL + +#define NVPW_API_SET_METRICSEVALUATOR 0x0368a8768d811af9ULL + +#define NVPW_API_SET_METRICS_GA100_COMP 0x16b7d8c20d8b4915ULL + +#define NVPW_API_SET_METRICS_GA100_GRFX 0xc94eaabec04a94faULL + +#define NVPW_API_SET_METRICS_GA10X_COMP 0xb5d6391c2e299ab5ULL + +#define NVPW_API_SET_METRICS_GA10X_GRFX 0x6ebc121178b5ce0bULL + +#define NVPW_API_SET_METRICS_GV100_COMP 0x863705cc57919f72ULL + +#define NVPW_API_SET_METRICS_GV100_GRFX 0x9900da75d164fecfULL + +#define NVPW_API_SET_METRICS_GV11B_COMP 0xd3f79a859235848fULL + +#define NVPW_API_SET_METRICS_GV11B_GRFX 0xeb8e26220106e227ULL + +#define NVPW_API_SET_METRICS_TU10X_COMP 0x70f40be0afd35da8ULL + +#define NVPW_API_SET_METRICS_TU10X_GRFX 0xdf219cb838db6968ULL + +#define NVPW_API_SET_METRICS_TU11X_COMP 0xeb0069d7d0956678ULL + +#define NVPW_API_SET_METRICS_TU11X_GRFX 0x0977d9342bd62743ULL + +#define NVPW_API_SET_OPENGL_PROFILER 0xe4cd9ea40f2ee777ULL + +#define NVPW_API_SET_VULKAN_PROFILER 0x8c56b6a03d779689ULL + + typedef struct NVPW_QueryVersionNumber_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + uint64_t apiSet; + /// [out] + uint32_t major; + /// [out] + uint32_t minor; + /// [out] + uint32_t patch; + /// [out] + uint32_t relMajor; + /// [out] + uint32_t relMinor; + /// [out] + uint32_t relPatch; + } NVPW_QueryVersionNumber_Params; +#define NVPW_QueryVersionNumber_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_QueryVersionNumber_Params, relPatch) + + /// Query version number of an API set + NVPA_Status NVPW_QueryVersionNumber(NVPW_QueryVersionNumber_Params* pParams); + + typedef enum NVPW_Device_ClockStatus + { + /// clock status is unknown + NVPW_DEVICE_CLOCK_STATUS_UNKNOWN, + /// clocks are locked to rated tdp values + NVPW_DEVICE_CLOCK_STATUS_LOCKED_TO_RATED_TDP, + /// clocks are not locked and can boost above rated tdp + NVPW_DEVICE_CLOCK_STATUS_BOOST_ENABLED, + /// clocks are not locked and will not go above rated tdp + NVPW_DEVICE_CLOCK_STATUS_BOOST_DISABLED, + NVPW_DEVICE_CLOCK_STATUS__COUNT + } NVPW_Device_ClockStatus; + + typedef struct NVPW_Device_GetClockStatus_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + size_t deviceIndex; + /// [in] + NVPW_Device_ClockStatus clockStatus; + } NVPW_Device_GetClockStatus_Params; +#define NVPW_Device_GetClockStatus_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetClockStatus_Params, clockStatus) + + NVPA_Status NVPW_Device_GetClockStatus(NVPW_Device_GetClockStatus_Params* pParams); + + typedef enum NVPW_Device_ClockSetting + { + /// invalid op, specify valid clocks operation during profiling + NVPW_DEVICE_CLOCK_SETTING_INVALID, + /// default to driver/application config (normally unlocked and not boosted, but could be unlocked boosted, or + /// locked to rated TDP) + NVPW_DEVICE_CLOCK_SETTING_DEFAULT, + /// lock clocks at rated tdp base values + NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_RATED_TDP, + NVPW_DEVICE_CLOCK_SETTING__COUNT + } NVPW_Device_ClockSetting; + + typedef struct NVPW_Device_SetClockSetting_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + size_t deviceIndex; + /// [in] + NVPW_Device_ClockSetting clockSetting; + } NVPW_Device_SetClockSetting_Params; +#define NVPW_Device_SetClockSetting_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_SetClockSetting_Params, clockSetting) + + NVPA_Status NVPW_Device_SetClockSetting(NVPW_Device_SetClockSetting_Params* pParams); + + typedef struct NVPW_CounterData_GetRangeDescriptions_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + const uint8_t* pCounterDataImage; + size_t rangeIndex; + /// [inout] Number of descriptions allocated in ppDescriptions + size_t numDescriptions; + const char** ppDescriptions; + } NVPW_CounterData_GetRangeDescriptions_Params; +#define NVPW_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetRangeDescriptions_Params, ppDescriptions) + + NVPA_Status NVPW_CounterData_GetRangeDescriptions(NVPW_CounterData_GetRangeDescriptions_Params* pParams); + + typedef struct NVPW_Profiler_CounterData_GetRangeDescriptions_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + const uint8_t* pCounterDataImage; + size_t rangeIndex; + /// [inout] Number of descriptions allocated in ppDescriptions + size_t numDescriptions; + const char** ppDescriptions; + } NVPW_Profiler_CounterData_GetRangeDescriptions_Params; +#define NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Profiler_CounterData_GetRangeDescriptions_Params, ppDescriptions) + + NVPA_Status NVPW_Profiler_CounterData_GetRangeDescriptions(NVPW_Profiler_CounterData_GetRangeDescriptions_Params* pParams); + +#ifndef NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED +#define NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED + typedef enum NVPW_PeriodicSampler_CounterData_AppendMode + { + NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_LINEAR = 0, + NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_CIRCULAR = 1, + NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE__COUNT + } NVPW_PeriodicSampler_CounterData_AppendMode; +#endif //NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED + + typedef struct NVPW_PeriodicSampler_CounterData_GetSampleTime_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + const uint8_t* pCounterDataImage; + /// [in] + size_t rangeIndex; + /// [out] + uint64_t timestampStart; + /// [out] + uint64_t timestampEnd; + } NVPW_PeriodicSampler_CounterData_GetSampleTime_Params; +#define NVPW_PeriodicSampler_CounterData_GetSampleTime_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params, timestampEnd) + + NVPA_Status NVPW_PeriodicSampler_CounterData_GetSampleTime(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params* pParams); + + typedef struct NVPW_PeriodicSampler_CounterData_TrimInPlace_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + uint8_t* pCounterDataImage; + /// [in] + size_t counterDataImageSize; + /// [out] + size_t counterDataImageTrimmedSize; + } NVPW_PeriodicSampler_CounterData_TrimInPlace_Params; +#define NVPW_PeriodicSampler_CounterData_TrimInPlace_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params, counterDataImageTrimmedSize) + + NVPA_Status NVPW_PeriodicSampler_CounterData_TrimInPlace(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params* pParams); + + typedef struct NVPW_PeriodicSampler_CounterData_GetInfo_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + const uint8_t* pCounterDataImage; + /// [in] + size_t counterDataImageSize; + /// [out] total number of ranges in the counter data + size_t numTotalRanges; + /// [out] if in "linear" mode, this API returns the number of "populated" ranges; if it's in "circular" mode, + /// then it returns the last "populated" range index + 1, when there is no such range, it returns 0. + size_t numPopulatedRanges; + /// [out] if in "linear" mode, this API returns the number of "completed" ranges; if it's in "circular" mode, + /// then it returns the last "completed" range index + 1, when there is no such range, it returns 0. + size_t numCompletedRanges; + } NVPW_PeriodicSampler_CounterData_GetInfo_Params; +#define NVPW_PeriodicSampler_CounterData_GetInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetInfo_Params, numCompletedRanges) + + /// In periodic sampler, a range in counter data stores exactly one sample's data. For better performance, periodic + /// sampler may operate in an out-of-order fashion when populating sample data, i.e. it may not fully populate all + /// counters of a sample/range before starting to populate the next sample/range. As a result, we have two concepts + /// here, "populated" & "completed": a range is considered "populated" even if only partial counters have been + /// written; on the other hand, a range is only considered "completed" if all the collecting counters have been + /// written. + NVPA_Status NVPW_PeriodicSampler_CounterData_GetInfo(NVPW_PeriodicSampler_CounterData_GetInfo_Params* pParams); + + typedef struct NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params + { + /// [in] + size_t structSize; + /// [in] assign to NULL + void* pPriv; + /// [in] + const uint8_t* pCounterDataImage; + /// [in] + size_t counterDataImageSize; + /// [in] + size_t rangeIndex; + /// [out] + uint32_t triggerCount; + } NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params; +#define NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params, triggerCount) + + NVPA_Status NVPW_PeriodicSampler_CounterData_GetTriggerCount(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params* pParams); + + + typedef struct NVPW_TimestampReport + { + uint32_t payload; + uint8_t reserved0004[4]; + uint64_t timestamp; + } NVPW_TimestampReport; + + + + +#ifdef __cplusplus +} // extern "C" +#endif + +#if defined(__GNUC__) && defined(NVPA_SHARED_LIB) + #pragma GCC visibility pop +#endif + +#endif // NVPERF_TARGET_H diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..263e0ab4717ed411621f8cb3330aed385c672151 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl.hpp @@ -0,0 +1,12419 @@ +/******************************************************************************* + * Copyright (c) 2008-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/*! \file + * + * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and + * OpenCL 1.2 (rev 15) + * \author Benedict R. Gaster, Laurent Morichetti and Lee Howes + * + * Additions and fixes from: + * Brian Cole, March 3rd 2010 and April 2012 + * Matt Gruenke, April 2012. + * Bruce Merry, February 2013. + * + * \version 1.2.5 + * \date June 2013 + * + * Optional extension support + * + * cl + * cl_ext_device_fission + * #define USE_CL_DEVICE_FISSION + */ + +/*! \mainpage + * \section intro Introduction + * For many large applications C++ is the language of choice and so it seems + * reasonable to define C++ bindings for OpenCL. + * + * + * The interface is contained with a single C++ header file \em cl.hpp and all + * definitions are contained within the namespace \em cl. There is no additional + * requirement to include \em cl.h and to use either the C++ or original C + * bindings it is enough to simply include \em cl.hpp. + * + * The bindings themselves are lightweight and correspond closely to the + * underlying C API. Using the C++ bindings introduces no additional execution + * overhead. + * + * For detail documentation on the bindings see: + * + * The OpenCL C++ Wrapper API 1.2 (revision 09) + * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf + * + * \section example Example + * + * The following example shows a general use case for the C++ + * bindings, including support for the optional exception feature and + * also the supplied vector and string classes, see following sections for + * decriptions of these features. + * + * \code + * #define __CL_ENABLE_EXCEPTIONS + * + * #if defined(__APPLE__) || defined(__MACOSX) + * #include + * #else + * #include + * #endif + * #include + * #include + * #include + * + * const char * helloStr = "__kernel void " + * "hello(void) " + * "{ " + * " " + * "} "; + * + * int + * main(void) + * { + * cl_int err = CL_SUCCESS; + * try { + * + * std::vector platforms; + * cl::Platform::get(&platforms); + * if (platforms.size() == 0) { + * std::cout << "Platform size 0\n"; + * return -1; + * } + * + * cl_context_properties properties[] = + * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; + * cl::Context context(CL_DEVICE_TYPE_CPU, properties); + * + * std::vector devices = context.getInfo(); + * + * cl::Program::Sources source(1, + * std::make_pair(helloStr,strlen(helloStr))); + * cl::Program program_ = cl::Program(context, source); + * program_.build(devices); + * + * cl::Kernel kernel(program_, "hello", &err); + * + * cl::Event event; + * cl::CommandQueue queue(context, devices[0], 0, &err); + * queue.enqueueNDRangeKernel( + * kernel, + * cl::NullRange, + * cl::NDRange(4,4), + * cl::NullRange, + * NULL, + * &event); + * + * event.wait(); + * } + * catch (cl::Error err) { + * std::cerr + * << "ERROR: " + * << err.what() + * << "(" + * << err.err() + * << ")" + * << std::endl; + * } + * + * return EXIT_SUCCESS; + * } + * + * \endcode + * + */ +#ifndef CL_HPP_ +#define CL_HPP_ + +#ifdef _WIN32 + +#include +#include +#include +#include + +#if defined(__CL_ENABLE_EXCEPTIONS) +#include +#endif // #if defined(__CL_ENABLE_EXCEPTIONS) + +#pragma push_macro("max") +#undef max +#if defined(USE_DX_INTEROP) +#include +#include +#endif +#endif // _WIN32 + +// +#if defined(USE_CL_DEVICE_FISSION) +#include +#endif + +#if defined(__APPLE__) || defined(__MACOSX) +#include +#include +#include +#else +#include +#include +#endif // !__APPLE__ + +// To avoid accidentally taking ownership of core OpenCL types +// such as cl_kernel constructors are made explicit +// under OpenCL 1.2 +#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define __CL_EXPLICIT_CONSTRUCTORS explicit +#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define __CL_EXPLICIT_CONSTRUCTORS +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +// Define deprecated prefixes and suffixes to ensure compilation +// in case they are not pre-defined +#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED) + +#if !defined(CL_CALLBACK) +#define CL_CALLBACK +#endif //CL_CALLBACK + +#include +#include + +#if !defined(__NO_STD_VECTOR) +#include +#endif + +#if !defined(__NO_STD_STRING) +#include +#endif + +#if defined(linux) || defined(__APPLE__) || defined(__MACOSX) +#include + +#include +#include +#endif // linux + +#include + + +/*! \namespace cl + * + * \brief The OpenCL C++ bindings are defined within this namespace. + * + */ +namespace cl { + +class Memory; + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +#define __INIT_CL_EXT_FCN_PTR(name) \ + if(!pfn_##name) { \ + pfn_##name = (PFN_##name) \ + clGetExtensionFunctionAddress(#name); \ + if(!pfn_##name) { \ + } \ + } +#endif // #if defined(CL_VERSION_1_1) + +#if defined(CL_VERSION_1_2) +#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \ + if(!pfn_##name) { \ + pfn_##name = (PFN_##name) \ + clGetExtensionFunctionAddressForPlatform(platform, #name); \ + if(!pfn_##name) { \ + } \ + } +#endif // #if defined(CL_VERSION_1_1) + +class Program; +class Device; +class Context; +class CommandQueue; +class Memory; +class Buffer; + +#if defined(__CL_ENABLE_EXCEPTIONS) +/*! \brief Exception class + * + * This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined. + */ +class Error : public std::exception +{ +private: + cl_int err_; + const char * errStr_; +public: + /*! \brief Create a new CL error exception for a given error code + * and corresponding message. + * + * \param err error code value. + * + * \param errStr a descriptive string that must remain in scope until + * handling of the exception has concluded. If set, it + * will be returned by what(). + */ + Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr) + {} + + ~Error() throw() {} + + /*! \brief Get error string associated with exception + * + * \return A memory pointer to the error message string. + */ + virtual const char * what() const throw () + { + if (errStr_ == NULL) { + return "empty"; + } + else { + return errStr_; + } + } + + /*! \brief Get error code associated with exception + * + * \return The error code. + */ + cl_int err(void) const { return err_; } +}; + +#define __ERR_STR(x) #x +#else +#define __ERR_STR(x) NULL +#endif // __CL_ENABLE_EXCEPTIONS + + +namespace detail +{ +#if defined(__CL_ENABLE_EXCEPTIONS) +static inline cl_int errHandler ( + cl_int err, + const char * errStr = NULL) +{ + if (err != CL_SUCCESS) { + throw Error(err, errStr); + } + return err; +} +#else +static inline cl_int errHandler (cl_int err, const char * errStr = NULL) +{ + (void) errStr; // suppress unused variable warning + return err; +} +#endif // __CL_ENABLE_EXCEPTIONS +} + + + +//! \cond DOXYGEN_DETAIL +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo) +#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo) +#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs) +#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs) +#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo) +#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo) +#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo) +#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo) +#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo) +#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo) +#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo) +#if defined(CL_VERSION_1_2) +#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo) +#endif // #if defined(CL_VERSION_1_2) +#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo) +#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo) +#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo) +#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo) + +#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext) +#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType) +#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats) + +#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer) +#define __COPY_ERR __ERR_STR(cl::copy) +#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer) +#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo) +#if defined(CL_VERSION_1_2) +#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage) +#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture) +#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions) +#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_SAMPLER_PROPERTY_ERR __ERR_STR(clCreateSamplerWithProperties) +#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback) + +#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent) +#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus) +#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback) +#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents) + +#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel) +#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg) +#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource) +#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary) +#if defined(CL_VERSION_1_2) +#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels) +#endif // #if defined(CL_VERSION_1_2) +#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram) +#if defined(CL_VERSION_1_2) +#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram) + +#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram) + +#define __CREATE_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clCreateCommandQueueWithProperties) +#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty) +#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer) +#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect) +#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer) +#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect) +#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer) +#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect) +#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer) +#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage) +#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage) +#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage) +#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage) +#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer) +#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage) +#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer) +#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage) +#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject) +#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel) +#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel) +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects) +#endif // #if defined(CL_VERSION_1_2) + +#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects) +#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects) + + +#define __RETAIN_ERR __ERR_STR(Retain Object) +#define __RELEASE_ERR __ERR_STR(Release Object) +#define __FLUSH_ERR __ERR_STR(clFlush) +#define __FINISH_ERR __ERR_STR(clFinish) +#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error) + +/** + * CL 1.2 version that uses device fission. + */ +#if defined(CL_VERSION_1_2) +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices) +#else +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT) +#endif // #if defined(CL_VERSION_1_2) + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker) +#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents) +#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier) +#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler) +#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D) +#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D) +#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D) +#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D) +#endif // #if defined(CL_VERSION_1_1) + +#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) || !defined(CL_VERSION_2_0) +#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue) +#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler) +#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask) +#endif // #if defined(CL_VERSION_1_2) + +#endif // __CL_USER_OVERRIDE_ERROR_STRINGS +//! \endcond + +/** + * CL 1.2 marker and barrier commands + */ +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList) +#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList) +#endif // #if defined(CL_VERSION_1_2) + +#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) +typedef std::string STRING_CLASS; +#elif !defined(__USE_DEV_STRING) + +/*! \class string + * \brief Simple string class, that provides a limited subset of std::string + * functionality but avoids many of the issues that come with that class. + + * \note Deprecated. Please use std::string as default or + * re-define the string class to match the std::string + * interface by defining STRING_CLASS + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +{ +private: + ::size_t size_; + char * str_; +public: + //! \brief Constructs an empty string, allocating no memory. + string(void) : size_(0), str_(NULL) + { + } + + /*! \brief Constructs a string populated from an arbitrary value of + * specified size. + * + * An extra '\0' is added, in case none was contained in str. + * + * \param str the initial value of the string instance. Note that '\0' + * characters receive no special treatment. If NULL, + * the string is left empty, with a size of 0. + * + * \param size the number of characters to copy from str. + */ + string(const char * str, ::size_t size) : + size_(size), + str_(NULL) + { + if( size > 0 ) { + str_ = new char[size_+1]; + if (str_ != NULL) { + memcpy(str_, str, size_ * sizeof(char)); + str_[size_] = '\0'; + } + else { + size_ = 0; + } + } + } + + /*! \brief Constructs a string populated from a null-terminated value. + * + * \param str the null-terminated initial value of the string instance. + * If NULL, the string is left empty, with a size of 0. + */ + string(const char * str) : + size_(0), + str_(NULL) + { + if( str ) { + size_= ::strlen(str); + } + if( size_ > 0 ) { + str_ = new char[size_ + 1]; + if (str_ != NULL) { + memcpy(str_, str, (size_ + 1) * sizeof(char)); + } + } + } + + void resize( ::size_t n ) + { + if( size_ == n ) { + return; + } + if (n == 0) { + if( str_ ) { + delete [] str_; + } + str_ = NULL; + size_ = 0; + } + else { + char *newString = new char[n + 1]; + int copySize = n; + if( size_ < n ) { + copySize = size_; + } + size_ = n; + + if(str_) { + memcpy(newString, str_, (copySize + 1) * sizeof(char)); + } + if( copySize < size_ ) { + memset(newString + copySize, 0, size_ - copySize); + } + newString[size_] = '\0'; + + delete [] str_; + str_ = newString; + } + } + + const char& operator[] ( ::size_t pos ) const + { + return str_[pos]; + } + + char& operator[] ( ::size_t pos ) + { + return str_[pos]; + } + + /*! \brief Copies the value of another string to this one. + * + * \param rhs the string to copy. + * + * \returns a reference to the modified instance. + */ + string& operator=(const string& rhs) + { + if (this == &rhs) { + return *this; + } + + if( str_ != NULL ) { + delete [] str_; + str_ = NULL; + size_ = 0; + } + + if (rhs.size_ == 0 || rhs.str_ == NULL) { + str_ = NULL; + size_ = 0; + } + else { + str_ = new char[rhs.size_ + 1]; + size_ = rhs.size_; + + if (str_ != NULL) { + memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char)); + } + else { + size_ = 0; + } + } + + return *this; + } + + /*! \brief Constructs a string by copying the value of another instance. + * + * \param rhs the string to copy. + */ + string(const string& rhs) : + size_(0), + str_(NULL) + { + *this = rhs; + } + + //! \brief Destructor - frees memory used to hold the current value. + ~string() + { + delete[] str_; + str_ = NULL; + } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t size(void) const { return size_; } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t length(void) const { return size(); } + + /*! \brief Returns a pointer to the private copy held by this instance, + * or "" if empty/unset. + */ + const char * c_str(void) const { return (str_) ? str_ : "";} +}; +typedef cl::string STRING_CLASS; +#endif // #elif !defined(__USE_DEV_STRING) + +#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) +#define VECTOR_CLASS std::vector +#elif !defined(__USE_DEV_VECTOR) +#define VECTOR_CLASS cl::vector + +#if !defined(__MAX_DEFAULT_VECTOR_SIZE) +#define __MAX_DEFAULT_VECTOR_SIZE 10 +#endif + +/*! \class vector + * \brief Fixed sized vector implementation that mirroring + * + * \note Deprecated. Please use std::vector as default or + * re-define the vector class to match the std::vector + * interface by defining VECTOR_CLASS + + * \note Not recommended for use with custom objects as + * current implementation will construct N elements + * + * std::vector functionality. + * \brief Fixed sized vector compatible with std::vector. + * + * \note + * This differs from std::vector<> not just in memory allocation, + * but also in terms of when members are constructed, destroyed, + * and assigned instead of being copy constructed. + * + * \param T type of element contained in the vector. + * + * \param N maximum size of the vector. + */ +template +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +{ +private: + T data_[N]; + unsigned int size_; + +public: + //! \brief Constructs an empty vector with no memory allocated. + vector() : + size_(static_cast(0)) + {} + + //! \brief Deallocates the vector's memory and destroys all of its elements. + ~vector() + { + clear(); + } + + //! \brief Returns the number of elements currently contained. + unsigned int size(void) const + { + return size_; + } + + /*! \brief Empties the vector of all elements. + * \note + * This does not deallocate memory but will invoke destructors + * on contained elements. + */ + void clear() + { + while(!empty()) { + pop_back(); + } + } + + /*! \brief Appends an element after the last valid element. + * Calling this on a vector that has reached capacity will throw an + * exception if exceptions are enabled. + */ + void push_back (const T& x) + { + if (size() < N) { + new (&data_[size_]) T(x); + size_++; + } else { + detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); + } + } + + /*! \brief Removes the last valid element from the vector. + * Calling this on an empty vector will throw an exception + * if exceptions are enabled. + */ + void pop_back(void) + { + if (size_ != 0) { + --size_; + data_[size_].~T(); + } else { + detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); + } + } + + /*! \brief Constructs with a value copied from another. + * + * \param vec the vector to copy. + */ + vector(const vector& vec) : + size_(vec.size_) + { + if (size_ != 0) { + assign(vec.begin(), vec.end()); + } + } + + /*! \brief Constructs with a specified number of initial elements. + * + * \param size number of initial elements. + * + * \param val value of initial elements. + */ + vector(unsigned int size, const T& val = T()) : + size_(0) + { + for (unsigned int i = 0; i < size; i++) { + push_back(val); + } + } + + /*! \brief Overwrites the current content with that copied from another + * instance. + * + * \param rhs vector to copy. + * + * \returns a reference to this. + */ + vector& operator=(const vector& rhs) + { + if (this == &rhs) { + return *this; + } + + if (rhs.size_ != 0) { + assign(rhs.begin(), rhs.end()); + } else { + clear(); + } + + return *this; + } + + /*! \brief Tests equality against another instance. + * + * \param vec the vector against which to compare. + */ + bool operator==(vector &vec) + { + if (size() != vec.size()) { + return false; + } + + for( unsigned int i = 0; i < size(); ++i ) { + if( operator[](i) != vec[i] ) { + return false; + } + } + return true; + } + + //! \brief Conversion operator to T*. + operator T* () { return data_; } + + //! \brief Conversion operator to const T*. + operator const T* () const { return data_; } + + //! \brief Tests whether this instance has any elements. + bool empty (void) const + { + return size_==0; + } + + //! \brief Returns the maximum number of elements this instance can hold. + unsigned int max_size (void) const + { + return N; + } + + //! \brief Returns the maximum number of elements this instance can hold. + unsigned int capacity () const + { + return N; + } + + /*! \brief Returns a reference to a given element. + * + * \param index which element to access. * + * \note + * The caller is responsible for ensuring index is >= 0 and < size(). + */ + T& operator[](int index) + { + return data_[index]; + } + + /*! \brief Returns a const reference to a given element. + * + * \param index which element to access. + * + * \note + * The caller is responsible for ensuring index is >= 0 and < size(). + */ + const T& operator[](int index) const + { + return data_[index]; + } + + /*! \brief Assigns elements of the vector based on a source iterator range. + * + * \param start Beginning iterator of source range + * \param end Enditerator of source range + * + * \note + * Will throw an exception if exceptions are enabled and size exceeded. + */ + template + void assign(I start, I end) + { + clear(); + while(start != end) { + push_back(*start); + start++; + } + } + + /*! \class iterator + * \brief Const iterator class for vectors + */ + class iterator + { + private: + const vector *vec_; + int index_; + + /** + * Internal iterator constructor to capture reference + * to the vector it iterates over rather than taking + * the vector by copy. + */ + iterator (const vector &vec, int index) : + vec_(&vec) + { + if( !vec.empty() ) { + index_ = index; + } else { + index_ = -1; + } + } + + public: + iterator(void) : + index_(-1), + vec_(NULL) + { + } + + iterator(const iterator& rhs) : + vec_(rhs.vec_), + index_(rhs.index_) + { + } + + ~iterator(void) {} + + static iterator begin(const cl::vector &vec) + { + iterator i(vec, 0); + + return i; + } + + static iterator end(const cl::vector &vec) + { + iterator i(vec, vec.size()); + + return i; + } + + bool operator==(iterator i) + { + return ((vec_ == i.vec_) && + (index_ == i.index_)); + } + + bool operator!=(iterator i) + { + return (!(*this==i)); + } + + iterator& operator++() + { + ++index_; + return *this; + } + + iterator operator++(int) + { + iterator retVal(*this); + ++index_; + return retVal; + } + + iterator& operator--() + { + --index_; + return *this; + } + + iterator operator--(int) + { + iterator retVal(*this); + --index_; + return retVal; + } + + const T& operator *() const + { + return (*vec_)[index_]; + } + }; + + iterator begin(void) + { + return iterator::begin(*this); + } + + iterator begin(void) const + { + return iterator::begin(*this); + } + + iterator end(void) + { + return iterator::end(*this); + } + + iterator end(void) const + { + return iterator::end(*this); + } + + T& front(void) + { + return data_[0]; + } + + T& back(void) + { + return data_[size_]; + } + + const T& front(void) const + { + return data_[0]; + } + + const T& back(void) const + { + return data_[size_-1]; + } +}; +#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) + + + + + +namespace detail { +#define __DEFAULT_NOT_INITIALIZED 1 +#define __DEFAULT_BEING_INITIALIZED 2 +#define __DEFAULT_INITIALIZED 4 + + /* + * Compare and exchange primitives are needed for handling of defaults + */ + inline int compare_exchange(volatile int * dest, int exchange, int comparand) + { +#ifdef _WIN32 + return (int)(InterlockedCompareExchange( + (volatile long*)dest, + (long)exchange, + (long)comparand)); +#elif defined(__APPLE__) || defined(__MACOSX) + return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest); +#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX) + return (__sync_val_compare_and_swap( + dest, + comparand, + exchange)); +#endif // !_WIN32 + } + + inline void fence() { _mm_mfence(); } +}; // namespace detail + + +/*! \brief class used to interface between C++ and + * OpenCL C calls that require arrays of size_t values, whose + * size is known statically. + */ +template +class size_t +{ +private: + ::size_t data_[N]; + +public: + //! \brief Initialize size_t to all 0s + size_t() + { + for( int i = 0; i < N; ++i ) { + data_[i] = 0; + } + } + + ::size_t& operator[](int index) + { + return data_[index]; + } + + const ::size_t& operator[](int index) const + { + return data_[index]; + } + + //! \brief Conversion operator to T*. + operator ::size_t* () { return data_; } + + //! \brief Conversion operator to const T*. + operator const ::size_t* () const { return data_; } +}; + +namespace detail { + +// Generic getInfoHelper. The final parameter is used to guide overload +// resolution: the actual parameter passed is an int, which makes this +// a worse conversion sequence than a specialization that declares the +// parameter as an int. +template +inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long) +{ + return f(name, sizeof(T), param, NULL); +} + +// Specialized getInfoHelper for VECTOR_CLASS params +template +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + T* value = (T*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + param->assign(&value[0], &value[required/sizeof(T)]); + return CL_SUCCESS; +} + +/* Specialization for reference-counted types. This depends on the + * existence of Wrapper::cl_type, and none of the other types having the + * cl_type member. Note that simplify specifying the parameter as Wrapper + * does not work, because when using a derived type (e.g. Context) the generic + * template will provide a better match. + */ +template +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, typename T::cl_type = 0) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + typename T::cl_type * value = (typename T::cl_type *) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + ::size_t elements = required / sizeof(typename T::cl_type); + param->assign(&value[0], &value[elements]); + for (::size_t i = 0; i < elements; i++) + { + if (value[i] != NULL) + { + err = (*param)[i].retain(); + if (err != CL_SUCCESS) { + return err; + } + } + } + return CL_SUCCESS; +} + +// Specialized for getInfo +template +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int) +{ + cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL); + + if (err != CL_SUCCESS) { + return err; + } + + return CL_SUCCESS; +} + +// Specialized GetInfoHelper for STRING_CLASS params +template +inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + char* value = (char*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + *param = value; + return CL_SUCCESS; +} + +// Specialized GetInfoHelper for cl::size_t params +template +inline cl_int getInfoHelper(Func f, cl_uint name, size_t* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + ::size_t* value = (::size_t*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + for(int i = 0; i < N; ++i) { + (*param)[i] = value[i]; + } + + return CL_SUCCESS; +} + +template struct ReferenceHandler; + +/* Specialization for reference-counted types. This depends on the + * existence of Wrapper::cl_type, and none of the other types having the + * cl_type member. Note that simplify specifying the parameter as Wrapper + * does not work, because when using a derived type (e.g. Context) the generic + * template will provide a better match. + */ +template +inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0) +{ + typename T::cl_type value; + cl_int err = f(name, sizeof(value), &value, NULL); + if (err != CL_SUCCESS) { + return err; + } + *param = value; + if (value != NULL) + { + err = param->retain(); + if (err != CL_SUCCESS) { + return err; + } + } + return CL_SUCCESS; +} + +#define __PARAM_NAME_INFO_1_0(F) \ + F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \ + F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \ + F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \ + F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ + F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ + F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \ + F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \ + F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ + F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \ + F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \ + F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ + F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ + F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ + F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ + \ + F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ + F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ + F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ + F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ + \ + F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ + \ + F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ + F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ + F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ + F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ + F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ + \ + F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ + F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ + F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ + F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ + F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ + \ + F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ + F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ + F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ + F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ + F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ + \ + F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ + F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ + F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ + F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ + F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ + F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ + \ + F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ + F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ + F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ + F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ + \ + F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ + F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) + +#if defined(CL_VERSION_1_1) +#define __PARAM_NAME_INFO_1_1(F) \ + F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ + \ + F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ + F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ + \ + F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) +#endif // CL_VERSION_1_1 + + +#if defined(CL_VERSION_1_2) +#define __PARAM_NAME_INFO_1_2(F) \ + F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ + \ + F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ + F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ + \ + F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ + \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ + F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) +#endif // #if defined(CL_VERSION_1_2) + +#if defined(USE_CL_DEVICE_FISSION) +#define __PARAM_NAME_DEVICE_FISSION(F) \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ + F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) +#endif // USE_CL_DEVICE_FISSION + +template +struct param_traits {}; + +#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ +struct token; \ +template<> \ +struct param_traits \ +{ \ + enum { value = param_name }; \ + typedef T param_type; \ +}; + +__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) +#if defined(CL_VERSION_1_1) +__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 +#if defined(CL_VERSION_1_2) +__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 + +#if defined(USE_CL_DEVICE_FISSION) +__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); +#endif // USE_CL_DEVICE_FISSION + +#ifdef CL_PLATFORM_ICD_SUFFIX_KHR +__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) +#endif + +#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) +#endif + +#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) +#endif +#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) +#endif + +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) +#endif +#ifdef CL_DEVICE_WARP_SIZE_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) +#endif +#ifdef CL_DEVICE_GPU_OVERLAP_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) +#endif +#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) +#endif +#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) +#endif + +// Convenience functions + +template +inline cl_int +getInfo(Func f, cl_uint name, T* param) +{ + return getInfoHelper(f, name, param, 0); +} + +template +struct GetInfoFunctor0 +{ + Func f_; const Arg0& arg0_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { return f_(arg0_, param, size, value, size_ret); } +}; + +template +struct GetInfoFunctor1 +{ + Func f_; const Arg0& arg0_; const Arg1& arg1_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { return f_(arg0_, arg1_, param, size, value, size_ret); } +}; + +template +inline cl_int +getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) +{ + GetInfoFunctor0 f0 = { f, arg0 }; + return getInfoHelper(f0, name, param, 0); +} + +template +inline cl_int +getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) +{ + GetInfoFunctor1 f0 = { f, arg0, arg1 }; + return getInfoHelper(f0, name, param, 0); +} + +template +struct ReferenceHandler +{ }; + +#if defined(CL_VERSION_1_2) +/** + * OpenCL 1.2 devices do have retain/release. + */ +template <> +struct ReferenceHandler +{ + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int retain(cl_device_id device) + { return ::clRetainDevice(device); } + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int release(cl_device_id device) + { return ::clReleaseDevice(device); } +}; +#else // #if defined(CL_VERSION_1_2) +/** + * OpenCL 1.1 devices do not have retain/release. + */ +template <> +struct ReferenceHandler +{ + // cl_device_id does not have retain(). + static cl_int retain(cl_device_id) + { return CL_SUCCESS; } + // cl_device_id does not have release(). + static cl_int release(cl_device_id) + { return CL_SUCCESS; } +}; +#endif // #if defined(CL_VERSION_1_2) + +template <> +struct ReferenceHandler +{ + // cl_platform_id does not have retain(). + static cl_int retain(cl_platform_id) + { return CL_SUCCESS; } + // cl_platform_id does not have release(). + static cl_int release(cl_platform_id) + { return CL_SUCCESS; } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_context context) + { return ::clRetainContext(context); } + static cl_int release(cl_context context) + { return ::clReleaseContext(context); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_command_queue queue) + { return ::clRetainCommandQueue(queue); } + static cl_int release(cl_command_queue queue) + { return ::clReleaseCommandQueue(queue); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_mem memory) + { return ::clRetainMemObject(memory); } + static cl_int release(cl_mem memory) + { return ::clReleaseMemObject(memory); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_sampler sampler) + { return ::clRetainSampler(sampler); } + static cl_int release(cl_sampler sampler) + { return ::clReleaseSampler(sampler); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_program program) + { return ::clRetainProgram(program); } + static cl_int release(cl_program program) + { return ::clReleaseProgram(program); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_kernel kernel) + { return ::clRetainKernel(kernel); } + static cl_int release(cl_kernel kernel) + { return ::clReleaseKernel(kernel); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_event event) + { return ::clRetainEvent(event); } + static cl_int release(cl_event event) + { return ::clReleaseEvent(event); } +}; + + +// Extracts version number with major in the upper 16 bits, minor in the lower 16 +static cl_uint getVersion(const char *versionInfo) +{ + int highVersion = 0; + int lowVersion = 0; + int index = 7; + while(versionInfo[index] != '.' ) { + highVersion *= 10; + highVersion += versionInfo[index]-'0'; + ++index; + } + ++index; + while(versionInfo[index] != ' ' ) { + lowVersion *= 10; + lowVersion += versionInfo[index]-'0'; + ++index; + } + return (highVersion << 16) | lowVersion; +} + +static cl_uint getPlatformVersion(cl_platform_id platform) +{ + ::size_t size = 0; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); + char *versionInfo = (char *) alloca(size); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); + return getVersion(versionInfo); +} + +static cl_uint getDevicePlatformVersion(cl_device_id device) +{ + cl_platform_id platform; + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); + return getPlatformVersion(platform); +} + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +static cl_uint getContextPlatformVersion(cl_context context) +{ + // The platform cannot be queried directly, so we first have to grab a + // device and obtain its context + ::size_t size = 0; + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); + if (size == 0) + return 0; + cl_device_id *devices = (cl_device_id *) alloca(size); + clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); + return getDevicePlatformVersion(devices[0]); +} +#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +template +class Wrapper +{ +public: + typedef T cl_type; + +protected: + cl_type object_; + +public: + Wrapper() : object_(NULL) { } + + Wrapper(const cl_type &obj) : object_(obj) { } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + +protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + cl_int retain() const + { + return ReferenceHandler::retain(object_); + } + + cl_int release() const + { + return ReferenceHandler::release(object_); + } +}; + +template <> +class Wrapper +{ +public: + typedef cl_device_id cl_type; + +protected: + cl_type object_; + bool referenceCountable_; + + static bool isReferenceCountable(cl_device_id device) + { + bool retVal = false; + if (device != NULL) { + int version = getDevicePlatformVersion(device); + if(version > ((1 << 16) + 1)) { + retVal = true; + } + } + return retVal; + } + +public: + Wrapper() : object_(NULL), referenceCountable_(false) + { + } + + Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) + { + referenceCountable_ = isReferenceCountable(obj); + } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + referenceCountable_ = isReferenceCountable(object_); + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper& operator = (const Wrapper& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + referenceCountable_ = isReferenceCountable(object_); + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + +protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + template + friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS*, int, typename U::cl_type); + + cl_int retain() const + { + if( referenceCountable_ ) { + return ReferenceHandler::retain(object_); + } + else { + return CL_SUCCESS; + } + } + + cl_int release() const + { + if( referenceCountable_ ) { + return ReferenceHandler::release(object_); + } + else { + return CL_SUCCESS; + } + } +}; + +} // namespace detail +//! \endcond + +/*! \stuct ImageFormat + * \brief Adds constructors and member functions for cl_image_format. + * + * \see cl_image_format + */ +struct ImageFormat : public cl_image_format +{ + //! \brief Default constructor - performs no initialization. + ImageFormat(){} + + //! \brief Initializing constructor. + ImageFormat(cl_channel_order order, cl_channel_type type) + { + image_channel_order = order; + image_channel_data_type = type; + } + + //! \brief Assignment operator. + ImageFormat& operator = (const ImageFormat& rhs) + { + if (this != &rhs) { + this->image_channel_data_type = rhs.image_channel_data_type; + this->image_channel_order = rhs.image_channel_order; + } + return *this; + } +}; + +/*! \brief Class interface for cl_device_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_device_id + */ +class Device : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Device() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const Device& device) : detail::Wrapper(device) { } + + /*! \brief Constructor from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const cl_device_id &device) : detail::Wrapper(device) { } + + /*! \brief Returns the first device on the default context. + * + * \see Context::getDefault() + */ + static Device getDefault(cl_int * err = NULL); + + /*! \brief Assignment operator from Device. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const Device& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const cl_device_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetDeviceInfo(). + template + cl_int getInfo(cl_device_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetDeviceInfo, object_, name, param), + __GET_DEVICE_INFO_ERR); + } + + //! \brief Wrapper for clGetDeviceInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_device_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /** + * CL 1.2 version + */ +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clCreateSubDevicesEXT(). + cl_int createSubDevices( + const cl_device_partition_property * properties, + VECTOR_CLASS* devices) + { + cl_uint n = 0; + cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = clCreateSubDevices(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(CL_VERSION_1_2) + +/** + * CL 1.1 version that uses device fission. + */ +#if defined(CL_VERSION_1_1) +#if defined(USE_CL_DEVICE_FISSION) + cl_int createSubDevices( + const cl_device_partition_property_ext * properties, + VECTOR_CLASS* devices) + { + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( + cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; + __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); + + cl_uint n = 0; + cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(USE_CL_DEVICE_FISSION) +#endif // #if defined(CL_VERSION_1_1) +}; + +/*! \brief Class interface for cl_platform_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_platform_id + */ +class Platform : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Platform() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform(const Platform& platform) : detail::Wrapper(platform) { } + + /*! \brief Constructor from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform(const cl_platform_id &platform) : detail::Wrapper(platform) { } + + /*! \brief Assignment operator from Platform. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform& operator = (const Platform& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform& operator = (const cl_platform_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetPlatformInfo(). + cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPlatformInfo, object_, name, param), + __GET_PLATFORM_INFO_ERR); + } + + //! \brief Wrapper for clGetPlatformInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_platform_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of devices for this platform. + * + * Wraps clGetDeviceIDs(). + */ + cl_int getDevices( + cl_device_type type, + VECTOR_CLASS* devices) const + { + cl_uint n = 0; + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = ::clGetDeviceIDs(object_, type, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + +#if defined(USE_DX_INTEROP) + /*! \brief Get the list of available D3D10 devices. + * + * \param d3d_device_source. + * + * \param d3d_object. + * + * \param d3d_device_set. + * + * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device + * values returned in devices can be used to identify a specific OpenCL + * device. If \a devices argument is NULL, this argument is ignored. + * + * \return One of the following values: + * - CL_SUCCESS if the function is executed successfully. + * + * The application can query specific capabilities of the OpenCL device(s) + * returned by cl::getDevices. This can be used by the application to + * determine which device(s) to use. + * + * \note In the case that exceptions are enabled and a return value + * other than CL_SUCCESS is generated, then cl::Error exception is + * generated. + */ + cl_int getDevices( + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + VECTOR_CLASS* devices) const + { + typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint* num_devices); + + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); + + cl_uint n = 0; + cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + 0, + NULL, + &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + n, + ids, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif + + /*! \brief Gets a list of available platforms. + * + * Wraps clGetPlatformIDs(). + */ + static cl_int get( + VECTOR_CLASS* platforms) + { + cl_uint n = 0; + + if( platforms == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + platforms->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static cl_int get( + Platform * platform) + { + cl_uint n = 0; + + if( platform == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + *platform = ids[0]; + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform, returning it by value. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static Platform get( + cl_int * errResult = NULL) + { + Platform platform; + cl_uint n = 0; + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + if (errResult != NULL) { + *errResult = err; + } + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + if (errResult != NULL) { + *errResult = err; + } + + return ids[0]; + } + + static Platform getDefault( + cl_int *errResult = NULL ) + { + return get(errResult); + } + + +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clUnloadCompiler(). + cl_int + unloadCompiler() + { + return ::clUnloadPlatformCompiler(object_); + } +#endif // #if defined(CL_VERSION_1_2) +}; // class Platform + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +/** + * Unload the OpenCL compiler. + * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline cl_int +UnloadCompiler() +{ + return ::clUnloadCompiler(); +} +#endif // #if defined(CL_VERSION_1_1) + +/*! \brief Class interface for cl_context. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_context as the original. For details, see + * clRetainContext() and clReleaseContext(). + * + * \see cl_context + */ +class Context + : public detail::Wrapper +{ +private: + static volatile int default_initialized_; + static Context default_; + static volatile cl_int default_error_; +public: + /*! \brief Destructor. + * + * This calls clReleaseContext() on the value held by this instance. + */ + ~Context() { } + + /*! \brief Constructs a context including a list of specified devices. + * + * Wraps clCreateContext(). + */ + Context( + const VECTOR_CLASS& devices, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateContext( + properties, (cl_uint) numDevices, + deviceIDs, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + Context( + const Device& device, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + cl_device_id deviceID = device(); + + object_ = ::clCreateContext( + properties, 1, + &deviceID, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a context including all devices of a specified type. + * + * Wraps clCreateContextFromType(). + */ + Context( + cl_device_type type, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + +#if !defined(__APPLE__) || !defined(__MACOS) + cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; + if (properties == NULL) { + prop[1] = (cl_context_properties)Platform::get(&error)(); + if (error != CL_SUCCESS) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + return; + } + } + + properties = &prop[0]; + } +#endif + object_ = ::clCreateContextFromType( + properties, type, notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. + * + * \note All calls to this function return the same cl_context as the first. + */ + static Context getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while(default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + default_ = Context( + CL_DEVICE_TYPE_DEFAULT, + NULL, + NULL, + NULL, + &error); + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... + default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + //! \brief Default constructor - initializes to NULL. + Context() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainContext() on the parameter's cl_context. + */ + Context(const Context& context) : detail::Wrapper(context) { } + + /*! \brief Constructor from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_context + * into the new Context object. + */ + __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper(context) { } + + /*! \brief Assignment operator from Context. + * + * This calls clRetainContext() on the parameter and clReleaseContext() on + * the previous value held by this instance. + */ + Context& operator = (const Context& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseContext() on the value previously held by this instance. + */ + Context& operator = (const cl_context& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetContextInfo(). + template + cl_int getInfo(cl_context_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetContextInfo, object_, name, param), + __GET_CONTEXT_INFO_ERR); + } + + //! \brief Wrapper for clGetContextInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_context_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of supported image formats. + * + * Wraps clGetSupportedImageFormats(). + */ + cl_int getSupportedImageFormats( + cl_mem_flags flags, + cl_mem_object_type type, + VECTOR_CLASS* formats) const + { + cl_uint numEntries; + cl_int err = ::clGetSupportedImageFormats( + object_, + flags, + type, + 0, + NULL, + &numEntries); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + ImageFormat* value = (ImageFormat*) + alloca(numEntries * sizeof(ImageFormat)); + err = ::clGetSupportedImageFormats( + object_, + flags, + type, + numEntries, + (cl_image_format*) value, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + formats->assign(&value[0], &value[numEntries]); + return CL_SUCCESS; + } +}; + +inline Device Device::getDefault(cl_int * err) +{ + cl_int error; + Device device; + + Context context = Context::getDefault(&error); +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); +#else + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); +#endif + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + device = context.getInfo()[0]; + if (err != NULL) { + *err = CL_SUCCESS; + } + } + + return device; +} + + +#ifdef _WIN32 +__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__declspec(selectany) Context Context::default_; +__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS; +#else +__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__attribute__((weak)) Context Context::default_; +__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS; +#endif + +/*! \brief Class interface for cl_event. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_event as the original. For details, see + * clRetainEvent() and clReleaseEvent(). + * + * \see cl_event + */ +class Event : public detail::Wrapper +{ +public: + /*! \brief Destructor. + * + * This calls clReleaseEvent() on the value held by this instance. + */ + ~Event() { } + + //! \brief Default constructor - initializes to NULL. + Event() : detail::Wrapper() { } + + /*! \brief Copy constructor. + * + * This calls clRetainEvent() on the parameter's cl_event. + */ + Event(const Event& event) : detail::Wrapper(event) { } + + /*! \brief Constructor from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_event + * into the new Event object. + */ + Event(const cl_event& event) : detail::Wrapper(event) { } + + /*! \brief Assignment operator from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseEvent() on the value previously held by this instance. + */ + Event& operator = (const Event& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_event. + * + * This calls clRetainEvent() on the parameter and clReleaseEvent() on + * the previous value held by this instance. + */ + Event& operator = (const cl_event& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetEventInfo(). + template + cl_int getInfo(cl_event_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetEventInfo, object_, name, param), + __GET_EVENT_INFO_ERR); + } + + //! \brief Wrapper for clGetEventInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_event_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + //! \brief Wrapper for clGetEventProfilingInfo(). + template + cl_int getProfilingInfo(cl_profiling_info name, T* param) const + { + return detail::errHandler(detail::getInfo( + &::clGetEventProfilingInfo, object_, name, param), + __GET_EVENT_PROFILE_INFO_ERR); + } + + //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. + template typename + detail::param_traits::param_type + getProfilingInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_profiling_info, name>::param_type param; + cl_int result = getProfilingInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Blocks the calling thread until this event completes. + * + * Wraps clWaitForEvents(). + */ + cl_int wait() const + { + return detail::errHandler( + ::clWaitForEvents(1, &object_), + __WAIT_FOR_EVENTS_ERR); + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a user callback function for a specific command execution status. + * + * Wraps clSetEventCallback(). + */ + cl_int setCallback( + cl_int type, + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetEventCallback( + object_, + type, + pfn_notify, + user_data), + __SET_EVENT_CALLBACK_ERR); + } +#endif + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + static cl_int + waitForEvents(const VECTOR_CLASS& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); + } +}; + +#if defined(CL_VERSION_1_1) +/*! \brief Class interface for user events (a subset of cl_event's). + * + * See Event for details about copy semantics, etc. + */ +class UserEvent : public Event +{ +public: + /*! \brief Constructs a user event on a given context. + * + * Wraps clCreateUserEvent(). + */ + UserEvent( + const Context& context, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateUserEvent( + context(), + &error); + + detail::errHandler(error, __CREATE_USER_EVENT_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + UserEvent() : Event() { } + + //! \brief Copy constructor - performs shallow copy. + UserEvent(const UserEvent& event) : Event(event) { } + + //! \brief Assignment Operator - performs shallow copy. + UserEvent& operator = (const UserEvent& rhs) + { + if (this != &rhs) { + Event::operator=(rhs); + } + return *this; + } + + /*! \brief Sets the execution status of a user event object. + * + * Wraps clSetUserEventStatus(). + */ + cl_int setStatus(cl_int status) + { + return detail::errHandler( + ::clSetUserEventStatus(object_,status), + __SET_USER_EVENT_STATUS_ERR); + } +}; +#endif + +/*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ +inline static cl_int +WaitForEvents(const VECTOR_CLASS& events) +{ + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); +} + +/*! \brief Class interface for cl_mem. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_mem as the original. For details, see + * clRetainMemObject() and clReleaseMemObject(). + * + * \see cl_mem + */ +class Memory : public detail::Wrapper +{ +public: + + /*! \brief Destructor. + * + * This calls clReleaseMemObject() on the value held by this instance. + */ + ~Memory() {} + + //! \brief Default constructor - initializes to NULL. + Memory() : detail::Wrapper() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainMemObject() on the parameter's cl_mem. + */ + Memory(const Memory& memory) : detail::Wrapper(memory) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_mem + * into the new Memory object. + */ + __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper(memory) { } + + /*! \brief Assignment operator from Memory. + * + * This calls clRetainMemObject() on the parameter and clReleaseMemObject() + * on the previous value held by this instance. + */ + Memory& operator = (const Memory& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseMemObject() on the value previously held by this instance. + */ + Memory& operator = (const cl_mem& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_mem_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetMemObjectInfo, object_, name, param), + __GET_MEM_OBJECT_INFO_ERR); + } + + //! \brief Wrapper for clGetMemObjectInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_mem_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a callback function to be called when the memory object + * is no longer needed. + * + * Wraps clSetMemObjectDestructorCallback(). + * + * Repeated calls to this function, for a given cl_mem value, will append + * to the list of functions called (in reverse order) when memory object's + * resources are freed and the memory object is deleted. + * + * \note + * The registered callbacks are associated with the underlying cl_mem + * value - not the Memory class instance. + */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetMemObjectDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); + } +#endif + +}; + +// Pre-declare copy functions +class Buffer; +template< typename IteratorType > +cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); + +/*! \brief Class interface for Buffer Memory Objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Buffer : public Memory +{ +public: + + /*! \brief Constructs a Buffer in a specified context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + */ + Buffer( + const Context& context, + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators. + * If useHostPtr is specified iterators must be random access. + */ + template< typename IteratorType > + Buffer( + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr = false, + cl_int* err = NULL) + { + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + Context context = Context::getDefault(err); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if( !useHostPtr ) { + error = cl::copy(startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + } + + //! \brief Default constructor - initializes to NULL. + Buffer() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Buffer(const Buffer& buffer) : Memory(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } + + /*! \brief Assignment from Buffer - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const Buffer& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Creates a new buffer object from this. + * + * Wraps clCreateSubBuffer(). + */ + Buffer createSubBuffer( + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * err = NULL) + { + Buffer result; + cl_int error; + result.object_ = ::clCreateSubBuffer( + object_, + flags, + buffer_create_type, + buffer_create_info, + &error); + + detail::errHandler(error, __CREATE_SUBBUFFER_ERR); + if (err != NULL) { + *err = error; + } + + return result; + } +#endif +}; + +#if defined (USE_DX_INTEROP) +/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. + * + * This is provided to facilitate interoperability with Direct3D. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferD3D10 : public Buffer +{ +public: + typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, + cl_int* errcode_ret); + + /*! \brief Constructs a BufferD3D10, in a specified context, from a + * given ID3D10Buffer. + * + * Wraps clCreateFromD3D10BufferKHR(). + */ + BufferD3D10( + const Context& context, + cl_mem_flags flags, + ID3D10Buffer* bufobj, + cl_int * err = NULL) + { + static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; + +#if defined(CL_VERSION_1_2) + vector props = context.getInfo(); + cl_platform platform = -1; + for( int i = 0; i < props.size(); ++i ) { + if( props[i] == CL_CONTEXT_PLATFORM ) { + platform = props[i+1]; + } + } + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); +#endif + + cl_int error; + object_ = pfn_clCreateFromD3D10BufferKHR( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferD3D10() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferD3D10 - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const BufferD3D10& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } +}; +#endif + +/*! \brief Class interface for GL Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferGL : public Buffer +{ +public: + /*! \brief Constructs a BufferGL in a specified context, from a given + * GL buffer. + * + * Wraps clCreateFromGLBuffer(). + */ + BufferGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLBuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const BufferGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief Class interface for GL Render Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferRenderGL : public Buffer +{ +public: + /*! \brief Constructs a BufferRenderGL in a specified context, from a given + * GL Renderbuffer. + * + * Wraps clCreateFromGLRenderbuffer(). + */ + BufferRenderGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLRenderbuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferRenderGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const BufferRenderGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief C++ base class for Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image : public Memory +{ +protected: + //! \brief Default constructor - initializes to NULL. + Image() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image(const Image& image) : Memory(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } + + /*! \brief Assignment from Image - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const Image& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +public: + //! \brief Wrapper for clGetImageInfo(). + template + cl_int getImageInfo(cl_image_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetImageInfo, object_, name, param), + __GET_IMAGE_INFO_ERR); + } + + //! \brief Wrapper for clGetImageInfo() that returns by value. + template typename + detail::param_traits::param_type + getImageInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_image_info, name>::param_type param; + cl_int result = getImageInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +#if defined(CL_VERSION_1_2) +/*! \brief Class interface for 1D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image1D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image1D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image1D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image1D(const Image1D& image1D) : Image(image1D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } + + /*! \brief Assignment from Image1D - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const Image1D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +/*! \class Image1DBuffer + * \brief Image interface for 1D buffer images. + */ +class Image1DBuffer : public Image +{ +public: + Image1DBuffer( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + const Buffer &buffer, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = buffer(); + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + NULL, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DBuffer() { } + + Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } + + Image1DBuffer& operator = (const Image1DBuffer& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DBuffer& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +/*! \class Image1DArray + * \brief Image interface for arrays of 1D images. + */ +class Image1DArray : public Image +{ +public: + Image1DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t rowPitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_row_pitch = rowPitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DArray() { } + + Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image1DArray& operator = (const Image1DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + + +/*! \brief Class interface for 2D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image2D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image2D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t row_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage2D( + context(), flags,&format, width, height, row_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE2D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2D(const Image2D& image2D) : Image(image2D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { } + + /*! \brief Assignment from Image2D - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const Image2D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + + +#if !defined(CL_VERSION_1_2) +/*! \brief Class interface for GL 2D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D +{ +public: + /*! \brief Constructs an Image2DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture2D(). + */ + Image2DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture2D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); + if (err != NULL) { + *err = error; + } + + } + + //! \brief Default constructor - initializes to NULL. + Image2DGL() : Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL(const Image2DGL& image) : Image2D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { } + + /*! \brief Assignment from Image2DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const Image2DGL& rhs) + { + if (this != &rhs) { + Image2D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const cl_mem& rhs) + { + Image2D::operator=(rhs); + return *this; + } +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class Image2DArray + * \brief Image interface for arrays of 2D images. + */ +class Image2DArray : public Image +{ +public: + Image2DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t height, + ::size_t rowPitch, + ::size_t slicePitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = rowPitch; + desc.image_slice_pitch = slicePitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image2DArray() { } + + Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image2DArray& operator = (const Image2DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image2DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for 3D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3D : public Image +{ +public: + /*! \brief Constructs a 3D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image3D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t depth, + ::size_t row_pitch = 0, + ::size_t slice_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_width = width; + desc.image_height = height; + desc.image_depth = depth; + desc.image_row_pitch = row_pitch; + desc.image_slice_pitch = slice_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage3D( + context(), flags, &format, width, height, depth, row_pitch, + slice_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE3D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3D(const Image3D& image3D) : Image(image3D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } + + /*! \brief Assignment from Image3D - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const Image3D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +#if !defined(CL_VERSION_1_2) +/*! \brief Class interface for GL 3D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3DGL : public Image3D +{ +public: + /*! \brief Constructs an Image3DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture3D(). + */ + Image3DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture3D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image3DGL() : Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL(const Image3DGL& image) : Image3D(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { } + + /*! \brief Assignment from Image3DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const Image3DGL& rhs) + { + if (this != &rhs) { + Image3D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const cl_mem& rhs) + { + Image3D::operator=(rhs); + return *this; + } +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class ImageGL + * \brief general image interface for GL interop. + * We abstract the 2D and 3D GL images into a single instance here + * that wraps all GL sourced images on the grounds that setup information + * was performed by OpenCL anyway. + */ +class ImageGL : public Image +{ +public: + ImageGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); + if (err != NULL) { + *err = error; + } + } + + ImageGL() : Image() { } + + ImageGL(const ImageGL& image) : Image(image) { } + + __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { } + + ImageGL& operator = (const ImageGL& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + ImageGL& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for cl_sampler. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_sampler as the original. For details, see + * clRetainSampler() and clReleaseSampler(). + * + * \see cl_sampler + */ +class Sampler : public detail::Wrapper +{ +public: + /*! \brief Destructor. + * + * This calls clReleaseSampler() on the value held by this instance. + */ + ~Sampler() { } + + //! \brief Default constructor - initializes to NULL. + Sampler() { } + + /*! \brief Constructs a Sampler in a specified context. + * + * Wraps clCreateSampler(). + */ + + #if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + Sampler( + const Context& context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateSampler( + context(), + normalized_coords, + addressing_mode, + filter_mode, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_ERR); + if (err != NULL) { + *err = error; + } + } +#else + Sampler( + const Context& context, + const cl_sampler_properties *sampler_properties, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateSamplerWithProperties( + context(), + sampler_properties, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_PROPERTY_ERR); + if (err != NULL) { + *err = error; + } + } +#endif + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainSampler() on the parameter's cl_sampler. + */ + Sampler(const Sampler& sampler) : detail::Wrapper(sampler) { } + + /*! \brief Constructor from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_sampler + * into the new Sampler object. + */ + Sampler(const cl_sampler& sampler) : detail::Wrapper(sampler) { } + + /*! \brief Assignment operator from Sampler. + * + * This calls clRetainSampler() on the parameter and clReleaseSampler() + * on the previous value held by this instance. + */ + Sampler& operator = (const Sampler& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseSampler() on the value previously held by this instance. + */ + Sampler& operator = (const cl_sampler& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetSamplerInfo(). + template + cl_int getInfo(cl_sampler_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetSamplerInfo, object_, name, param), + __GET_SAMPLER_INFO_ERR); + } + + //! \brief Wrapper for clGetSamplerInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_sampler_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +class Program; +class CommandQueue; +class Kernel; + +//! \brief Class interface for specifying NDRange values. +class NDRange +{ +private: + size_t<3> sizes_; + cl_uint dimensions_; + +public: + //! \brief Default constructor - resulting range has zero dimensions. + NDRange() + : dimensions_(0) + { } + + //! \brief Constructs one-dimensional range. + NDRange(::size_t size0) + : dimensions_(1) + { + sizes_[0] = size0; + } + + //! \brief Constructs two-dimensional range. + NDRange(::size_t size0, ::size_t size1) + : dimensions_(2) + { + sizes_[0] = size0; + sizes_[1] = size1; + } + + //! \brief Constructs three-dimensional range. + NDRange(::size_t size0, ::size_t size1, ::size_t size2) + : dimensions_(3) + { + sizes_[0] = size0; + sizes_[1] = size1; + sizes_[2] = size2; + } + + /*! \brief Conversion operator to const ::size_t *. + * + * \returns a pointer to the size of the first dimension. + */ + operator const ::size_t*() const { + return (const ::size_t*) sizes_; + } + + //! \brief Queries the number of dimensions in the range. + ::size_t dimensions() const { return dimensions_; } +}; + +//! \brief A zero-dimensional range. +static const NDRange NullRange; + +//! \brief Local address wrapper for use with Kernel::setArg +struct LocalSpaceArg +{ + ::size_t size_; +}; + +namespace detail { + +template +struct KernelArgumentHandler +{ + static ::size_t size(const T&) { return sizeof(T); } + static T* ptr(T& value) { return &value; } +}; + +template <> +struct KernelArgumentHandler +{ + static ::size_t size(const LocalSpaceArg& value) { return value.size_; } + static void* ptr(LocalSpaceArg&) { return NULL; } +}; + +} +//! \endcond + +/*! __local + * \brief Helper function for generating LocalSpaceArg objects. + * Deprecated. Replaced with Local. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg +__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline LocalSpaceArg +__local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +/*! Local + * \brief Helper function for generating LocalSpaceArg objects. + */ +inline LocalSpaceArg +Local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +//class KernelFunctor; + +/*! \brief Class interface for cl_kernel. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_kernel as the original. For details, see + * clRetainKernel() and clReleaseKernel(). + * + * \see cl_kernel + */ +class Kernel : public detail::Wrapper +{ +public: + inline Kernel(const Program& program, const char* name, cl_int* err = NULL); + + /*! \brief Destructor. + * + * This calls clReleaseKernel() on the value held by this instance. + */ + ~Kernel() { } + + //! \brief Default constructor - initializes to NULL. + Kernel() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainKernel() on the parameter's cl_kernel. + */ + Kernel(const Kernel& kernel) : detail::Wrapper(kernel) { } + + /*! \brief Constructor from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_kernel + * into the new Kernel object. + */ + __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper(kernel) { } + + /*! \brief Assignment operator from Kernel. + * + * This calls clRetainKernel() on the parameter and clReleaseKernel() + * on the previous value held by this instance. + */ + Kernel& operator = (const Kernel& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseKernel() on the value previously held by this instance. + */ + Kernel& operator = (const cl_kernel& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_kernel_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelInfo, object_, name, param), + __GET_KERNEL_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_2) + template + cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), + __GET_KERNEL_ARG_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getArgInfo(cl_uint argIndex, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_arg_info, name>::param_type param; + cl_int result = getArgInfo(argIndex, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +#endif // #if defined(CL_VERSION_1_2) + + template + cl_int getWorkGroupInfo( + const Device& device, cl_kernel_work_group_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetKernelWorkGroupInfo, object_, device(), name, param), + __GET_KERNEL_WORK_GROUP_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getWorkGroupInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_work_group_info, name>::param_type param; + cl_int result = getWorkGroupInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template + cl_int setArg(cl_uint index, T value) + { + return detail::errHandler( + ::clSetKernelArg( + object_, + index, + detail::KernelArgumentHandler::size(value), + detail::KernelArgumentHandler::ptr(value)), + __SET_KERNEL_ARGS_ERR); + } + + cl_int setArg(cl_uint index, ::size_t size, void* argPtr) + { + return detail::errHandler( + ::clSetKernelArg(object_, index, size, argPtr), + __SET_KERNEL_ARGS_ERR); + } +}; + +/*! \class Program + * \brief Program interface that implements cl_program. + */ +class Program : public detail::Wrapper +{ +public: + typedef VECTOR_CLASS > Binaries; + typedef VECTOR_CLASS > Sources; + + Program( + const STRING_CLASS& source, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + Context context = Context::getDefault(err); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const STRING_CLASS& source, + bool build, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + Context context = Context::getDefault(err); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const Context& context, + const STRING_CLASS& source, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const Context& context, + const Sources& sources, + cl_int* err = NULL) + { + cl_int error; + + const ::size_t n = (::size_t)sources.size(); + ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t)); + const char** strings = (const char**) alloca(n * sizeof(const char*)); + + for (::size_t i = 0; i < n; ++i) { + strings[i] = sources[(int)i].first; + lengths[i] = sources[(int)i].second; + } + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)n, strings, lengths, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + if (err != NULL) { + *err = error; + } + } + + /** + * Construct a program object from a list of devices and a per-device list of binaries. + * \param context A valid OpenCL context in which to construct the program. + * \param devices A vector of OpenCL device objects for which the program will be created. + * \param binaries A vector of pairs of a pointer to a binary object and its length. + * \param binaryStatus An optional vector that on completion will be resized to + * match the size of binaries and filled with values to specify if each binary + * was successfully loaded. + * Set to CL_SUCCESS if the binary was successfully loaded. + * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. + * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. + * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: + * CL_INVALID_CONTEXT if context is not a valid context. + * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; + * or if any entry in binaries is NULL or has length 0. + * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. + * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. + * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. + */ + Program( + const Context& context, + const VECTOR_CLASS& devices, + const Binaries& binaries, + VECTOR_CLASS* binaryStatus = NULL, + cl_int* err = NULL) + { + cl_int error; + + const ::size_t numDevices = devices.size(); + + // Catch size mismatch early and return + if(binaries.size() != numDevices) { + error = CL_INVALID_VALUE; + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + return; + } + + ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t)); + const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char**)); + + for (::size_t i = 0; i < numDevices; ++i) { + images[i] = (const unsigned char*)binaries[i].first; + lengths[i] = binaries[(int)i].second; + } + + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + if(binaryStatus) { + binaryStatus->resize(numDevices); + } + + object_ = ::clCreateProgramWithBinary( + context(), (cl_uint) devices.size(), + deviceIDs, + lengths, images, binaryStatus != NULL + ? &binaryStatus->front() + : NULL, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + } + + +#if defined(CL_VERSION_1_2) + /** + * Create program using builtin kernels. + * \param kernelNames Semi-colon separated list of builtin kernel names + */ + Program( + const Context& context, + const VECTOR_CLASS& devices, + const STRING_CLASS& kernelNames, + cl_int* err = NULL) + { + cl_int error; + + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateProgramWithBuiltInKernels( + context(), + (cl_uint) devices.size(), + deviceIDs, + kernelNames.c_str(), + &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) + + Program() { } + + Program(const Program& program) : detail::Wrapper(program) { } + + __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper(program) { } + + Program& operator = (const Program& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + Program& operator = (const cl_program& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + cl_int build( + const VECTOR_CLASS& devices, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + return detail::errHandler( + ::clBuildProgram( + object_, + (cl_uint) + devices.size(), + deviceIDs, + options, + notifyFptr, + data), + __BUILD_PROGRAM_ERR); + } + + cl_int build( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + return detail::errHandler( + ::clBuildProgram( + object_, + 0, + NULL, + options, + notifyFptr, + data), + __BUILD_PROGRAM_ERR); + } + +#if defined(CL_VERSION_1_2) + cl_int compile( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + return detail::errHandler( + ::clCompileProgram( + object_, + 0, + NULL, + options, + 0, + NULL, + NULL, + notifyFptr, + data), + __COMPILE_PROGRAM_ERR); + } +#endif + + template + cl_int getInfo(cl_program_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetProgramInfo, object_, name, param), + __GET_PROGRAM_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template + cl_int getBuildInfo( + const Device& device, cl_program_build_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetProgramBuildInfo, object_, device(), name, param), + __GET_PROGRAM_BUILD_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getBuildInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_build_info, name>::param_type param; + cl_int result = getBuildInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int createKernels(VECTOR_CLASS* kernels) + { + cl_uint numKernels; + cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel)); + err = ::clCreateKernelsInProgram( + object_, numKernels, (cl_kernel*) value, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + kernels->assign(&value[0], &value[numKernels]); + return CL_SUCCESS; + } +}; + +#if defined(CL_VERSION_1_2) +inline Program linkProgram( + Program input1, + Program input2, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int err_local = CL_SUCCESS; + + cl_program programs[2] = { input1(), input2() }; + + Context ctx = input1.getInfo(); + + cl_program prog = ::clLinkProgram( + ctx(), + 0, + NULL, + options, + 2, + programs, + notifyFptr, + data, + &err_local); + + detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = err_local; + } + + return Program(prog); +} + +inline Program linkProgram( + VECTOR_CLASS inputPrograms, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int err_local = CL_SUCCESS; + + cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program)); + + if (programs != NULL) { + for (unsigned int i = 0; i < inputPrograms.size(); i++) { + programs[i] = inputPrograms[i](); + } + } + + cl_program prog = ::clLinkProgram( + Context::getDefault()(), + 0, + NULL, + options, + (cl_uint)inputPrograms.size(), + programs, + notifyFptr, + data, + &err_local); + + detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = err_local; + } + + return Program(prog); +} +#endif + +template<> +inline VECTOR_CLASS cl::Program::getInfo(cl_int* err) const +{ + VECTOR_CLASS< ::size_t> sizes = getInfo(); + VECTOR_CLASS binaries; + for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) + { + char *ptr = NULL; + if (*s != 0) + ptr = new char[*s]; + binaries.push_back(ptr); + } + + cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries); + if (err != NULL) { + *err = result; + } + return binaries; +} + +inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) +{ + cl_int error; + + object_ = ::clCreateKernel(program(), name, &error); + detail::errHandler(error, __CREATE_KERNEL_ERR); + + if (err != NULL) { + *err = error; + } + +} + +/*! \class CommandQueue + * \brief CommandQueue interface for cl_command_queue. + */ +class CommandQueue : public detail::Wrapper +{ +private: + static volatile int default_initialized_; + static CommandQueue default_; + static volatile cl_int default_error_; +public: +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + CommandQueue( + cl_command_queue_properties properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + } +#else + CommandQueue( + const cl_queue_properties *properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); + if (err != NULL) { + *err = error; + } + } + } +#endif + +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + CommandQueue( + const Context& context, + const Device& device, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#else + CommandQueue( + const Context& context, + const Device& device, + const cl_queue_properties *properties = NULL, + cl_int* err = NULL) + { + cl_int error; + + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); + if (err != NULL) { + *err = error; + } + } +#endif + + static CommandQueue getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while(default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + + Context context = Context::getDefault(&error); +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); +#else + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); +#endif + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + + default_ = CommandQueue(context, device, 0, &error); + +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); +#else + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); +#endif + if (err != NULL) { + *err = error; + } + } + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... + default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + CommandQueue() { } + + CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper(commandQueue) { } + + CommandQueue& operator = (const CommandQueue& rhs) + { + if (this != &rhs) { + detail::Wrapper::operator=(rhs); + } + return *this; + } + + CommandQueue& operator = (const cl_command_queue& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + template + cl_int getInfo(cl_command_queue_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetCommandQueueInfo, object_, name, param), + __GET_COMMAND_QUEUE_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_command_queue_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBuffer( + object_, src(), dst(), src_offset, dst_offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferRect( + object_, + src(), + dst(), + (const ::size_t *)src_origin, + (const ::size_t *)dst_origin, + (const ::size_t *)region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill a buffer object with a pattern + * of a given size. The pattern is specified a as vector. + * \tparam PatternType The datatype of the pattern field. + * The pattern type must be an accepted OpenCL data type. + */ + template + cl_int enqueueFillBuffer( + const Buffer& buffer, + PatternType pattern, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillBuffer( + object_, + buffer(), + static_cast(&pattern), + sizeof(PatternType), + offset, + size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImage( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *)dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA floating-point color value if + * the image channel data type is not an unnormalized signed or + * unsigned data type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_float4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA signed integer color value if + * the image channel data type is an unnormalized signed integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_int4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA unsigned integer color value if + * the image channel data type is an unnormalized unsigned integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_uint4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImageToBuffer( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *) region, dst_offset, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferToImage( + object_, src(), dst(), src_offset, + (const ::size_t *) dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapBuffer( + object_, buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + void* enqueueMapImage( + const Image& buffer, + cl_bool blocking, + cl_map_flags flags, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t * row_pitch, + ::size_t * slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapImage( + object_, buffer(), blocking, flags, + (const ::size_t *) origin, (const ::size_t *) region, + row_pitch, slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + object_, memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueues a marker command which waits for either a list of events to complete, + * or all previously enqueued commands to complete. + * + * Enqueues a marker command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command returns an event which can be waited on, + * i.e. this event can be waited on to insure that all events either in the event_wait_list + * or all previously enqueued commands, queued before this command to command_queue, + * have completed. + */ + cl_int enqueueMarkerWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarkerWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MARKER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * A synchronization point that enqueues a barrier operation. + * + * Enqueues a barrier command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command blocks command execution, that is, any + * following commands enqueued after it do not execute until it completes. This command + * returns an event which can be waited on, i.e. this event can be waited on to insure that + * all events either in the event_wait_list or all previously enqueued commands, queued + * before this command to command_queue, have completed. + */ + cl_int enqueueBarrierWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueBarrierWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_BARRIER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command to indicate with which device a set of memory objects + * should be associated. + */ + cl_int enqueueMigrateMemObjects( + const VECTOR_CLASS &memObjects, + cl_mem_migration_flags flags, + const VECTOR_CLASS* events = NULL, + Event* event = NULL + ) + { + cl_event tmp; + + cl_mem* localMemObjects = static_cast(alloca(memObjects.size() * sizeof(cl_mem))); + for( int i = 0; i < (int)memObjects.size(); ++i ) { + localMemObjects[i] = memObjects[i](); + } + + + cl_int err = detail::errHandler( + ::clEnqueueMigrateMemObjects( + object_, + (cl_uint)memObjects.size(), + static_cast(localMemObjects), + flags, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueNDRangeKernel( + const Kernel& kernel, + const NDRange& offset, + const NDRange& global, + const NDRange& local = NullRange, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNDRangeKernel( + object_, kernel(), (cl_uint) global.dimensions(), + offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, + (const ::size_t*) global, + local.dimensions() != 0 ? (const ::size_t*) local : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NDRANGE_KERNEL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + cl_int enqueueTask( + const Kernel& kernel, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueTask( + object_, kernel(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_TASK_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif + + cl_int enqueueNativeKernel( + void (CL_CALLBACK *userFptr)(void *), + std::pair args, + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* mem_locs = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) + ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; + + if (mems != NULL) { + for (unsigned int i = 0; i < mem_objects->size(); i++) { + mems[i] = ((*mem_objects)[i])(); + } + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNativeKernel( + object_, userFptr, args.first, args.second, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + mems, + (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NATIVE_KERNEL); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueMarker(object_, (cl_event*) event), + __ENQUEUE_MARKER_ERR); + } + + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const VECTOR_CLASS& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueWaitForEvents( + object_, + (cl_uint) events.size(), + (const cl_event*) &events.front()), + __ENQUEUE_WAIT_FOR_EVENTS_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int enqueueAcquireGLObjects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueAcquireGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseGLObjects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReleaseGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined (USE_DX_INTEROP) +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); + + cl_int enqueueAcquireD3D10Objects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR); +#endif + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueAcquireD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseD3D10Objects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_2) +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_1) + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueReleaseD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueBarrier(object_), + __ENQUEUE_BARRIER_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int flush() const + { + return detail::errHandler(::clFlush(object_), __FLUSH_ERR); + } + + cl_int finish() const + { + return detail::errHandler(::clFinish(object_), __FINISH_ERR); + } +}; + +#ifdef _WIN32 +__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__declspec(selectany) CommandQueue CommandQueue::default_; +__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#else +__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__attribute__((weak)) CommandQueue CommandQueue::default_; +__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#endif + +inline cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + void * result = ::clEnqueueMapBuffer( + queue(), buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; +} + +inline cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (error != CL_SUCCESS) { + return error; + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + queue(), memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; +} + +inline cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event); +} + +/** + * Blocking copy operation between iterators and a buffer. + */ +template< typename IteratorType > +inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + ::size_t length = endIterator-startIterator; + ::size_t byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } +#if defined(_MSC_VER) + std::copy( + startIterator, + endIterator, + stdext::checked_array_iterator( + pointer, length)); +#else + std::copy(startIterator, endIterator, pointer); +#endif + Event endEvent; + error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + +/** + * Blocking copy operation between iterators and a buffer. + */ +template< typename IteratorType > +inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + ::size_t length = endIterator-startIterator; + ::size_t byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } + std::copy(pointer, pointer + length, startIterator); + Event endEvent; + error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + +#if defined(CL_VERSION_1_1) +inline cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferRect( + src, + dst, + src_origin, + dst_origin, + region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + events, + event); +} +#endif + +inline cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImage( + src, + dst, + src_origin, + dst_origin, + region, + events, + event); +} + +inline cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImageToBuffer( + src, + dst, + src_origin, + region, + dst_offset, + events, + event); +} + +inline cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferToImage( + src, + dst, + src_offset, + dst_origin, + region, + events, + event); +} + + +inline cl_int flush(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.flush(); +} + +inline cl_int finish(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + + return queue.finish(); +} + +// Kernel Functor support +// New interface as of September 2011 +// Requires the C++11 std::tr1::function (note do not support TR1) +// Visual Studio 2010 and GCC 4.2 + +struct EnqueueArgs +{ + CommandQueue queue_; + const NDRange offset_; + const NDRange global_; + const NDRange local_; + VECTOR_CLASS events_; + + EnqueueArgs(NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(Event e, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } +}; + +namespace detail { + +class NullType {}; + +template +struct SetArg +{ + static void set (Kernel kernel, T0 arg) + { + kernel.setArg(index, arg); + } +}; + +template +struct SetArg +{ + static void set (Kernel, NullType) + { + } +}; + +template < + typename T0, typename T1, typename T2, typename T3, + typename T4, typename T5, typename T6, typename T7, + typename T8, typename T9, typename T10, typename T11, + typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23, + typename T24, typename T25, typename T26, typename T27, + typename T28, typename T29, typename T30, typename T31 +> +class KernelFunctorGlobal +{ +private: + Kernel kernel_; + +public: + KernelFunctorGlobal( + Kernel kernel) : + kernel_(kernel) + {} + + KernelFunctorGlobal( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + kernel_(program, name.c_str(), err) + {} + + Event operator() ( + const EnqueueArgs& args, + T0 t0, + T1 t1 = NullType(), + T2 t2 = NullType(), + T3 t3 = NullType(), + T4 t4 = NullType(), + T5 t5 = NullType(), + T6 t6 = NullType(), + T7 t7 = NullType(), + T8 t8 = NullType(), + T9 t9 = NullType(), + T10 t10 = NullType(), + T11 t11 = NullType(), + T12 t12 = NullType(), + T13 t13 = NullType(), + T14 t14 = NullType(), + T15 t15 = NullType(), + T16 t16 = NullType(), + T17 t17 = NullType(), + T18 t18 = NullType(), + T19 t19 = NullType(), + T20 t20 = NullType(), + T21 t21 = NullType(), + T22 t22 = NullType(), + T23 t23 = NullType(), + T24 t24 = NullType(), + T25 t25 = NullType(), + T26 t26 = NullType(), + T27 t27 = NullType(), + T28 t28 = NullType(), + T29 t29 = NullType(), + T30 t30 = NullType(), + T31 t31 = NullType() + ) + { + Event event; + SetArg<0, T0>::set(kernel_, t0); + SetArg<1, T1>::set(kernel_, t1); + SetArg<2, T2>::set(kernel_, t2); + SetArg<3, T3>::set(kernel_, t3); + SetArg<4, T4>::set(kernel_, t4); + SetArg<5, T5>::set(kernel_, t5); + SetArg<6, T6>::set(kernel_, t6); + SetArg<7, T7>::set(kernel_, t7); + SetArg<8, T8>::set(kernel_, t8); + SetArg<9, T9>::set(kernel_, t9); + SetArg<10, T10>::set(kernel_, t10); + SetArg<11, T11>::set(kernel_, t11); + SetArg<12, T12>::set(kernel_, t12); + SetArg<13, T13>::set(kernel_, t13); + SetArg<14, T14>::set(kernel_, t14); + SetArg<15, T15>::set(kernel_, t15); + SetArg<16, T16>::set(kernel_, t16); + SetArg<17, T17>::set(kernel_, t17); + SetArg<18, T18>::set(kernel_, t18); + SetArg<19, T19>::set(kernel_, t19); + SetArg<20, T20>::set(kernel_, t20); + SetArg<21, T21>::set(kernel_, t21); + SetArg<22, T22>::set(kernel_, t22); + SetArg<23, T23>::set(kernel_, t23); + SetArg<24, T24>::set(kernel_, t24); + SetArg<25, T25>::set(kernel_, t25); + SetArg<26, T26>::set(kernel_, t26); + SetArg<27, T27>::set(kernel_, t27); + SetArg<28, T28>::set(kernel_, t28); + SetArg<29, T29>::set(kernel_, t29); + SetArg<30, T30>::set(kernel_, t30); + SetArg<31, T31>::set(kernel_, t31); + + args.queue_.enqueueNDRangeKernel( + kernel_, + args.offset_, + args.global_, + args.local_, + &args.events_, + &event); + + return event; + } + +}; + +//------------------------------------------------------------------------------------------------------ + + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30, + typename T31> +struct functionImplementation_ +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30, + T31 arg31) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30, + arg31); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3); + } + + +}; + +template< + typename T0, + typename T1, + typename T2> +struct functionImplementation_ +< T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2); + } + + +}; + +template< + typename T0, + typename T1> +struct functionImplementation_ +< T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1) + { + return functor_( + enqueueArgs, + arg0, + arg1); + } + + +}; + +template< + typename T0> +struct functionImplementation_ +< T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0) + { + return functor_( + enqueueArgs, + arg0); + } + + +}; + + + + + +} // namespace detail + +//---------------------------------------------------------------------------------------------- + +template < + typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, + typename T3 = detail::NullType, typename T4 = detail::NullType, + typename T5 = detail::NullType, typename T6 = detail::NullType, + typename T7 = detail::NullType, typename T8 = detail::NullType, + typename T9 = detail::NullType, typename T10 = detail::NullType, + typename T11 = detail::NullType, typename T12 = detail::NullType, + typename T13 = detail::NullType, typename T14 = detail::NullType, + typename T15 = detail::NullType, typename T16 = detail::NullType, + typename T17 = detail::NullType, typename T18 = detail::NullType, + typename T19 = detail::NullType, typename T20 = detail::NullType, + typename T21 = detail::NullType, typename T22 = detail::NullType, + typename T23 = detail::NullType, typename T24 = detail::NullType, + typename T25 = detail::NullType, typename T26 = detail::NullType, + typename T27 = detail::NullType, typename T28 = detail::NullType, + typename T29 = detail::NullType, typename T30 = detail::NullType, + typename T31 = detail::NullType +> +struct make_kernel : + public detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > +{ +public: + typedef detail::KernelFunctorGlobal< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > FunctorType; + + make_kernel( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(program, name, err)) + {} + + make_kernel( + const Kernel kernel) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(kernel)) + {} +}; + + +//---------------------------------------------------------------------------------------------------------------------- + +#undef __ERR_STR +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR + +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR + +#undef __CREATE_BUFFER_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_SAMPLER_ERR +#undef __CREATE_SAMPLER_PROPERTY_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR + +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef __SET_EVENT_CALLBACK_ERR +#undef __SET_PRINTF_CALLBACK_ERR + +#undef __WAIT_FOR_EVENTS_ERR + +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR + +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __CREATE_COMMAND_QUEUE_PROPERTY_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef __ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_TASK_ERR +#undef __ENQUEUE_NATIVE_KERNEL + +#undef __CL_EXPLICIT_CONSTRUCTORS + +#undef __UNLOAD_COMPILER_ERR +#endif //__CL_USER_OVERRIDE_ERROR_STRINGS + +#undef __CL_FUNCTION_TYPE + +// Extensions +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_VERSION_1_1) +#undef __INIT_CL_EXT_FCN_PTR +#endif // #if defined(CL_VERSION_1_1) +#undef __CREATE_SUB_DEVICES + +#if defined(USE_CL_DEVICE_FISSION) +#undef __PARAM_NAME_DEVICE_FISSION +#endif // USE_CL_DEVICE_FISSION + +#undef __DEFAULT_NOT_INITIALIZED +#undef __DEFAULT_BEING_INITIALIZED +#undef __DEFAULT_INITIALIZED + +} // namespace cl + +#ifdef _WIN32 +#pragma pop_macro("max") +#endif // _WIN32 + +#endif // CL_HPP_ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h new file mode 100644 index 0000000000000000000000000000000000000000..6fb3721f12533925ba6ca3737eb48bd864aae444 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_egl.h @@ -0,0 +1,123 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_EGL_H +#define __OPENCL_CL_EGL_H + +#ifdef __APPLE__ +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E + +/* Error type for clCreateFromEGLImageKHR */ +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 + +/* CLeglImageKHR is an opaque handle to an EGLImage */ +typedef void* CLeglImageKHR; + +/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ +typedef void* CLeglDisplayKHR; + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + +/* properties passed to clCreateFromEGLImageKHR */ +typedef intptr_t cl_egl_image_properties_khr; + + +#define cl_khr_egl_image 1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR(cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +#define cl_khr_egl_event 1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromEGLSyncKHR(cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EGL_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h new file mode 100644 index 0000000000000000000000000000000000000000..4e390d197fd1ef1b36b17c0bcabbabbac7a14d66 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl.h @@ -0,0 +1,154 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#define CL_GL_NUM_SAMPLES 0x2012 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context context, + cl_mem_flags flags, + cl_GLuint renderbuffer, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem memobj, + cl_gl_object_type * gl_object_type, + cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem memobj, + cl_gl_texture_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ \ No newline at end of file diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..83102e3435e93eaa34c99efae3680c913ba3990e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_gl_ext.h @@ -0,0 +1,44 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include +#else + #include +#endif + +/* + * cl_khr_gl_event extension + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context context, + cl_GLsync sync, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h new file mode 100644 index 0000000000000000000000000000000000000000..0fba9080bd6cf87a1614c1378a1b9b151efbd21e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/cl_platform.h @@ -0,0 +1,1414 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +/* + * Deprecation flags refer to the last version of the header in which the + * feature was not deprecated. + * + * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without + * deprecation but is deprecated in versions later than 1.1. + */ +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif + + + +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + #define CL_API_SUFFIX__VERSION_2_0 + #define CL_EXT_SUFFIX__VERSION_2_0 + #define CL_API_SUFFIX__VERSION_2_1 + #define CL_EXT_SUFFIX__VERSION_2_1 + #define CL_API_SUFFIX__VERSION_2_2 + #define CL_EXT_SUFFIX__VERSION_2_2 + #define CL_API_SUFFIX__VERSION_3_0 + #define CL_EXT_SUFFIX__VERSION_3_0 + #define CL_API_SUFFIX__EXPERIMENTAL + #define CL_EXT_SUFFIX__EXPERIMENTAL + + #ifdef __GNUC__ + #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX_DEPRECATED + #elif defined(_WIN32) + #define CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated) + #else + #define CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS + #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 1.7976931348623158e+308 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short; +typedef uint16_t cl_ushort; +typedef int32_t cl_int; +typedef uint32_t cl_uint; +typedef int64_t cl_long; +typedef uint64_t cl_ulong; + +typedef uint16_t cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #if !defined(__clang__) + #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ + #endif + typedef __vector unsigned char __cl_uchar16; + typedef __vector signed char __cl_char16; + typedef __vector unsigned short __cl_ushort8; + typedef __vector signed short __cl_short8; + typedef __vector unsigned int __cl_uint4; + typedef __vector signed int __cl_int4; + typedef __vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__) + #if _MSC_VER >= 1500 + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. */ + #define __CL_HAS_ANON_STRUCT__ 1 + #define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) + #endif +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + + +/* ---- cl_halfn ---- */ +typedef union +{ + cl_half CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2; +#endif +}cl_half2; + +typedef union +{ + cl_half CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[2]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4; +#endif +}cl_half4; + +/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */ +typedef cl_half4 cl_half3; + +typedef union +{ + cl_half CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[4]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[2]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8; +#endif +}cl_half8; + +typedef union +{ + cl_half CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[8]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[4]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8[2]; +#endif +#if defined( __CL_HALF16__ ) + __cl_half16 v16; +#endif +}cl_half16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__) + #if _MSC_VER >=1500 + #pragma warning( pop ) + #endif +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h new file mode 100644 index 0000000000000000000000000000000000000000..bd7fa6be57b8df26003c414041683340bcb95404 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/CL/opencl.h @@ -0,0 +1,40 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ +#include +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e6abf443f9177781b402b0ca90365fb9462e316 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h new file mode 100644 index 0000000000000000000000000000000000000000..5247c40807f0dd36a886513ab1bff5d2977364db --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/builtin_types.h @@ -0,0 +1,64 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "device_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "driver_types.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "surface_types.h" +#include "texture_types.h" +#include "vector_types.h" diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h new file mode 100644 index 0000000000000000000000000000000000000000..1d61d2974a4f2b527d04baf586c73c2af86358f4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/channel_descriptor.h @@ -0,0 +1,595 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CHANNEL_DESCRIPTOR_H__) +#define __CHANNEL_DESCRIPTOR_H__ + +#if defined(__cplusplus) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * + * @{ + */ + +/** + * \brief \hl Returns a channel descriptor using the specified format + * + * Returns a channel descriptor with format \p f and number of bits of each + * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is + * defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + * \endcode + * + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat, + * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2, + * ::cudaChannelFormatKindSignedNormalized8X4, + * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2, + * ::cudaChannelFormatKindUnsignedNormalized8X4, + * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2, + * ::cudaChannelFormatKindSignedNormalized16X4, + * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2, + * ::cudaChannelFormatKindUnsignedNormalized16X4 + * or ::cudaChannelFormatKindNV12. + * + * The format is specified by the template specialization. + * + * The template function specializes for the following scalar types: + * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float. + * The template function specializes for the following vector types: + * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}. + * The template function specializes for following cudaChannelFormatKind enum values: + * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12. + * + * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone + * + * \return + * Channel descriptor with format \p f + * + * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)", + * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)" + */ +template __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(char) * 8; + +#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__) + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#if !defined(__LP64__) + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#endif /* !__LP64__ */ + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void) +{ + int e = (int)sizeof(char) * 8; + + return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12); +} + +template __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); +} + +/* Signed 8-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4); +} + +/* Unsigned 8-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4); +} + +/* Signed 16-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4); +} + +/* Unsigned 16-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4); +} + +/* NV12 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12); +} + +/* BC1 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1); +} + +/* BC1sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB); +} + +/* BC2 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2); +} + +/* BC2sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB); +} + +/* BC3 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3); +} + +/* BC3sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB); +} + +/* BC4 unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4); +} + +/* BC4 signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4); +} + +/* BC5 unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5); +} + +/* BC5 signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5); +} + +/* BC6H unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H); +} + +/* BC6H signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H); +} + +/* BC7 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7); +} + +/* BC7sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB); +} + +#endif /* __cplusplus */ + +/** @} */ +/** @} */ /* END CUDART_TEXTURE_HL */ + +#endif /* !__CHANNEL_DESCRIPTOR_H__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h new file mode 100644 index 0000000000000000000000000000000000000000..c8823fb79bc78e513861f9e7d3e671bf0effc6b7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups.h @@ -0,0 +1,1828 @@ +/* + * Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_H_ +#define _COOPERATIVE_GROUPS_H_ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#include "cooperative_groups/details/info.h" +#include "cooperative_groups/details/driver_abi.h" +#include "cooperative_groups/details/helpers.h" + +#if defined(_CG_HAS_STL_ATOMICS) +#include +#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope; +#else +#define _CG_THREAD_SCOPE(scope) +#endif + +_CG_BEGIN_NAMESPACE + +namespace details { + _CG_CONST_DECL unsigned int coalesced_group_id = 1; + _CG_CONST_DECL unsigned int multi_grid_group_id = 2; + _CG_CONST_DECL unsigned int grid_group_id = 3; + _CG_CONST_DECL unsigned int thread_block_id = 4; + _CG_CONST_DECL unsigned int multi_tile_group_id = 5; + _CG_CONST_DECL unsigned int cluster_group_id = 6; +} + +/** + * class thread_group; + * + * Generic thread group type, into which all groups are convertible. + * It acts as a container for all storage necessary for the derived groups, + * and will dispatch the API calls to the correct derived group. This means + * that all derived groups must implement the same interface as thread_group. + */ +class thread_group +{ +protected: + struct group_data { + unsigned int _unused : 1; + unsigned int type : 7, : 0; + }; + + struct gg_data { + details::grid_workspace *gridWs; + }; + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + struct mg_data { + unsigned long long _unused : 1; + unsigned long long type : 7; + unsigned long long handle : 56; + const details::multi_grid::multi_grid_functions *functions; + }; +#endif + + struct tg_data { + unsigned int is_tiled : 1; + unsigned int type : 7; + unsigned int size : 24; + // packed to 4b + unsigned int metaGroupSize : 16; + unsigned int metaGroupRank : 16; + // packed to 8b + unsigned int mask; + // packed to 12b + unsigned int _res; + }; + + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend class thread_block; + + union __align__(8) { + group_data group; + tg_data coalesced; + gg_data grid; +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + mg_data multi_grid; +#endif + } _data; + + _CG_QUALIFIER thread_group operator=(const thread_group& src); + + _CG_QUALIFIER thread_group(unsigned int type) { + _data.group.type = type; + _data.group._unused = false; + } + +#ifdef _CG_CPP11_FEATURES + static_assert(sizeof(tg_data) <= 16, "Failed size check"); + static_assert(sizeof(gg_data) <= 16, "Failed size check"); +# ifdef _CG_ABI_EXPERIMENTAL + static_assert(sizeof(mg_data) <= 16, "Failed size check"); +# endif +#endif + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device) + + _CG_QUALIFIER unsigned long long size() const; + _CG_QUALIFIER unsigned long long num_threads() const; + _CG_QUALIFIER unsigned long long thread_rank() const; + _CG_QUALIFIER void sync() const; + _CG_QUALIFIER unsigned int get_type() const { + return _data.group.type; + } + +}; + +template +struct thread_group_base : public thread_group { + _CG_QUALIFIER thread_group_base() : thread_group(TyId) {} + _CG_STATIC_CONST_DECL unsigned int id = TyId; +}; + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + +/** + * class multi_grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same system, on multiple devices within the same launched kernels. + * To use this group, the kernel must have been launched with + * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_multi_grid(); + */ + + +# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +class multi_grid_group; + +// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls +template +__device__ _CG_DEPRECATED multi_grid_group this_multi_grid(); + +class multi_grid_group : public thread_group_base +{ +private: + template + _CG_QUALIFIER multi_grid_group() { + _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics(); + _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle(); + } + + friend multi_grid_group this_multi_grid(); + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system) + + _CG_QUALIFIER bool is_valid() const { + return (_data.multi_grid.handle != 0); + } + + _CG_QUALIFIER void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + _data.multi_grid.functions->sync(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned long long num_threads() const { + _CG_ASSERT(is_valid()); + return _data.multi_grid.functions->size(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned long long size() const { + return num_threads(); + } + + _CG_QUALIFIER unsigned long long thread_rank() const { + _CG_ASSERT(is_valid()); + return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle)); + } + + _CG_QUALIFIER unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle)); + } +}; +# else +class multi_grid_group +{ +private: + unsigned long long _handle; + unsigned int _size; + unsigned int _rank; + + friend _CG_QUALIFIER multi_grid_group this_multi_grid(); + + _CG_QUALIFIER multi_grid_group() { + _handle = details::multi_grid::get_intrinsic_handle(); + _size = details::multi_grid::size(_handle); + _rank = details::multi_grid::thread_rank(_handle); + } + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system) + + _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const { + return (_handle != 0); + } + + _CG_QUALIFIER _CG_DEPRECATED void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + details::multi_grid::sync(_handle); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const { + _CG_ASSERT(is_valid()); + return _size; + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const { + return num_threads(); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const { + _CG_ASSERT(is_valid()); + return _rank; + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (details::multi_grid::grid_rank(_handle)); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (details::multi_grid::num_grids(_handle)); + } +}; +# endif + +/** + * multi_grid_group this_multi_grid() + * + * Constructs a multi_grid_group + */ +# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +template +__device__ +#else +_CG_QUALIFIER +# endif +_CG_DEPRECATED +multi_grid_group this_multi_grid() +{ + return multi_grid_group(); +} +#endif + +/** + * class grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same device within the same launched kernel. To use this group, the kernel + * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_grid(); + */ +class grid_group : public thread_group_base +{ + _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id; + friend _CG_QUALIFIER grid_group this_grid(); + +private: + _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) { + _data.grid.gridWs = gridWs; + } + + public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device) + + _CG_QUALIFIER bool is_valid() const { + return (_data.grid.gridWs != NULL); + } + + _CG_QUALIFIER void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + details::grid::sync(&_data.grid.gridWs->barrier); + } + + _CG_STATIC_QUALIFIER unsigned long long size() { + return details::grid::size(); + } + + _CG_STATIC_QUALIFIER unsigned long long thread_rank() { + return details::grid::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 group_dim() { + return details::grid::grid_dim(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_threads() { + return details::grid::num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() { + return details::grid::dim_blocks(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_blocks() { + return details::grid::num_blocks(); + } + + _CG_STATIC_QUALIFIER dim3 block_index() { + return details::grid::block_index(); + } + + _CG_STATIC_QUALIFIER unsigned long long block_rank() { + return details::grid::block_rank(); + } + +# if defined(_CG_HAS_CLUSTER_GROUP) + _CG_STATIC_QUALIFIER dim3 dim_clusters() { + return details::grid::dim_clusters(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_clusters() { + return details::grid::num_clusters(); + } + + _CG_STATIC_QUALIFIER dim3 cluster_index() { + return details::grid::cluster_index(); + } + + _CG_STATIC_QUALIFIER unsigned long long cluster_rank() { + return details::grid::cluster_rank(); + } +# endif +}; + +_CG_QUALIFIER grid_group this_grid() { + // Load a workspace from the driver + grid_group gg(details::get_grid_workspace()); +#ifdef _CG_DEBUG + // *all* threads must be available to synchronize + gg.sync(); +#endif // _CG_DEBUG + return gg; +} + +#if defined(_CG_HAS_CLUSTER_GROUP) +/** + * class cluster_group + * + * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly + * divided along all dimensions to form groups of blocks, each group of which is + * a block cluster. Clustered grids are subject to various restrictions and + * limitations. Primarily, a cluster consists of at most 8 blocks by default + * (although the user is allowed to opt-in to non-standard sizes,) and clustered + * grids are subject to additional occupancy limitations due to per-cluster + * hardware resource consumption. In exchange, a block cluster is guaranteed to + * be a cooperative group, with access to all cooperative group capabilities, as + * well as cluster specific capabilities and accelerations. A cluster_group + * represents a block cluster. + * + * Constructed via this_cluster_group(); + */ +class cluster_group : public thread_group_base +{ + // Friends + friend _CG_QUALIFIER cluster_group this_cluster(); + + // Disable constructor + _CG_QUALIFIER cluster_group() + { + } + + public: + //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster) + + // Functionality exposed by the group + _CG_STATIC_QUALIFIER void sync() + { + return details::cluster::sync(); + } + + _CG_STATIC_QUALIFIER void barrier_arrive() + { + return details::cluster::barrier_arrive(); + } + + _CG_STATIC_QUALIFIER void barrier_wait() + { + return details::cluster::barrier_wait(); + } + + _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) + { + return details::cluster::query_shared_rank(addr); + } + + template + _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) + { + return details::cluster::map_shared_rank(addr, rank); + } + + _CG_STATIC_QUALIFIER dim3 block_index() + { + return details::cluster::block_index(); + } + + _CG_STATIC_QUALIFIER unsigned int block_rank() + { + return details::cluster::block_rank(); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return details::cluster::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() + { + return details::cluster::dim_blocks(); + } + + _CG_STATIC_QUALIFIER unsigned int num_blocks() + { + return details::cluster::num_blocks(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + return details::cluster::dim_threads(); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return details::cluster::num_threads(); + } + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned int size() + { + return num_threads(); + } +}; + +/* + * cluster_group this_cluster() + * + * Constructs a cluster_group + */ +_CG_QUALIFIER cluster_group this_cluster() +{ + cluster_group cg; +#ifdef _CG_DEBUG + cg.sync(); +#endif + return cg; +} +#endif + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +namespace details { + + _CG_CONSTEXPR_QUALIFIER unsigned int scratch_sync_memory_size(unsigned int max_block_size) { + // One barrier per possible size of the group rounded up to multiple of 4. + return 8 * sizeof(details::barrier_t); + } + + _CG_CONSTEXPR_QUALIFIER unsigned int scratch_collectives_memory_size(unsigned int communication_size, unsigned int max_block_size) { + // One slot of collectives memory per warp. + return max_block_size / 32 * communication_size; + } + + _CG_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int communication_size, unsigned int max_block_size) { + return scratch_sync_memory_size(max_block_size) + scratch_collectives_memory_size(communication_size, max_block_size); + } + + _CG_CONSTEXPR_QUALIFIER size_t scratch_alignment(unsigned int communication_size) { + return ((communication_size & (communication_size - 1) == 0) && communication_size > 8) ? + communication_size : 8; + } + + _CG_CONST_DECL unsigned int default_tile_communication_size = 8; + _CG_CONST_DECL unsigned int default_max_block_size = 1024; + + struct multi_warp_scratch { + char memory[1]; + }; +} + +class thread_block; +namespace experimental { + template + struct __align__(details::scratch_alignment(TileCommunicationSize)) block_tile_memory { + private: + char scratch[details::scratch_size_needed(TileCommunicationSize, MaxBlockSize)]; + + public: + _CG_QUALIFIER void* get_memory() { + return static_cast(scratch); + } + + _CG_STATIC_QUALIFIER unsigned int get_size() { + return details::scratch_size_needed(TileCommunicationSize, MaxBlockSize); + } + }; + + template + _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory& scratch); +} +#endif + +/** + * class thread_block + * + * Every GPU kernel is executed by a grid of thread blocks, and threads within + * each block are guaranteed to reside on the same streaming multiprocessor. + * A thread_block represents a thread block whose dimensions are not known until runtime. + * + * Constructed via this_thread_block(); + */ +class thread_block : public thread_group_base +{ + // Friends + friend _CG_QUALIFIER thread_block this_thread_block(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz); + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + template + friend _CG_QUALIFIER thread_block experimental::this_thread_block( + experimental::block_tile_memory& scratch); + + const unsigned short communication_size; + const unsigned short max_block_size; + details::multi_warp_scratch* const tile_memory; + + template + friend class __static_size_multi_warp_tile_base; + + template + _CG_QUALIFIER thread_block(experimental::block_tile_memory& scratch) : + tile_memory(reinterpret_cast(&scratch)), + communication_size(TileCommunicationSize), max_block_size(MaxBlockSize) { + if (thread_rank() < details::scratch_sync_memory_size(MaxBlockSize) / sizeof(details::barrier_t)) { + details::barrier_t* barriers = reinterpret_cast(&tile_memory->memory); + barriers[thread_rank()] = 0; + } + sync(); + } +#endif + + // Disable constructor + _CG_QUALIFIER thread_block() +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + : tile_memory(NULL), communication_size(0), max_block_size(0) +#endif + { } + + // Internal Use + _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + details::abort(); + return (thread_block()); + } + + unsigned int mask; + unsigned int base_offset = thread_rank() & (~(tilesz - 1)); + unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); + + mask = (unsigned int)(-1) >> (32 - masklength); + mask <<= (details::laneid() & ~(tilesz - 1)); + thread_group tile = thread_group(details::coalesced_group_id); + tile._data.coalesced.mask = mask; + tile._data.coalesced.size = __popc(mask); + tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz; + tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz; + tile._data.coalesced.is_tiled = true; + return (tile); + } + + public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id; + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + _CG_STATIC_QUALIFIER void sync() { + details::cta::sync(); + } + + _CG_STATIC_QUALIFIER unsigned int size() { + return details::cta::size(); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() { + return details::cta::thread_rank(); + } + + // Additional functionality exposed by the group + _CG_STATIC_QUALIFIER dim3 group_index() { + return details::cta::group_index(); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() { + return details::cta::thread_index(); + } + + _CG_STATIC_QUALIFIER dim3 group_dim() { + return details::cta::block_dim(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() { + return details::cta::dim_threads(); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() { + return details::cta::num_threads(); + } + +}; + +/** + * thread_block this_thread_block() + * + * Constructs a thread_block group + */ +_CG_QUALIFIER thread_block this_thread_block() +{ + return (thread_block()); +} + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +namespace experimental { + template + _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory& scratch) { + return (thread_block(scratch)); + } +} +#endif + +/** + * class coalesced_group + * + * A group representing the current set of converged threads in a warp. + * The size of the group is not guaranteed and it may return a group of + * only one thread (itself). + * + * This group exposes warp-synchronous builtins. + * Constructed via coalesced_threads(); + */ +class coalesced_group : public thread_group_base +{ +private: + friend _CG_QUALIFIER coalesced_group coalesced_threads(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz); + friend class details::_coalesced_group_data_access; + + _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const { + unsigned int member_pack = 0; + unsigned int member_rank = 0; + for (int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (laneMask & lane_bit) + member_pack |= 1 << member_rank; + member_rank++; + } + } + return (member_pack); + } + + // Internal Use + _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + details::abort(); + return (coalesced_group(0)); + } + if (size() <= tilesz) { + return (*this); + } + + if ((_data.coalesced.is_tiled == true) && pow2_tilesz) { + unsigned int base_offset = (thread_rank() & (~(tilesz - 1))); + unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); + unsigned int mask = (unsigned int)(-1) >> (32 - masklength); + + mask <<= (details::laneid() & ~(tilesz - 1)); + coalesced_group coalesced_tile = coalesced_group(mask); + coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz; + coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz; + coalesced_tile._data.coalesced.is_tiled = true; + return (coalesced_tile); + } + else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) { + unsigned int mask = 0; + unsigned int member_rank = 0; + int seen_lanes = (thread_rank() / tilesz) * tilesz; + for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (seen_lanes <= 0 && member_rank < tilesz) { + mask |= lane_bit; + member_rank++; + } + seen_lanes--; + } + } + coalesced_group coalesced_tile = coalesced_group(mask); + // Override parent with the size of this group + coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz; + coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz; + return coalesced_tile; + } + else { + // None in _CG_VERSION 1000 + details::abort(); + } + + return (coalesced_group(0)); + } + + protected: + _CG_QUALIFIER coalesced_group(unsigned int mask) { + _data.coalesced.mask = mask; + _data.coalesced.size = __popc(mask); + _data.coalesced.metaGroupRank = 0; + _data.coalesced.metaGroupSize = 1; + _data.coalesced.is_tiled = false; + } + + _CG_QUALIFIER unsigned int get_mask() const { + return (_data.coalesced.mask); + } + + public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id; + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + _CG_QUALIFIER unsigned int num_threads() const { + return _data.coalesced.size; + } + + _CG_QUALIFIER unsigned int size() const { + return num_threads(); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + return (__popc(_data.coalesced.mask & details::lanemask32_lt())); + } + + // Rank of this group in the upper level of the hierarchy + _CG_QUALIFIER unsigned int meta_group_rank() const { + return _data.coalesced.metaGroupRank; + } + + // Total num partitions created out of all CTAs when the group was created + _CG_QUALIFIER unsigned int meta_group_size() const { + return _data.coalesced.metaGroupSize; + } + + _CG_QUALIFIER void sync() const { + __syncwarp(_data.coalesced.mask); + } + +#ifdef _CG_CPP11_FEATURES + template > + _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { + unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 : + (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1)); + + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), _data.coalesced.mask, lane, 32); + } + + template > + _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const { + if (size() == 32) { + return details::tile::shuffle_dispatch::shfl_down( + _CG_STL_NAMESPACE::forward(elem), 0xFFFFFFFF, delta, 32); + } + + unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); + + if (lane >= 32) + lane = details::laneid(); + + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), _data.coalesced.mask, lane, 32); + } + + template > + _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const { + if (size() == 32) { + return details::tile::shuffle_dispatch::shfl_up( + _CG_STL_NAMESPACE::forward(elem), 0xFFFFFFFF, delta, 32); + } + + unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); + if (lane >= 32) + lane = details::laneid(); + + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), _data.coalesced.mask, lane, 32); + } +#else + template + _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const { + details::assert_if_not_arithmetic(); + unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : + (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } + + template + _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32)); + } + unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); + if (lane >= 32) lane = details::laneid(); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } + + template + _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32)); + } + unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); + if (lane >= 32) lane = details::laneid(); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } +#endif + + _CG_QUALIFIER int any(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) != 0); + } + _CG_QUALIFIER int all(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + if (size() == 32) { + return (__ballot_sync(0xFFFFFFFF, predicate)); + } + unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate); + return (_packLanes(lane_ballot)); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + + template + _CG_QUALIFIER unsigned int match_any(TyIntegral val) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__match_any_sync(0xFFFFFFFF, val)); + } + unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); + return (_packLanes(lane_match)); + } + + template + _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const { + details::assert_if_not_arithmetic(); + if (size() == 32) { + return (__match_all_sync(0xFFFFFFFF, val, &pred)); + } + unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); + return (_packLanes(lane_match)); + } + +#endif /* !_CG_HAS_MATCH_COLLECTIVE */ + +}; + +_CG_QUALIFIER coalesced_group coalesced_threads() +{ + return (coalesced_group(__activemask())); +} + +namespace details { + template struct verify_thread_block_tile_size; + template <> struct verify_thread_block_tile_size<32> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<16> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<8> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<4> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<2> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<1> { typedef void OK; }; + +#ifdef _CG_CPP11_FEATURES + template + using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant; + + template + using _is_single_warp = _CG_STL_NAMESPACE::integral_constant; + template + using _is_multi_warp = + _CG_STL_NAMESPACE::integral_constant 32) && (Size <= 1024)>; + + template + using _is_valid_single_warp_tile = + _CG_STL_NAMESPACE::integral_constant::value && _is_single_warp::value>; + template + using _is_valid_multi_warp_tile = + _CG_STL_NAMESPACE::integral_constant::value && _is_multi_warp::value>; +#else + template + struct _is_multi_warp { + static const bool value = false; + }; +#endif +} + +template +class __static_size_tile_base +{ +protected: + _CG_STATIC_CONST_DECL unsigned int numThreads = Size; + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + // Rank of thread within tile + _CG_STATIC_QUALIFIER unsigned int thread_rank() { + return (details::cta::thread_rank() & (numThreads - 1)); + } + + // Number of threads within tile + _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() { + return numThreads; + } + + _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() { + return num_threads(); + } +}; + +template +class __static_size_thread_block_tile_base : public __static_size_tile_base +{ + friend class details::_coalesced_group_data_access; + typedef details::tile::tile_helpers th; + +#ifdef _CG_CPP11_FEATURES + static_assert(details::_is_valid_single_warp_tile::value, "Size must be one of 1/2/4/8/16/32"); +#else + typedef typename details::verify_thread_block_tile_size::OK valid; +#endif + using __static_size_tile_base::numThreads; + _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF; + + protected: + _CG_STATIC_QUALIFIER unsigned int build_mask() { + unsigned int mask = fullMask; + if (numThreads != 32) { + // [0,31] representing the current active thread in the warp + unsigned int laneId = details::laneid(); + // shift mask according to the partition it belongs to + mask = th::tileMask << (laneId & ~(th::laneMask)); + } + return (mask); + } + +public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id; + + _CG_STATIC_QUALIFIER void sync() { + __syncwarp(build_mask()); + } + +#ifdef _CG_CPP11_FEATURES + // PTX supported collectives + template > + _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(elem), build_mask(), srcRank, numThreads); + } + + template > + _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const { + return details::tile::shuffle_dispatch::shfl_down( + _CG_STL_NAMESPACE::forward(elem), build_mask(), delta, numThreads); + } + + template > + _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const { + return details::tile::shuffle_dispatch::shfl_up( + _CG_STL_NAMESPACE::forward(elem), build_mask(), delta, numThreads); + } + + template > + _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const { + return details::tile::shuffle_dispatch::shfl_xor( + _CG_STL_NAMESPACE::forward(elem), build_mask(), laneMask, numThreads); + } +#else + template + _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const { + details::assert_if_not_arithmetic(); + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + + template + _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const { + details::assert_if_not_arithmetic(); + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + + template + _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const { + details::assert_if_not_arithmetic(); + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + + template + _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const { + details::assert_if_not_arithmetic(); + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } +#endif //_CG_CPP11_FEATURES + + _CG_QUALIFIER int any(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot != 0); + } + _CG_QUALIFIER int all(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot == build_mask()); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot >> (details::laneid() & (~(th::laneMask)))); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + template + _CG_QUALIFIER unsigned int match_any(TyIntegral val) const { + details::assert_if_not_arithmetic(); + unsigned int lane_match = __match_any_sync(build_mask(), val); + return (lane_match >> (details::laneid() & (~(th::laneMask)))); + } + + template + _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const { + details::assert_if_not_arithmetic(); + unsigned int lane_match = __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (details::laneid() & (~(th::laneMask)))); + } +#endif + +}; + +template +class __static_parent_thread_block_tile_base +{ +public: + // Rank of this group in the upper level of the hierarchy + _CG_STATIC_QUALIFIER unsigned int meta_group_rank() { + return ParentT::thread_rank() / Size; + } + + // Total num partitions created out of all CTAs when the group was created + _CG_STATIC_QUALIFIER unsigned int meta_group_size() { + return (ParentT::size() + Size - 1) / Size; + } +}; + +/** + * class thread_block_tile + * + * Statically-sized group type, representing one tile of a thread block. + * The only specializations currently supported are those with native + * hardware support (1/2/4/8/16/32) + * + * This group exposes warp-synchronous builtins. + * Can only be constructed via tiled_partition(ParentT&) + */ + +template +class __single_warp_thread_block_tile : + public __static_size_thread_block_tile_base, + public __static_parent_thread_block_tile_base +{ + typedef __static_parent_thread_block_tile_base staticParentBaseT; + friend class details::_coalesced_group_data_access; + +protected: + _CG_QUALIFIER __single_warp_thread_block_tile() { }; + _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { }; + + _CG_STATIC_QUALIFIER unsigned int get_mask() { + return __static_size_thread_block_tile_base::build_mask(); + } +}; + +template +class __single_warp_thread_block_tile : + public __static_size_thread_block_tile_base, + public thread_group_base +{ + _CG_STATIC_CONST_DECL unsigned int numThreads = Size; + + template friend class __single_warp_thread_block_tile; + friend class details::_coalesced_group_data_access; + + typedef __static_size_thread_block_tile_base staticSizeBaseT; + +protected: + _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) { + _data.coalesced.mask = staticSizeBaseT::build_mask(); + _data.coalesced.size = numThreads; + _data.coalesced.metaGroupRank = meta_group_rank; + _data.coalesced.metaGroupSize = meta_group_size; + _data.coalesced.is_tiled = true; + } + + _CG_QUALIFIER unsigned int get_mask() const { + return (_data.coalesced.mask); + } + +public: + using staticSizeBaseT::sync; + using staticSizeBaseT::size; + using staticSizeBaseT::num_threads; + using staticSizeBaseT::thread_rank; + + _CG_QUALIFIER unsigned int meta_group_rank() const { + return _data.coalesced.metaGroupRank; + } + + _CG_QUALIFIER unsigned int meta_group_size() const { + return _data.coalesced.metaGroupSize; + } +}; + +/** + * Outer level API calls + * void sync(GroupT) - see .sync() + * void thread_rank(GroupT) - see .thread_rank() + * void group_size(GroupT) - see .size() + */ +template +_CG_QUALIFIER void sync(GroupT const &g) +{ + g.sync(); +} + +// TODO: Use a static dispatch to determine appropriate return type +// C++03 is stuck with unsigned long long for now +#ifdef _CG_CPP11_FEATURES +template +_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) { + return g.thread_rank(); +} + + +template +_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) { + return g.num_threads(); +} +#else +template +_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) { + return static_cast(g.thread_rank()); +} + + +template +_CG_QUALIFIER unsigned long long group_size(GroupT const &g) { + return static_cast(g.num_threads()); +} +#endif + + +/** + * tiled_partition + * + * The tiled_partition(parent, tilesz) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will + * be created where threads having identical k = (thread_rank(parent)/tilesz) + * will be members of the same subgroup. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to power-of-two sized subgorup instances of at most + * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be + * tiled_partition() in _CG_VERSION 1000. + */ +_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz) +{ + if (parent.get_type() == details::coalesced_group_id) { + const coalesced_group *_cg = static_cast(&parent); + return _cg->_get_tiled_threads(tilesz); + } + else { + const thread_block *_tb = static_cast(&parent); + return _tb->_get_tiled_threads(tilesz); + } +} + +// Thread block type overload: returns a basic thread_group for now (may be specialized later) +_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +// Coalesced group type overload: retains its ability to stay coalesced +_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +namespace details { + template + class internal_thread_block_tile : public __single_warp_thread_block_tile {}; + + template + _CG_QUALIFIER internal_thread_block_tile tiled_partition_internal() { + return internal_thread_block_tile(); + } + + template + _CG_QUALIFIER TyVal multi_warp_collectives_helper( + const GroupT& group, + WarpLambda warp_lambda, + InterWarpLambda inter_warp_lambda) { + return group.template collectives_scheme(warp_lambda, inter_warp_lambda); + } + + template + _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) { + return group.template get_scratch_location(warp_id); + } + + template + _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) { + return group.get_sync_location(); + } + +} +/** + * tiled_partition + * + * The tiled_partition(parent) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)/tilesz) subgroups will be created, + * therefore the parent group size must be evenly divisible by the tilesz. + * The allow parent groups are thread_block or thread_block_tile. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to native hardware sizes, 1/2/4/8/16/32. + * The size(parent) must be greater than the template Size parameter + * otherwise the results are undefined. + */ + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +template +class __static_size_multi_warp_tile_base : public __static_size_tile_base +{ + static_assert(details::_is_valid_multi_warp_tile::value, "Size must be one of 64/128/256/512"); + + template + friend TyVal details::multi_warp_collectives_helper( + const GroupT& group, + WarpLambda warp_lambda, + InterWarpLambda inter_warp_lambda); + template + friend T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id); + template + friend details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group); + template + friend class __static_size_multi_warp_tile_base; + using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base>; + using ThisType = __static_size_multi_warp_tile_base; + _CG_STATIC_CONST_DECL int numWarps = Size / 32; + const unsigned short communication_size; + const unsigned short max_block_size; + +protected: + details::multi_warp_scratch* const tile_memory; + + template + _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : + tile_memory(g.tile_memory), communication_size(g.communication_size), max_block_size(g.max_block_size) {} + + +private: + _CG_QUALIFIER details::barrier_t* get_sync_location() const { + // Different group sizes use different barriers, all groups of a given size share one barrier. + unsigned int sync_id = details::log2(Size / 64); + return &(reinterpret_cast(tile_memory->memory)[sync_id]); + } + + template + _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const { + unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size); + unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id; + return reinterpret_cast(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]); + } + + template + _CG_QUALIFIER T* get_scratch_location() const { + unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size); + unsigned int scratch_id = details::cta::thread_rank() / 32; + return reinterpret_cast(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]); + } + + template + _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const { + unsigned int src_warp = src / 32; + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + + // Get warp slot of the source threads warp. + TyVal* warp_scratch_location = get_scratch_location(src_warp); + + if (warp.meta_group_rank() == src_warp) { + // Put shuffled value into my warp slot and let my warp arrive at the barrier. + if (thread_rank() == src) { + *warp_scratch_location = val; + } + details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps); + TyVal result = *warp_scratch_location; + details::sync_warps_wait(sync_location, details::cta::thread_rank()); + return result; + } + else { + // Wait for the source warp to arrive on the barrier. + details::sync_warps_wait_for_warps( + (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp), + sync_location, details::cta::thread_rank(), + numWarps); + TyVal result = *warp_scratch_location; + details::sync_warps(sync_location, details::cta::thread_rank(), numWarps); + return result; + } + } + + template + _CG_QUALIFIER TyVal shfl_iterative_impl(TyVal val, unsigned int src) const { + auto warp = details::tiled_partition_internal<32, ThisType>(); + + details::copy_channel broadcast_channel{ + get_scratch_location(0), + get_sync_location(), + (size_t) communication_size * numWarps}; + + if (warp.meta_group_rank() == src / 32) { + val = warp.shfl(val, src); + broadcast_channel.template send_value< + TyVal, 32, decltype(broadcast_channel)::send_many_to_many>( + val, warp.thread_rank(), details::cta::thread_rank() / 32); + } + else { + broadcast_channel.template receive_value(val, warp.thread_rank() == 0); + } + sync(); + return val; + } + + template + _CG_QUALIFIER TyVal collectives_scheme_impl(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const { + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + TyVal* warp_scratch_location = get_scratch_location(); + + warp_lambda(warp, warp_scratch_location); + + if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) { + auto subwarp = details::tiled_partition_internal(); + if (subwarp.meta_group_rank() == 0) { + TyVal* thread_scratch_location = get_scratch_location(subwarp.thread_rank()); + inter_warp_lambda(subwarp, thread_scratch_location); + } + warp.sync(); + details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps); + } + TyVal result = *warp_scratch_location; + warp.sync(); // Added warpsync, if all collectives do sync before writing to reduce_location (they does right now), + // we could delete it. + return result; + } + + template + _CG_QUALIFIER TyVal collectives_scheme_iterative_impl( + const WarpLambda& warp_lambda, + const InterWarpLambda& inter_warp_lambda) const { + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + details::copy_channel final_result_channel{ + get_scratch_location(0), + sync_location, + (size_t) communication_size * numWarps}; + + TyVal warp_result; + warp_lambda(warp, &warp_result); + + if (warp.meta_group_rank() == 0) { + auto subwarp = details::tiled_partition_internal(); + details::copy_channel partial_results_channel{ + get_scratch_location(subwarp.thread_rank()), + sync_location, + (size_t) communication_size}; + + // Thread 0 in subwarp set as inactive to not overwrite warp 0 warp_result. + partial_results_channel.template receive_value( + warp_result, + warp.thread_rank() == 0, + subwarp.thread_rank() != 0 && subwarp.meta_group_rank() == 0); + if (subwarp.meta_group_rank() == 0) { + inter_warp_lambda(subwarp, &warp_result); + } + warp_result = warp.shfl(warp_result, 0); + final_result_channel.template send_value( + warp_result, + warp.thread_rank(), + details::cta::thread_rank() / 32); + } + else { + details::copy_channel partial_results_channel{get_scratch_location(), sync_location, (size_t) communication_size}; + partial_results_channel.template send_value( + warp_result, + warp.thread_rank(), + (details::cta::thread_rank() - thread_rank()) / 32); + final_result_channel.template receive_value(warp_result, warp.thread_rank() == 0); + } + sync(); + return warp_result; + } + + template + _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const { + if (sizeof(TyVal) > communication_size) { + return collectives_scheme_iterative_impl(warp_lambda, inter_warp_lambda); + } + else { + return collectives_scheme_impl(warp_lambda, inter_warp_lambda); + } + } + +public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id; + + using __static_size_tile_base::thread_rank; + + template + _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const { + if (sizeof(TyVal) > communication_size) { + return shfl_iterative_impl(val, src); + } + else { + return shfl_impl(val, src); + } + } + + _CG_QUALIFIER void sync() const { + details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps); + } + + _CG_QUALIFIER int any(int predicate) const { + auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) { + *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate); + }; + auto inter_warp_lambda = + [] (details::internal_thread_block_tile& subwarp, int* thread_scratch_location) { + *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); + }; + return collectives_scheme(warp_lambda, inter_warp_lambda); + } + + _CG_QUALIFIER int all(int predicate) const { + auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) { + *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate); + }; + auto inter_warp_lambda = + [] (details::internal_thread_block_tile& subwarp, int* thread_scratch_location) { + *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); + }; + return collectives_scheme(warp_lambda, inter_warp_lambda); + } +}; + + +template +class __multi_warp_thread_block_tile : + public __static_size_multi_warp_tile_base, + public __static_parent_thread_block_tile_base +{ + typedef __static_parent_thread_block_tile_base staticParentBaseT; + typedef __static_size_multi_warp_tile_base staticTileBaseT; +protected: + _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) : + __static_size_multi_warp_tile_base(g) {} +}; + +template +class __multi_warp_thread_block_tile : public __static_size_multi_warp_tile_base +{ + const unsigned int metaGroupRank; + const unsigned int metaGroupSize; + +protected: + template + _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile& g) : + __static_size_multi_warp_tile_base(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {} + +public: + _CG_QUALIFIER unsigned int meta_group_rank() const { + return metaGroupRank; + } + + _CG_QUALIFIER unsigned int meta_group_size() const { + return metaGroupSize; + } +}; +#endif + +template +class thread_block_tile; + +namespace details { + template + class thread_block_tile_impl; + + template + class thread_block_tile_impl: public __single_warp_thread_block_tile + { + protected: + template + _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl& g) : + __single_warp_thread_block_tile(g.meta_group_rank(), g.meta_group_size()) {} + + _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) : + __single_warp_thread_block_tile() {} + }; + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + template + class thread_block_tile_impl : public __multi_warp_thread_block_tile + { + protected: + template + _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) : + __multi_warp_thread_block_tile(g) {} + }; +#else + template + class thread_block_tile_impl + { + protected: + template + _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {} + }; +#endif +} + +template +class thread_block_tile : public details::thread_block_tile_impl::value> +{ + friend _CG_QUALIFIER thread_block_tile<1, void> this_thread(); + +protected: + _CG_QUALIFIER thread_block_tile(const ParentT& g) : + details::thread_block_tile_impl::value>(g) {} + +public: + _CG_QUALIFIER operator thread_block_tile() const { + return thread_block_tile(*this); + } +}; + +template +class thread_block_tile : public details::thread_block_tile_impl::value> +{ + template + friend class thread_block_tile; + +protected: + template + _CG_QUALIFIER thread_block_tile(const thread_block_tile& g) : + details::thread_block_tile_impl::value>(g) {} + +public: + template + _CG_QUALIFIER thread_block_tile(const thread_block_tile& g) : + details::thread_block_tile_impl::value>(g) {} +}; + +namespace details { + template + struct tiled_partition_impl; + + template + struct tiled_partition_impl : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(const thread_block& g) : + thread_block_tile(g) {} + }; + + // ParentT = static thread_block_tile specialization + template + struct tiled_partition_impl > : + public thread_block_tile > { +#ifdef _CG_CPP11_FEATURES + static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size"); +#endif + _CG_QUALIFIER tiled_partition_impl(const thread_block_tile& g) : + thread_block_tile >(g) {} + }; + +} + +namespace experimental { + template + _CG_QUALIFIER thread_block_tile tiled_partition(const ParentT& g) + { +#if defined(_CG_CPP11_FEATURES) && !defined(_CG_ABI_EXPERIMENTAL) + static_assert(details::_is_single_warp::value, "_CG_ABI_EXPERIMENTAL needs to be defined" + " before cooperative_groups header is included to enable experimental features"); +#endif + return details::tiled_partition_impl(g); + } + +} + +template +_CG_QUALIFIER thread_block_tile tiled_partition(const ParentT& g) +{ +#ifdef _CG_CPP11_FEATURES + static_assert(details::_is_single_warp::value, "Tiled partition with Size > 32 is supported only by" + " cooperative_groups::experimental::tiled_partition available with experimental features enabled"); +#endif + return details::tiled_partition_impl(g); +} + +/** + * thread_group this_thread() + * + * Constructs a generic thread_group containing only the calling thread + */ +_CG_QUALIFIER thread_block_tile<1, void> this_thread() +{ + // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its + // meta group rank and size set to 0 and 1 respectively. + return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block()); +} + +/** + * .sync() + * + * Executes a barrier across the group + * + * Implements both a compiler fence and an architectural fence to prevent, + * memory reordering around the barrier. + */ +_CG_QUALIFIER void thread_group::sync() const +{ + switch (_data.group.type) { + case details::coalesced_group_id: + cooperative_groups::sync(*static_cast(this)); + break; + case details::thread_block_id: + cooperative_groups::sync(*static_cast(this)); + break; + case details::grid_group_id: + cooperative_groups::sync(*static_cast(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + cooperative_groups::sync(*static_cast(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + cooperative_groups::sync(*static_cast(this)); + break; +#endif + default: + break; + } +} + +/** + * .size() + * + * Returns the total number of threads in the group. + */ +_CG_QUALIFIER unsigned long long thread_group::size() const +{ + unsigned long long size = 0; + switch (_data.group.type) { + case details::coalesced_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; + case details::thread_block_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; + case details::grid_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + size = cooperative_groups::group_size(*static_cast(this)); + break; +#endif + default: + break; + } + return size; +} + +/** + * .thread_rank() + * + * Returns the linearized rank of the calling thread along the interval [0, size()). + */ +_CG_QUALIFIER unsigned long long thread_group::thread_rank() const +{ + unsigned long long rank = 0; + switch (_data.group.type) { + case details::coalesced_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; + case details::thread_block_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; + case details::grid_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + rank = cooperative_groups::thread_rank(*static_cast(this)); + break; +#endif + default: + break; + } + return rank; +} + +_CG_END_NAMESPACE + +#include + +# endif /* ! (__cplusplus, __CUDACC__) */ + +#endif /* !_COOPERATIVE_GROUPS_H_ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..c3722fb5c22809027cee66ab05758e477e8ef2bf --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_reduce.h @@ -0,0 +1,108 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_COALESCED_REDUCE_H_ +#define _CG_COALESCED_REDUCE_H_ + +#include "info.h" +#include "helpers.h" +#include "cooperative_groups.h" +#include "partitioning.h" +#include "coalesced_scan.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + +template +_CG_QUALIFIER auto coalesced_reduce_to_one(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + if (group.size() == 32) { + auto out = val; + for (int offset = group.size() >> 1; offset > 0; offset >>= 1) { + out = op(out, group.shfl_up(out, offset)); + } + return out; + } + else { + auto scan_result = + inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + return scan_result; + } +} + +template +_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + auto out = coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + if (group.size() == 32) { + return group.shfl(out, 31); + } + else { + unsigned int group_mask = _coalesced_group_data_access::get_mask(group); + unsigned int last_thread_id = 31 - __clz(group_mask); + return details::tile::shuffle_dispatch::shfl( + _CG_STL_NAMESPACE::forward(out), group_mask, last_thread_id, 32); + } +} + +template +_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile& group, + TyVal&& val, + TyOp&& op) -> decltype(op(val, val)) { + auto out = val; + for (int mask = TySize >> 1; mask > 0; mask >>= 1) { + out = op(out, group.shfl_xor(out, mask)); + } + + return out; +} + +} // details + +_CG_END_NAMESPACE + +#endif // _CG_COALESCED_REDUCE_H_ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h new file mode 100644 index 0000000000000000000000000000000000000000..ca17d08ad8407af2ea8967e53a8bdc4b59079c77 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/functional.h @@ -0,0 +1,207 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_FUNCTIONAL_H +#define _CG_FUNCTIONAL_H + +#include "info.h" +#include "helpers.h" + +#ifdef _CG_CPP11_FEATURES +#ifdef _CG_USE_CUDA_STL +# include +#endif + +_CG_BEGIN_NAMESPACE + +namespace details { +#ifdef _CG_USE_CUDA_STL + using cuda::std::plus; + using cuda::std::bit_and; + using cuda::std::bit_xor; + using cuda::std::bit_or; +#else + template struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}}; + template struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}}; + template struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}}; + template struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}}; +#endif // _CG_USE_PLATFORM_STL +} // details + +template +struct plus : public details::plus {}; + +template +struct less { + __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const { + return (arg2 < arg1) ? arg2 : arg1; + } +}; + +template +struct greater { + __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const { + return (arg1 < arg2) ? arg2 : arg1; + } +}; + +template +struct bit_and : public details::bit_and {}; + +template +struct bit_xor : public details::bit_xor {}; + +template +struct bit_or : public details::bit_or {}; + +#if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL) +namespace details { + template + using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>; + + template struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + template struct _atomic_op_supported> : public _atomic_is_type_supported {}; + + template + _CG_QUALIFIER remove_qual atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) { + remove_qual old = atomic; + while(!atomic.compare_exchange_weak(old, op(old, val))); + return old; + } + + template + struct op_picker; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_add(val); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_min(val); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_max(val); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_and(val); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_xor(val); + } + }; + + template + struct op_picker> { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_or(val); + } + }; + + template + struct atomic_update_dispatch {}; + + template<> + struct atomic_update_dispatch { + template + _CG_STATIC_QUALIFIER remove_qual atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) { + return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + } + }; + + template<> + struct atomic_update_dispatch { + template + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) { + using dispatch = op_picker>; + + return dispatch::atomic_update(atomic, val); + } + }; + + template + _CG_QUALIFIER remove_qual atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) { + using dispatch = atomic_update_dispatch<_atomic_op_supported>::value>; + + return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward(val), _CG_STL_NAMESPACE::forward(op)); + } +} +#endif + +_CG_END_NAMESPACE + +#endif +#endif //_CG_FUNCTIONAL_H diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..3e4f98b4eb59c68ed61c929b7b7982f9985b12f7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/helpers.h @@ -0,0 +1,707 @@ + /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_HELPERS_H_ +# define _COOPERATIVE_GROUPS_HELPERS_H_ + +#include "info.h" +#include "sync.h" + +_CG_BEGIN_NAMESPACE + +namespace details { +#ifdef _CG_CPP11_FEATURES + template struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point {}; +# ifdef _CG_HAS_FP16_COLLECTIVE + template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {}; + template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {}; +# endif + template + using is_float_or_half = _is_float_or_half::type>; + + // Non-STL utility templates + template + using remove_qual = typename _CG_STL_NAMESPACE::remove_cv::type>::type; + + template + using is_op_type_same = _CG_STL_NAMESPACE::is_same, remove_qual + >; +#endif + + template + _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) { + return ((TyTrunc)index.z * nIndex.y * nIndex.x) + + ((TyTrunc)index.y * nIndex.x) + + (TyTrunc)index.x; + } + + namespace cta { + + _CG_STATIC_QUALIFIER void sync() + { + __barrier_sync(0); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return static_cast(blockDim.x * blockDim.y * blockDim.z); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return vec3_to_linear(threadIdx, blockDim); + } + + _CG_STATIC_QUALIFIER dim3 group_index() + { + return dim3(blockIdx.x, blockIdx.y, blockIdx.z); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + return dim3(threadIdx.x, threadIdx.y, threadIdx.z); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + return dim3(blockDim.x, blockDim.y, blockDim.z); + } + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned int size() + { + return num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 block_dim() + { + return dim_threads(); + } + + }; + + class _coalesced_group_data_access { + public: + // Retrieve mask of coalesced groups + template + _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) { + return group.get_mask(); + } + + // Retrieve mask of tiles + template