| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #ifndef __cuda_occupancy_h__ |
| | #define __cuda_occupancy_h__ |
| |
|
| | #include <stddef.h> |
| | #include <limits.h> |
| | #include <string.h> |
| |
|
| |
|
| | |
| | |
| | #ifdef __CUDACC__ |
| | #define __OCC_INLINE inline __host__ __device__ |
| | #elif defined _MSC_VER |
| | #define __OCC_INLINE __inline |
| | #else |
| | #define __OCC_INLINE inline |
| | #endif |
| |
|
// Error codes returned by the occupancy calculator functions in this header.
enum cudaOccError_enum {
    CUDA_OCC_SUCCESS              = 0,  // no error encountered
    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // an input parameter is invalid
    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2,  // the device's compute capability is not recognized
};
typedef enum cudaOccError_enum cudaOccError;
| |
|
// Forward typedefs for the occupancy calculator's input/output structures,
// all defined later in this header.
typedef struct cudaOccResult cudaOccResult;
typedef struct cudaOccDeviceProp cudaOccDeviceProp;
typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
typedef struct cudaOccDeviceState cudaOccDeviceState;
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Computes into *result the maximum number of blocks of the described kernel
// that can be resident on one multiprocessor at once, for the given block
// size and per-block dynamic shared memory request. Per-resource limits are
// also recorded in *result (see struct cudaOccResult below).
static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int blockSize,
    size_t dynamicSmemSize);
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Suggests a launch configuration: *blockSize receives a block size intended
// to maximize occupancy, *minGridSize the smallest grid that can reach that
// occupancy. The dynamic shared memory need is supplied either via the
// blockSizeToDynamicSMemSize callback (per candidate block size) or as the
// constant dynamicSMemSize — NOTE(review): which argument wins when both are
// provided is decided by the implementation; confirm there before relying on it.
static __OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    size_t (*blockSizeToDynamicSMemSize)(int),
    size_t dynamicSMemSize);
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
#if defined(__cplusplus)
namespace {

// C++ convenience overload: constant dynamic shared memory per block,
// defaulting to 0.
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    size_t dynamicSMemSize = 0);

// C++ overload for kernels whose dynamic shared memory varies with block
// size; blockSizeToDynamicSMemSize is any callable taking the candidate
// block size and returning the bytes of dynamic shared memory it needs.
template <typename UnaryFunction>
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    UnaryFunction blockSizeToDynamicSMemSize);

}  // namespace
#endif
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Computes into *dynamicSmemSize the dynamic shared memory available to each
// block of the described kernel when numBlocks blocks of blockSize threads
// are resident on a multiprocessor.
static __OCC_INLINE
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
    size_t *dynamicSmemSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int numBlocks,
    int blockSize);
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
// Device description consumed by the occupancy calculator. The templated
// constructor copies the identically-named fields out of any
// cudaDeviceProp-like structure.
struct cudaOccDeviceProp {
    int computeMajor;                  // major compute capability
    int computeMinor;                  // minor compute capability

    int maxThreadsPerBlock;            // maximum threads in one block
    int maxThreadsPerMultiprocessor;   // maximum resident threads per SM

    int regsPerBlock;                  // registers available to a single block
    int regsPerMultiprocessor;         // registers available per SM
    int warpSize;                      // threads per warp
    size_t sharedMemPerBlock;          // default per-block shared memory limit
    size_t sharedMemPerMultiprocessor; // shared memory available per SM
    int numSms;                        // multiprocessor count
    size_t sharedMemPerBlockOptin;     // opt-in (maximum) per-block shared memory limit
    size_t reservedSharedMemPerBlock;  // shared memory reserved per block by the system

#ifdef __cplusplus
    // Converting constructor: extracts the fields above from a structure
    // using the cudaDeviceProp field names (e.g. cudaDeviceProp itself).
    template<typename DeviceProp>
    __OCC_INLINE
    cudaOccDeviceProp(const DeviceProp &props)
    :   computeMajor                (props.major),
        computeMinor                (props.minor),
        maxThreadsPerBlock          (props.maxThreadsPerBlock),
        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
        regsPerBlock                (props.regsPerBlock),
        regsPerMultiprocessor       (props.regsPerMultiprocessor),
        warpSize                    (props.warpSize),
        sharedMemPerBlock           (props.sharedMemPerBlock),
        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
        numSms                      (props.multiProcessorCount),
        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
    {}

    // Zero-initializing default constructor.
    __OCC_INLINE
    cudaOccDeviceProp()
    :   computeMajor                (0),
        computeMinor                (0),
        maxThreadsPerBlock          (0),
        maxThreadsPerMultiprocessor (0),
        regsPerBlock                (0),
        regsPerMultiprocessor       (0),
        warpSize                    (0),
        sharedMemPerBlock           (0),
        sharedMemPerMultiprocessor  (0),
        numSms                      (0),
        sharedMemPerBlockOptin      (0),
        reservedSharedMemPerBlock   (0)
    {}
#endif
};
| |
|
| | |
| | |
| | |
// Partitioned global caching mode requested for a kernel.
typedef enum cudaOccPartitionedGCConfig_enum {
    PARTITIONED_GC_OFF,       // partitioned global caching disabled
    PARTITIONED_GC_ON,        // enabled; may be turned off if no block would fit
    PARTITIONED_GC_ON_STRICT  // enabled with no automatic fallback
} cudaOccPartitionedGCConfig;
| |
|
| | |
| | |
| | |
// Selects which per-block shared memory limit applies to a kernel.
typedef enum cudaOccFuncShmemConfig_enum {
    FUNC_SHMEM_LIMIT_DEFAULT,  // use the device's default per-block limit
    FUNC_SHMEM_LIMIT_OPTIN,    // may use the opt-in limit (sharedMemPerBlockOptin)
} cudaOccFuncShmemConfig;
| |
|
| | |
| | |
| | |
| | |
| | |
// Kernel attributes consumed by the occupancy calculator. The templated
// constructor copies the identically-named fields out of any
// cudaFuncAttributes-like structure.
struct cudaOccFuncAttributes {
    int maxThreadsPerBlock;  // largest block size the kernel can launch with

    int numRegs;             // registers used per thread

    size_t sharedSizeBytes;  // statically allocated shared memory per block

    // Partitioned global caching mode requested for this kernel; ignored on
    // hardware that does not support partitioned global caching.
    cudaOccPartitionedGCConfig partitionedGCConfig;

    // Whether this kernel may use the opt-in shared memory limit instead of
    // the device default.
    cudaOccFuncShmemConfig shmemLimitConfig;

    // Largest dynamic shared memory allocation the kernel is permitted;
    // consulted when shmemLimitConfig is not FUNC_SHMEM_LIMIT_DEFAULT.
    size_t maxDynamicSharedSizeBytes;

    // Number of block-scoped barriers the kernel uses; the converting
    // constructor defaults this to 1.
    int numBlockBarriers;
#ifdef __cplusplus
    // Converting constructor from a cudaFuncAttributes-like structure.
    // Defaults: partitioned caching off, opt-in shared memory limit enabled,
    // one block barrier.
    template<typename FuncAttributes>
    __OCC_INLINE
    cudaOccFuncAttributes(const FuncAttributes &attr)
    :   maxThreadsPerBlock        (attr.maxThreadsPerBlock),
        numRegs                   (attr.numRegs),
        sharedSizeBytes           (attr.sharedSizeBytes),
        partitionedGCConfig       (PARTITIONED_GC_OFF),
        shmemLimitConfig          (FUNC_SHMEM_LIMIT_OPTIN),
        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
        numBlockBarriers          (1)
    {}

    // Zero-initializing default constructor.
    __OCC_INLINE
    cudaOccFuncAttributes()
    :   maxThreadsPerBlock        (0),
        numRegs                   (0),
        sharedSizeBytes           (0),
        partitionedGCConfig       (PARTITIONED_GC_OFF),
        shmemLimitConfig          (FUNC_SHMEM_LIMIT_DEFAULT),
        maxDynamicSharedSizeBytes (0),
        numBlockBarriers          (0)
    {}
#endif
};
| |
|
// L1 / shared memory preference (pre-Volta cache configuration).
typedef enum cudaOccCacheConfig_enum {
    CACHE_PREFER_NONE   = 0x00,  // no preference
    CACHE_PREFER_SHARED = 0x01,  // prefer larger shared memory
    CACHE_PREFER_L1     = 0x02,  // prefer larger L1 cache
    CACHE_PREFER_EQUAL  = 0x03   // prefer an even split
} cudaOccCacheConfig;
| |
|
// Shared memory carveout preference, expressed as a percentage of the
// maximum shared memory (Volta+ replacement for cudaOccCacheConfig).
typedef enum cudaOccCarveoutConfig_enum {
    SHAREDMEM_CARVEOUT_DEFAULT    = -1,   // no explicit preference
    SHAREDMEM_CARVEOUT_MAX_SHARED = 100,  // maximize shared memory
    SHAREDMEM_CARVEOUT_MAX_L1     = 0,    // maximize L1 (minimal shared memory)
    SHAREDMEM_CARVEOUT_HALF       = 50    // even split
} cudaOccCarveoutConfig;
| |
|
| | |
| | |
| | |
| | |
| | |
// Per-device runtime state that influences occupancy: the legacy cache
// preference (consulted pre-Volta, and on Volta+ when no carveout is set)
// and the shared memory carveout.
struct cudaOccDeviceState
{
    // Legacy L1/shared preference.
    cudaOccCacheConfig cacheConfig;
    // Carveout as a percentage of maximum shared memory, or one of the
    // cudaOccCarveoutConfig values; -1 means "no preference".
    int carveoutConfig;

#ifdef __cplusplus
    // Defaults: no cache preference, default carveout.
    __OCC_INLINE
    cudaOccDeviceState()
    :   cacheConfig     (CACHE_PREFER_NONE),
        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
    {}
#endif
};
| |
|
// Bit flags identifying which resource limits occupancy; OR-ed together in
// cudaOccResult::limitingFactors.
typedef enum cudaOccLimitingFactor_enum {
    OCC_LIMIT_WARPS         = 0x01,  // limited by warp / thread count per SM
    OCC_LIMIT_REGISTERS     = 0x02,  // limited by the register file
    OCC_LIMIT_SHARED_MEMORY = 0x04,  // limited by shared memory capacity
    OCC_LIMIT_BLOCKS        = 0x08,  // limited by max resident blocks per SM
    OCC_LIMIT_BARRIERS      = 0x10   // limited by available block barriers
} cudaOccLimitingFactor;
| |
|
| | |
| | |
| | |
| | |
| | |
// Output of the occupancy calculation: the resulting blocks-per-SM count,
// which resources limited it, and the actual per-block resource allocations.
struct cudaOccResult {
    int activeBlocksPerMultiprocessor;  // resulting resident blocks per SM
    unsigned int limitingFactors;       // OR of cudaOccLimitingFactor bits

    int blockLimitRegs;                 // blocks-per-SM limit from registers

    int blockLimitSharedMem;            // blocks-per-SM limit from shared memory

    int blockLimitWarps;                // blocks-per-SM limit from warp slots
    int blockLimitBlocks;               // hardware max resident blocks per SM

    int blockLimitBarriers;             // blocks-per-SM limit from block barriers
    int allocatedRegistersPerBlock;     // registers actually allocated per block

    size_t allocatedSharedMemPerBlock;  // shared memory actually allocated per block

    // Partitioned global caching mode effectively used for the computation
    // (may differ from the requested mode — see cudaOccMaxBlocksPerSMRegsLimit).
    cudaOccPartitionedGCConfig partitionedGCConfig;
};
| |
|
| | |
| | |
| | |
| | |
| | |
// Whether the device hardware implements partitioned global caching.
typedef enum cudaOccPartitionedGCSupport_enum {
    PARTITIONED_GC_NOT_SUPPORTED,  // feature unavailable on this device
    PARTITIONED_GC_SUPPORTED,      // feature available on this device
} cudaOccPartitionedGCSupport;
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
// Version of the occupancy algorithm implemented by this header.
#define __CUDA_OCC_MAJOR__ 12
#define __CUDA_OCC_MINOR__ 0
| |
|
| | |
| | |
| | |
| |
|
| | static __OCC_INLINE int __occMin(int lhs, int rhs) |
| | { |
| | return rhs < lhs ? rhs : lhs; |
| | } |
| |
|
| | static __OCC_INLINE int __occDivideRoundUp(int x, int y) |
| | { |
| | return (x + (y - 1)) / y; |
| | } |
| |
|
| | static __OCC_INLINE int __occRoundUp(int x, int y) |
| | { |
| | return y * __occDivideRoundUp(x, y); |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 6: |
| | case 7: |
| | value = 256; |
| | break; |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 128; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 6: |
| | value = 255; |
| | break; |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 256; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 6: |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 256; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 4; |
| | break; |
| | case 6: |
| | value = properties->computeMinor ? 4 : 2; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | value = 16; |
| | break; |
| | case 5: |
| | case 6: |
| | value = 32; |
| | break; |
| | case 7: { |
| | int isTuring = properties->computeMinor == 5; |
| | value = (isTuring) ? 16 : 32; |
| | break; |
| | } |
| | case 8: |
| | if (properties->computeMinor == 0) { |
| | value = 32; |
| | } |
| | else if (properties->computeMinor == 9) { |
| | value = 24; |
| | } |
| | else { |
| | value = 16; |
| | } |
| | break; |
| | case 9: |
| | value = 32; |
| | break; |
| | case 10: |
| | switch(properties->computeMinor) { |
| | case 1 : |
| | value = 24; |
| | break; |
| | case 0 : |
| | default : |
| | value = 32; |
| | } |
| | break; |
| | case 12: |
| | value = 24; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties) |
| | { |
| | |
| | |
| | |
| | size_t size = *shMemSize; |
| |
|
| | switch (properties->computeMajor) { |
| | case 7: { |
| | |
| | int isTuring = properties->computeMinor == 5; |
| | if (isTuring) { |
| | if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | |
| | else { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 96 * 1024) { |
| | *shMemSize = 96 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | break; |
| | } |
| | case 8: |
| | if (properties->computeMinor == 0 || properties->computeMinor == 7) { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else if (size <= 132 * 1024) { |
| | *shMemSize = 132 * 1024; |
| | } |
| | else if (size <= 164 * 1024) { |
| | *shMemSize = 164 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | else { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | break; |
| | case 9: { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else if (size <= 132 * 1024) { |
| | *shMemSize = 132 * 1024; |
| | } |
| | else if (size <= 164 * 1024) { |
| | *shMemSize = 164 * 1024; |
| | } |
| | else if (size <= 196 * 1024) { |
| | *shMemSize = 196 * 1024; |
| | } |
| | else if (size <= 228 * 1024) { |
| | *shMemSize = 228 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | break; |
| | } |
| | case 10: { |
| | switch (properties->computeMinor) { |
| | |
| | |
| | case 0: |
| | case 1: |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else if (size <= 132 * 1024) { |
| | *shMemSize = 132 * 1024; |
| | } |
| | else if (size <= 164 * 1024) { |
| | *shMemSize = 164 * 1024; |
| | } |
| | else if (size <= 196 * 1024) { |
| | *shMemSize = 196 * 1024; |
| | } |
| | else if (size <= 228 * 1024) { |
| | *shMemSize = 228 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| | break; |
| | } |
| | case 12: { |
| | switch (properties->computeMinor) { |
| | case 0: |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| | break; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | size_t preferenceShmemSize; |
| |
|
| | |
| | |
| | |
| | int effectivePreference = state->carveoutConfig; |
| | if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | |
| | if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { |
| | switch (state->cacheConfig) |
| | { |
| | case CACHE_PREFER_L1: |
| | effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1; |
| | break; |
| | case CACHE_PREFER_SHARED: |
| | effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED; |
| | break; |
| | case CACHE_PREFER_EQUAL: |
| | effectivePreference = SHAREDMEM_CARVEOUT_HALF; |
| | break; |
| | default: |
| | effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT; |
| | break; |
| | } |
| | } |
| |
|
| | if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { |
| | preferenceShmemSize = properties->sharedMemPerMultiprocessor; |
| | } |
| | else { |
| | preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100; |
| | } |
| |
|
| | status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties); |
| | *limit = preferenceShmemSize; |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) |
| | { |
| | size_t bytes = 0; |
| | size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor; |
| | cudaOccCacheConfig cacheConfig = state->cacheConfig; |
| |
|
| | |
| | |
| | |
| | |
| | |
| | size_t minCacheSize = 16384; |
| | size_t maxCacheSize = 49152; |
| | size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize; |
| | size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize; |
| |
|
| | switch (properties->computeMajor) { |
| | case 3: |
| | |
| | |
| | |
| | switch (cacheConfig) { |
| | default : |
| | case CACHE_PREFER_NONE: |
| | case CACHE_PREFER_SHARED: |
| | bytes = sharedMemPerMultiprocessorHigh; |
| | break; |
| | case CACHE_PREFER_L1: |
| | bytes = sharedMemPerMultiprocessorLow; |
| | break; |
| | case CACHE_PREFER_EQUAL: |
| | |
| | |
| | |
| | bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2; |
| | break; |
| | } |
| | break; |
| | case 5: |
| | case 6: |
| | |
| | |
| | bytes = sharedMemPerMultiprocessorHigh; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = bytes; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) |
| | { |
| | |
| | |
| | if (properties->computeMajor >= 7) { |
| | return cudaOccSMemPreferenceVoltaPlus(limit, properties, state); |
| | } |
| | return cudaOccSMemPreference(limit, properties, state); |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta) |
| | { |
| | switch (properties->computeMajor) { |
| | case 2: |
| | case 3: |
| | case 4: |
| | case 5: |
| | case 6: |
| | *limit = properties->sharedMemPerBlock; |
| | break; |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | switch (shmemLimitConfig) { |
| | default: |
| | case FUNC_SHMEM_LIMIT_DEFAULT: |
| | *limit = properties->sharedMemPerBlock; |
| | break; |
| | case FUNC_SHMEM_LIMIT_OPTIN: |
| | if (smemPerCta > properties->sharedMemPerBlock) { |
| | *limit = properties->sharedMemPerBlockOptin; |
| | } |
| | else { |
| | *limit = properties->sharedMemPerBlock; |
| | } |
| | break; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | |
| | if (properties->computeMajor >= 8) { |
| | *limit += properties->reservedSharedMemPerBlock; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties) |
| | { |
| | *limit = PARTITIONED_GC_NOT_SUPPORTED; |
| |
|
| | if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) || |
| | properties->computeMajor == 6) { |
| | *limit = PARTITIONED_GC_SUPPORTED; |
| | } |
| |
|
| | if (properties->computeMajor == 6 && properties->computeMinor == 0) { |
| | *limit = PARTITIONED_GC_NOT_SUPPORTED; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties) |
| | { |
| | |
| | |
| | |
| | |
| | |
| | |
| | if (properties->maxThreadsPerBlock <= 0 || |
| | properties->maxThreadsPerMultiprocessor <= 0 || |
| | properties->regsPerBlock <= 0 || |
| | properties->regsPerMultiprocessor <= 0 || |
| | properties->warpSize <= 0 || |
| | properties->sharedMemPerBlock <= 0 || |
| | properties->sharedMemPerMultiprocessor <= 0 || |
| | properties->numSms <= 0) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes) |
| | { |
| | |
| | |
| | if (attributes->maxThreadsPerBlock <= 0 || |
| | attributes->numRegs < 0) { |
| | |
| | |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state) |
| | { |
| | (void)state; |
| | |
| | |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccInputCheck( |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| |
|
| | status = cudaOccDevicePropCheck(properties); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | status = cudaOccFuncAttributesCheck(attributes); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | status = cudaOccDeviceStateCheck(state); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected( |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes) |
| | { |
| | cudaOccPartitionedGCSupport gcSupport; |
| | cudaOccPartitionedGCConfig gcConfig; |
| |
|
| | cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties); |
| |
|
| | gcConfig = attributes->partitionedGCConfig; |
| |
|
| | if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) { |
| | gcConfig = PARTITIONED_GC_OFF; |
| | } |
| |
|
| | return gcConfig; |
| | } |
| |
|
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit( |
| | int *limit, |
| | cudaOccPartitionedGCConfig gcConfig, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | int blockSize) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | int maxWarpsPerSm; |
| | int warpsAllocatedPerCTA; |
| | int maxBlocks; |
| | (void)attributes; |
| |
|
| | if (blockSize > properties->maxThreadsPerBlock) { |
| | maxBlocks = 0; |
| | } |
| | else { |
| | maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize; |
| | warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize); |
| | maxBlocks = 0; |
| |
|
| | if (gcConfig != PARTITIONED_GC_OFF) { |
| | int maxBlocksPerSmPartition; |
| | int maxWarpsPerSmPartition; |
| |
|
| | |
| | |
| | |
| | |
| | maxWarpsPerSmPartition = maxWarpsPerSm / 2; |
| | maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA; |
| | maxBlocks = maxBlocksPerSmPartition * 2; |
| | } |
| | |
| | |
| | |
| | |
| | |
| | else { |
| | maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA; |
| | } |
| | } |
| |
|
| | *limit = maxBlocks; |
| |
|
| | return status; |
| | } |
| |
|
| | |
| | |
// Shared memory limit on concurrent blocks per multiprocessor.
//
// Computes how many copies of the kernel's (rounded-up) shared memory
// footprint fit into the multiprocessor's shared memory under the user's
// carveout / cache preference, and records the rounded per-block allocation
// in result->allocatedSharedMemPerBlock.
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
    int *limit,
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int blockSize,
    size_t dynamicSmemSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int allocationGranularity;
    size_t userSmemPreference = 0;
    size_t totalSmemUsagePerCTA;
    size_t maxSmemUsagePerCTA;
    size_t smemAllocatedPerCTA;
    size_t staticSmemSize;
    size_t sharedMemPerMultiprocessor;
    size_t smemLimitPerCTA;
    int maxBlocks;
    int dynamicSmemSizeExceeded = 0;
    int totalSmemSizeExceeded = 0;
    (void)blockSize;  // block size does not affect the shared memory limit

    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Per-SM shared memory available under the configured carveout / cache
    // preference.
    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Static usage includes the system-reserved portion; the launch's
    // dynamic request is added on top, then the total is rounded up to the
    // allocation granularity.
    staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
    totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
    smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);

    // Worst case the kernel could request, used to choose between the
    // default and the opt-in per-block limit.
    maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;

    dynamicSmemSizeExceeded = 0;
    totalSmemSizeExceeded = 0;

    // With a non-default limit configured, the dynamic request must stay
    // within the kernel's opted-in maximum.
    if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
        dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
        dynamicSmemSizeExceeded = 1;
    }

    status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    if (smemAllocatedPerCTA > smemLimitPerCTA) {
        totalSmemSizeExceeded = 1;
    }

    if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
        // The kernel cannot launch with this configuration at all.
        maxBlocks = 0;
    }
    else {
        // If the preferred carveout already fits at least one block, divide
        // it up; otherwise fall back to a larger supported configuration.
        if (userSmemPreference >= smemAllocatedPerCTA) {
            sharedMemPerMultiprocessor = userSmemPreference;
        }
        else {
            // On compute 7.0+ round the block's own footprint up to the
            // nearest configuration the hardware supports.
            if (properties->computeMajor >= 7) {
                sharedMemPerMultiprocessor = smemAllocatedPerCTA;
                status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
                if (status != CUDA_OCC_SUCCESS) {
                    return status;
                }
            }
            else {
                sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
            }
        }

        if (smemAllocatedPerCTA > 0) {
            maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
        }
        else {
            // A kernel using no shared memory is not limited by it.
            maxBlocks = INT_MAX;
        }
    }

    result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;

    *limit = maxBlocks;

    return status;
}
| |
|
// Register file limit on concurrent blocks per multiprocessor.
//
// Registers are allocated warp-by-warp, in fixed granules, out of the SM's
// sub-partitions. May downgrade *gcConfig to PARTITIONED_GC_OFF when
// partitioned global caching would make the kernel unschedulable and strict
// mode was not requested. Records the per-block register allocation in
// result->allocatedRegistersPerBlock.
static __OCC_INLINE
cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
    int *limit,
    cudaOccPartitionedGCConfig *gcConfig,
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    int blockSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int allocationGranularity;
    int warpsAllocatedPerCTA;
    int regsAllocatedPerCTA;
    int regsAssumedPerCTA;
    int regsPerWarp;
    int regsAllocatedPerWarp;
    int numSubPartitions;
    int numRegsPerSubPartition;
    int numWarpsPerSubPartition;
    int numWarpsPerSM;
    int maxBlocks;
    int maxRegsPerThread;

    status = cudaOccRegAllocationGranularity(
        &allocationGranularity,
        properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccRegAllocationMaxPerThread(
        &maxRegsPerThread,
        properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);

    // Registers are allocated per warp, rounded up to the allocation
    // granularity.
    regsPerWarp = attributes->numRegs * properties->warpSize;
    regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
    regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;

    // For the per-block check, assume the block's warps are distributed
    // round-robin across sub-partitions, i.e. charge for the warp count
    // rounded up to a multiple of the sub-partition count.
    regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);

    // The kernel cannot launch when the (assumed or exact) per-block
    // allocation exceeds the per-block register budget, or when a thread
    // exceeds the per-thread register cap.
    if (properties->regsPerBlock < regsAssumedPerCTA ||
        properties->regsPerBlock < regsAllocatedPerCTA ||
        attributes->numRegs > maxRegsPerThread) {
        maxBlocks = 0;
    }
    else {
        if (regsAllocatedPerWarp > 0) {
            // How many warps each sub-partition's slice of the register file
            // can hold.
            numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
            numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;

            maxBlocks = 0;

            if (*gcConfig != PARTITIONED_GC_OFF) {
                int numSubPartitionsPerSmPartition;
                int numWarpsPerSmPartition;
                int maxBlocksPerSmPartition;

                // Partitioned global caching splits the SM's sub-partitions
                // into two halves; a block must fit entirely in one half.
                numSubPartitionsPerSmPartition = numSubPartitions / 2;
                numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
                maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
                maxBlocks = maxBlocksPerSmPartition * 2;
            }

            // If no block fits under partitioned caching and strict mode was
            // not requested, fall back to the unpartitioned layout.
            if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
                *gcConfig = PARTITIONED_GC_OFF;
                numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
                maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
            }
        }
        else {
            // A kernel using no registers is not limited by the register file.
            maxBlocks = INT_MAX;
        }
    }


    result->allocatedRegistersPerBlock = regsAllocatedPerCTA;

    *limit = maxBlocks;

    return status;
}
| |
|
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit( |
| | int *limit, |
| | int ctaLimitBlocks, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | int numBarriersAvailable = 0; |
| | int numBarriersUsed = attributes->numBlockBarriers; |
| | int maxBlocks = INT_MAX; |
| |
|
| | switch(properties->computeMajor) { |
| | case 5: |
| | case 6: |
| | case 7: |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | break; |
| | case 8: |
| | if (properties->computeMinor == 0) { |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | } |
| | else { |
| | numBarriersAvailable = ctaLimitBlocks; |
| | } |
| | break; |
| | case 9: |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | break; |
| | case 10: |
| | switch(properties->computeMinor) { |
| | case 1 : |
| | numBarriersAvailable = ctaLimitBlocks; |
| | break; |
| | case 0 : |
| | default : |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | } |
| |
|
| | break; |
| | case 12: |
| | numBarriersAvailable = ctaLimitBlocks; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | if (numBarriersUsed) { |
| | maxBlocks = numBarriersAvailable / numBarriersUsed; |
| | } |
| |
|
| | *limit = maxBlocks; |
| |
|
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
/*
 * Core occupancy calculator.
 *
 * Computes the maximum number of CTAs of `blockSize` threads, each using
 * `dynamicSmemSize` bytes of dynamic shared memory, that can be resident on
 * one multiprocessor. Fills *result with the per-resource block limits
 * (registers, shared memory, warps, CTA slots, barriers), the bitmask of
 * limiting factors, the selected partitioned-GC configuration, and the final
 * activeBlocksPerMultiprocessor value.
 *
 * Returns CUDA_OCC_ERROR_INVALID_INPUT on NULL pointers or non-positive
 * blockSize; otherwise propagates the status of the helper calculators.
 */
static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int blockSize,
    size_t dynamicSmemSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int ctaLimitWarps = 0;          // limit imposed by warp slots
    int ctaLimitBlocks = 0;         // hardware CTA-slot limit
    int ctaLimitSMem = 0;           // limit imposed by shared memory
    int ctaLimitRegs = 0;           // limit imposed by registers
    int ctaLimitBars = 0;           // limit imposed by block barriers
    int ctaLimit = 0;               // combined (minimum) limit
    unsigned int limitingFactors = 0;

    cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;

    if (!result || !properties || !attributes || !state || blockSize <= 0) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    ///////////////////////////
    // Check user input
    ///////////////////////////

    status = cudaOccInputCheck(properties, attributes, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    ///////////////////////////
    // Initialization
    ///////////////////////////

    // Start from the partitioned global-caching mode expected for this
    // kernel; the register-limit computation below may revise it.
    gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

    ///////////////////////////
    // Compute occupancy limits
    ///////////////////////////

    // Limit 1: registers
    status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Special case for compute 6.0: recompute the register limit as if the
    // device were compute 6.1 and, if the kernel would not fit under those
    // allocation rules, report zero occupancy.
    // NOTE(review): rationale (portability across the 6.x family?) inferred
    // from this recompute — confirm against the helper's definition.
    if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
        cudaOccDeviceProp propertiesGP10x;
        cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
        int ctaLimitRegsGP10x = 0;

        // Clone the device properties, overriding only the minor version
        memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
        propertiesGP10x.computeMinor = 1;

        status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        if (ctaLimitRegsGP10x == 0) {
            ctaLimitRegs = 0;
        }
    }

    // Limit 2: warp slots (uses the gcConfig possibly revised above)
    status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Limit 3: hardware CTA slots per multiprocessor
    status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Limit 4: shared memory (static + dynamic, honoring device state)
    status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    ///////////////////////////
    // Overall occupancy
    ///////////////////////////

    // The achievable occupancy is the most restrictive of the four limits
    ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

    // Record every resource that is exactly at the binding limit
    // (several factors can tie)
    if (ctaLimit == ctaLimitWarps) {
        limitingFactors |= OCC_LIMIT_WARPS;
    }
    if (ctaLimit == ctaLimitRegs) {
        limitingFactors |= OCC_LIMIT_REGISTERS;
    }
    if (ctaLimit == ctaLimitSMem) {
        limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
    }
    if (ctaLimit == ctaLimitBlocks) {
        limitingFactors |= OCC_LIMIT_BLOCKS;
    }

    // Block barriers are only a potential limiter on compute 9.0+ and only
    // when the kernel actually uses them
    if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
        // Limit 5: block barriers (scaled off the CTA-slot limit)
        status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, properties, attributes);
        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        // Fold the barrier limit into the overall limit
        ctaLimit = __occMin(ctaLimitBars, ctaLimit);

        // And record it if it is (now) the binding factor
        if (ctaLimit == ctaLimitBars) {
            limitingFactors |= OCC_LIMIT_BARRIERS;
        }
    }
    else {
        ctaLimitBars = INT_MAX;
    }

    ///////////////////////////
    // Fill in the result
    ///////////////////////////
    result->limitingFactors = limitingFactors;

    result->blockLimitRegs = ctaLimitRegs;
    result->blockLimitSharedMem = ctaLimitSMem;
    result->blockLimitWarps = ctaLimitWarps;
    result->blockLimitBlocks = ctaLimitBlocks;
    result->blockLimitBarriers = ctaLimitBars;
    result->partitionedGCConfig = gcConfig;

    // Final occupancy in blocks per multiprocessor
    result->activeBlocksPerMultiprocessor = ctaLimit;

    return CUDA_OCC_SUCCESS;
}
| |
|
| | static __OCC_INLINE |
| | cudaOccError cudaOccAvailableDynamicSMemPerBlock( |
| | size_t *bytesAvailable, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state, |
| | int numBlocks, |
| | int blockSize) |
| | { |
| | int allocationGranularity; |
| | size_t smemLimitPerBlock; |
| | size_t smemAvailableForDynamic; |
| | size_t userSmemPreference = 0; |
| | size_t sharedMemPerMultiprocessor; |
| | cudaOccResult result; |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| |
|
| | if (numBlocks <= 0) |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| |
|
| | |
| | |
| | status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| | |
| | |
| | if (result.activeBlocksPerMultiprocessor < numBlocks) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state); |
| | if (numBlocks == 1) { |
| | sharedMemPerMultiprocessor = smemLimitPerBlock; |
| | } |
| | else { |
| | if (!userSmemPreference) { |
| | userSmemPreference = 1 ; |
| | status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| | } |
| | sharedMemPerMultiprocessor = userSmemPreference; |
| | } |
| |
|
| | |
| | |
| | smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks; |
| | smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity; |
| |
|
| | |
| | |
| | if (smemAvailableForDynamic > smemLimitPerBlock) { |
| | smemAvailableForDynamic = smemLimitPerBlock; |
| | } |
| |
|
| | |
| | smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes; |
| |
|
| | |
| | |
| | if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes) |
| | smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes; |
| |
|
| | *bytesAvailable = smemAvailableForDynamic; |
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE |
| | cudaOccError cudaOccMaxPotentialOccupancyBlockSize( |
| | int *minGridSize, |
| | int *blockSize, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state, |
| | size_t (*blockSizeToDynamicSMemSize)(int), |
| | size_t dynamicSMemSize) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | cudaOccResult result; |
| |
|
| | |
| | int occupancyLimit; |
| | int granularity; |
| | int blockSizeLimit; |
| |
|
| | |
| | int maxBlockSize = 0; |
| | int numBlocks = 0; |
| | int maxOccupancy = 0; |
| |
|
| | |
| | int blockSizeToTryAligned; |
| | int blockSizeToTry; |
| | int blockSizeLimitAligned; |
| | int occupancyInBlocks; |
| | int occupancyInThreads; |
| |
|
| | |
| | |
| | |
| |
|
| | if (!minGridSize || !blockSize || !properties || !attributes || !state) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | status = cudaOccInputCheck(properties, attributes, state); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | occupancyLimit = properties->maxThreadsPerMultiprocessor; |
| | granularity = properties->warpSize; |
| |
|
| | blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); |
| | blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); |
| |
|
| | for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { |
| | blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); |
| |
|
| | |
| | |
| | if (blockSizeToDynamicSMemSize) { |
| | dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry); |
| | } |
| |
|
| | status = cudaOccMaxActiveBlocksPerMultiprocessor( |
| | &result, |
| | properties, |
| | attributes, |
| | state, |
| | blockSizeToTry, |
| | dynamicSMemSize); |
| |
|
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | occupancyInBlocks = result.activeBlocksPerMultiprocessor; |
| | occupancyInThreads = blockSizeToTry * occupancyInBlocks; |
| |
|
| | if (occupancyInThreads > maxOccupancy) { |
| | maxBlockSize = blockSizeToTry; |
| | numBlocks = occupancyInBlocks; |
| | maxOccupancy = occupancyInThreads; |
| | } |
| |
|
| | |
| | |
| | if (occupancyLimit == maxOccupancy) { |
| | break; |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | *minGridSize = numBlocks * properties->numSms; |
| | *blockSize = maxBlockSize; |
| |
|
| | return status; |
| | } |
| |
|
| |
|
| | #if defined(__cplusplus) |
| |
|
| | namespace { |
| |
|
// C++ convenience overload: fixed dynamic shared memory requirement.
// Forwards to the full entry point with a NULL per-block-size callback,
// so `dynamicSMemSize` is applied unchanged to every candidate block size.
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    size_t dynamicSMemSize)
{
    return cudaOccMaxPotentialOccupancyBlockSize(
        minGridSize,
        blockSize,
        properties,
        attributes,
        state,
        NULL,
        dynamicSMemSize);
}
| |
|
| | template <typename UnaryFunction> |
| | __OCC_INLINE |
| | cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem( |
| | int *minGridSize, |
| | int *blockSize, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state, |
| | UnaryFunction blockSizeToDynamicSMemSize) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | cudaOccResult result; |
| |
|
| | |
| | int occupancyLimit; |
| | int granularity; |
| | int blockSizeLimit; |
| |
|
| | |
| | int maxBlockSize = 0; |
| | int numBlocks = 0; |
| | int maxOccupancy = 0; |
| |
|
| | |
| | int blockSizeToTryAligned; |
| | int blockSizeToTry; |
| | int blockSizeLimitAligned; |
| | int occupancyInBlocks; |
| | int occupancyInThreads; |
| | size_t dynamicSMemSize; |
| |
|
| | |
| | |
| | |
| |
|
| | if (!minGridSize || !blockSize || !properties || !attributes || !state) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | status = cudaOccInputCheck(properties, attributes, state); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | occupancyLimit = properties->maxThreadsPerMultiprocessor; |
| | granularity = properties->warpSize; |
| | blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); |
| | blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); |
| |
|
| | for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { |
| | blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); |
| |
|
| | dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); |
| |
|
| | status = cudaOccMaxActiveBlocksPerMultiprocessor( |
| | &result, |
| | properties, |
| | attributes, |
| | state, |
| | blockSizeToTry, |
| | dynamicSMemSize); |
| |
|
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | occupancyInBlocks = result.activeBlocksPerMultiprocessor; |
| |
|
| | occupancyInThreads = blockSizeToTry * occupancyInBlocks; |
| |
|
| | if (occupancyInThreads > maxOccupancy) { |
| | maxBlockSize = blockSizeToTry; |
| | numBlocks = occupancyInBlocks; |
| | maxOccupancy = occupancyInThreads; |
| | } |
| |
|
| | |
| | |
| | if (occupancyLimit == maxOccupancy) { |
| | break; |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | *minGridSize = numBlocks * properties->numSms; |
| | *blockSize = maxBlockSize; |
| |
|
| | return status; |
| | } |
| |
|
| | } |
| |
|
| | #endif |
| |
|
| | #undef __OCC_INLINE |
| |
|
| | #endif |
| |
|