| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #ifndef __cuda_occupancy_h__ |
| | #define __cuda_occupancy_h__ |
| |
|
| | #include <stddef.h> |
| | #include <limits.h> |
| | #include <string.h> |
| |
|
| |
|
| | |
| | |
| | #ifdef __CUDACC__ |
| | #define __OCC_INLINE inline __host__ __device__ |
| | #elif defined _MSC_VER |
| | #define __OCC_INLINE __inline |
| | #else |
| | #define __OCC_INLINE inline |
| | #endif |
| |
|
// Error codes returned by the occupancy calculator functions in this header.
enum cudaOccError_enum {
    CUDA_OCC_SUCCESS              = 0,  // no error encountered
    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // an input parameter is invalid
    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2,  // the device's compute capability is not recognized
};
typedef enum cudaOccError_enum cudaOccError;
| |
|
// Forward typedefs for the occupancy calculator's input/output structures,
// all defined later in this header.
typedef struct cudaOccResult cudaOccResult;
typedef struct cudaOccDeviceProp cudaOccDeviceProp;
typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
typedef struct cudaOccDeviceState cudaOccDeviceState;
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Computes into *result the maximum number of blocks of the described kernel
// that can be resident on one multiprocessor at once, for the given block
// size and per-block dynamic shared memory request. Per-resource limits are
// also recorded in *result (see struct cudaOccResult below).
static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int blockSize,
    size_t dynamicSmemSize);
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Suggests a launch configuration: *blockSize receives a block size intended
// to maximize occupancy, *minGridSize the smallest grid that can reach that
// occupancy. The dynamic shared memory need is supplied either via the
// blockSizeToDynamicSMemSize callback (per candidate block size) or as the
// constant dynamicSMemSize — NOTE(review): which argument wins when both are
// provided is decided by the implementation; confirm there before relying on it.
static __OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    size_t (*blockSizeToDynamicSMemSize)(int),
    size_t dynamicSMemSize);
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
#if defined(__cplusplus)
namespace {

// C++ convenience overload: constant dynamic shared memory per block,
// defaulting to 0.
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    size_t dynamicSMemSize = 0);

// C++ overload for kernels whose dynamic shared memory varies with block
// size; blockSizeToDynamicSMemSize is any callable taking the candidate
// block size and returning the bytes of dynamic shared memory it needs.
template <typename UnaryFunction>
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    UnaryFunction blockSizeToDynamicSMemSize);

}  // namespace
#endif
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Computes into *dynamicSmemSize the dynamic shared memory available to each
// block of the described kernel when numBlocks blocks of blockSize threads
// are resident on a multiprocessor.
static __OCC_INLINE
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
    size_t *dynamicSmemSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int numBlocks,
    int blockSize);
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
// Device description consumed by the occupancy calculator. The templated
// constructor copies the identically-named fields out of any
// cudaDeviceProp-like structure.
struct cudaOccDeviceProp {
    int computeMajor;                  // major compute capability
    int computeMinor;                  // minor compute capability

    int maxThreadsPerBlock;            // maximum threads in one block
    int maxThreadsPerMultiprocessor;   // maximum resident threads per SM

    int regsPerBlock;                  // registers available to a single block
    int regsPerMultiprocessor;         // registers available per SM
    int warpSize;                      // threads per warp
    size_t sharedMemPerBlock;          // default per-block shared memory limit
    size_t sharedMemPerMultiprocessor; // shared memory available per SM
    int numSms;                        // multiprocessor count
    size_t sharedMemPerBlockOptin;     // opt-in (maximum) per-block shared memory limit
    size_t reservedSharedMemPerBlock;  // shared memory reserved per block by the system

#ifdef __cplusplus
    // Converting constructor: extracts the fields above from a structure
    // using the cudaDeviceProp field names (e.g. cudaDeviceProp itself).
    template<typename DeviceProp>
    __OCC_INLINE
    cudaOccDeviceProp(const DeviceProp &props)
    :   computeMajor                (props.major),
        computeMinor                (props.minor),
        maxThreadsPerBlock          (props.maxThreadsPerBlock),
        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
        regsPerBlock                (props.regsPerBlock),
        regsPerMultiprocessor       (props.regsPerMultiprocessor),
        warpSize                    (props.warpSize),
        sharedMemPerBlock           (props.sharedMemPerBlock),
        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
        numSms                      (props.multiProcessorCount),
        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
    {}

    // Zero-initializing default constructor.
    __OCC_INLINE
    cudaOccDeviceProp()
    :   computeMajor                (0),
        computeMinor                (0),
        maxThreadsPerBlock          (0),
        maxThreadsPerMultiprocessor (0),
        regsPerBlock                (0),
        regsPerMultiprocessor       (0),
        warpSize                    (0),
        sharedMemPerBlock           (0),
        sharedMemPerMultiprocessor  (0),
        numSms                      (0),
        sharedMemPerBlockOptin      (0),
        reservedSharedMemPerBlock   (0)
    {}
#endif
};
| |
|
| | |
| | |
| | |
// Partitioned global caching mode requested for a kernel.
typedef enum cudaOccPartitionedGCConfig_enum {
    PARTITIONED_GC_OFF,       // partitioned global caching disabled
    PARTITIONED_GC_ON,        // enabled; may be turned off if no block would fit
    PARTITIONED_GC_ON_STRICT  // enabled with no automatic fallback
} cudaOccPartitionedGCConfig;
| |
|
| | |
| | |
| | |
// Selects which per-block shared memory limit applies to a kernel.
typedef enum cudaOccFuncShmemConfig_enum {
    FUNC_SHMEM_LIMIT_DEFAULT,  // use the device's default per-block limit
    FUNC_SHMEM_LIMIT_OPTIN,    // may use the opt-in limit (sharedMemPerBlockOptin)
} cudaOccFuncShmemConfig;
| |
|
| | |
| | |
| | |
| | |
| | |
// Kernel attributes consumed by the occupancy calculator. The templated
// constructor copies the identically-named fields out of any
// cudaFuncAttributes-like structure.
struct cudaOccFuncAttributes {
    int maxThreadsPerBlock;  // largest block size the kernel can launch with

    int numRegs;             // registers used per thread

    size_t sharedSizeBytes;  // statically allocated shared memory per block

    // Partitioned global caching mode requested for this kernel; ignored on
    // hardware that does not support partitioned global caching.
    cudaOccPartitionedGCConfig partitionedGCConfig;

    // Whether this kernel may use the opt-in shared memory limit instead of
    // the device default.
    cudaOccFuncShmemConfig shmemLimitConfig;

    // Largest dynamic shared memory allocation the kernel is permitted;
    // consulted when shmemLimitConfig is not FUNC_SHMEM_LIMIT_DEFAULT.
    size_t maxDynamicSharedSizeBytes;

    // Number of block-scoped barriers the kernel uses; the converting
    // constructor defaults this to 1.
    int numBlockBarriers;
#ifdef __cplusplus
    // Converting constructor from a cudaFuncAttributes-like structure.
    // Defaults: partitioned caching off, opt-in shared memory limit enabled,
    // one block barrier.
    template<typename FuncAttributes>
    __OCC_INLINE
    cudaOccFuncAttributes(const FuncAttributes &attr)
    :   maxThreadsPerBlock        (attr.maxThreadsPerBlock),
        numRegs                   (attr.numRegs),
        sharedSizeBytes           (attr.sharedSizeBytes),
        partitionedGCConfig       (PARTITIONED_GC_OFF),
        shmemLimitConfig          (FUNC_SHMEM_LIMIT_OPTIN),
        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
        numBlockBarriers          (1)
    {}

    // Zero-initializing default constructor.
    __OCC_INLINE
    cudaOccFuncAttributes()
    :   maxThreadsPerBlock        (0),
        numRegs                   (0),
        sharedSizeBytes           (0),
        partitionedGCConfig       (PARTITIONED_GC_OFF),
        shmemLimitConfig          (FUNC_SHMEM_LIMIT_DEFAULT),
        maxDynamicSharedSizeBytes (0),
        numBlockBarriers          (0)
    {}
#endif
};
| |
|
// L1 / shared memory preference (pre-Volta cache configuration).
typedef enum cudaOccCacheConfig_enum {
    CACHE_PREFER_NONE   = 0x00,  // no preference
    CACHE_PREFER_SHARED = 0x01,  // prefer larger shared memory
    CACHE_PREFER_L1     = 0x02,  // prefer larger L1 cache
    CACHE_PREFER_EQUAL  = 0x03   // prefer an even split
} cudaOccCacheConfig;
| |
|
// Shared memory carveout preference, expressed as a percentage of the
// maximum shared memory (Volta+ replacement for cudaOccCacheConfig).
typedef enum cudaOccCarveoutConfig_enum {
    SHAREDMEM_CARVEOUT_DEFAULT    = -1,   // no explicit preference
    SHAREDMEM_CARVEOUT_MAX_SHARED = 100,  // maximize shared memory
    SHAREDMEM_CARVEOUT_MAX_L1     = 0,    // maximize L1 (minimal shared memory)
    SHAREDMEM_CARVEOUT_HALF       = 50    // even split
} cudaOccCarveoutConfig;
| |
|
| | |
| | |
| | |
| | |
| | |
// Per-device runtime state that influences occupancy: the legacy cache
// preference (consulted pre-Volta, and on Volta+ when no carveout is set)
// and the shared memory carveout.
struct cudaOccDeviceState
{
    // Legacy L1/shared preference.
    cudaOccCacheConfig cacheConfig;
    // Carveout as a percentage of maximum shared memory, or one of the
    // cudaOccCarveoutConfig values; -1 means "no preference".
    int carveoutConfig;

#ifdef __cplusplus
    // Defaults: no cache preference, default carveout.
    __OCC_INLINE
    cudaOccDeviceState()
    :   cacheConfig     (CACHE_PREFER_NONE),
        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
    {}
#endif
};
| |
|
// Bit flags identifying which resource limits occupancy; OR-ed together in
// cudaOccResult::limitingFactors.
typedef enum cudaOccLimitingFactor_enum {
    OCC_LIMIT_WARPS         = 0x01,  // limited by warp / thread count per SM
    OCC_LIMIT_REGISTERS     = 0x02,  // limited by the register file
    OCC_LIMIT_SHARED_MEMORY = 0x04,  // limited by shared memory capacity
    OCC_LIMIT_BLOCKS        = 0x08,  // limited by max resident blocks per SM
    OCC_LIMIT_BARRIERS      = 0x10   // limited by available block barriers
} cudaOccLimitingFactor;
| |
|
| | |
| | |
| | |
| | |
| | |
// Output of the occupancy calculation: the resulting blocks-per-SM count,
// which resources limited it, and the actual per-block resource allocations.
struct cudaOccResult {
    int activeBlocksPerMultiprocessor;  // resulting resident blocks per SM
    unsigned int limitingFactors;       // OR of cudaOccLimitingFactor bits

    int blockLimitRegs;                 // blocks-per-SM limit from registers

    int blockLimitSharedMem;            // blocks-per-SM limit from shared memory

    int blockLimitWarps;                // blocks-per-SM limit from warp slots
    int blockLimitBlocks;               // hardware max resident blocks per SM

    int blockLimitBarriers;             // blocks-per-SM limit from block barriers
    int allocatedRegistersPerBlock;     // registers actually allocated per block

    size_t allocatedSharedMemPerBlock;  // shared memory actually allocated per block

    // Partitioned global caching mode effectively used for the computation
    // (may differ from the requested mode — see cudaOccMaxBlocksPerSMRegsLimit).
    cudaOccPartitionedGCConfig partitionedGCConfig;
};
| |
|
| | |
| | |
| | |
| | |
| | |
// Whether the device hardware implements partitioned global caching.
typedef enum cudaOccPartitionedGCSupport_enum {
    PARTITIONED_GC_NOT_SUPPORTED,  // feature unavailable on this device
    PARTITIONED_GC_SUPPORTED,      // feature available on this device
} cudaOccPartitionedGCSupport;
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
// Version of the occupancy algorithm implemented by this header.
#define __CUDA_OCC_MAJOR__ 12
#define __CUDA_OCC_MINOR__ 0
| |
|
| | |
| | |
| | |
| |
|
| | static __OCC_INLINE int __occMin(int lhs, int rhs) |
| | { |
| | return rhs < lhs ? rhs : lhs; |
| | } |
| |
|
| | static __OCC_INLINE int __occDivideRoundUp(int x, int y) |
| | { |
| | return (x + (y - 1)) / y; |
| | } |
| |
|
| | static __OCC_INLINE int __occRoundUp(int x, int y) |
| | { |
| | return y * __occDivideRoundUp(x, y); |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 6: |
| | case 7: |
| | value = 256; |
| | break; |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 128; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 6: |
| | value = 255; |
| | break; |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 256; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 6: |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 256; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | case 5: |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | value = 4; |
| | break; |
| | case 6: |
| | value = properties->computeMinor ? 4 : 2; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties) |
| | { |
| | int value; |
| |
|
| | switch(properties->computeMajor) { |
| | case 3: |
| | value = 16; |
| | break; |
| | case 5: |
| | case 6: |
| | value = 32; |
| | break; |
| | case 7: { |
| | int isTuring = properties->computeMinor == 5; |
| | value = (isTuring) ? 16 : 32; |
| | break; |
| | } |
| | case 8: |
| | if (properties->computeMinor == 0) { |
| | value = 32; |
| | } |
| | else if (properties->computeMinor == 9) { |
| | value = 24; |
| | } |
| | else { |
| | value = 16; |
| | } |
| | break; |
| | case 9: |
| | value = 32; |
| | break; |
| | case 10: |
| | switch(properties->computeMinor) { |
| | case 1 : |
| | value = 24; |
| | break; |
| | case 0 : |
| | default : |
| | value = 32; |
| | } |
| | break; |
| | case 12: |
| | value = 24; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = value; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties) |
| | { |
| | |
| | |
| | |
| | size_t size = *shMemSize; |
| |
|
| | switch (properties->computeMajor) { |
| | case 7: { |
| | |
| | int isTuring = properties->computeMinor == 5; |
| | if (isTuring) { |
| | if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | |
| | else { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 96 * 1024) { |
| | *shMemSize = 96 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | break; |
| | } |
| | case 8: |
| | if (properties->computeMinor == 0 || properties->computeMinor == 7) { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else if (size <= 132 * 1024) { |
| | *shMemSize = 132 * 1024; |
| | } |
| | else if (size <= 164 * 1024) { |
| | *shMemSize = 164 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | else { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | } |
| | break; |
| | case 9: { |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else if (size <= 132 * 1024) { |
| | *shMemSize = 132 * 1024; |
| | } |
| | else if (size <= 164 * 1024) { |
| | *shMemSize = 164 * 1024; |
| | } |
| | else if (size <= 196 * 1024) { |
| | *shMemSize = 196 * 1024; |
| | } |
| | else if (size <= 228 * 1024) { |
| | *shMemSize = 228 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | break; |
| | } |
| | case 10: { |
| | switch (properties->computeMinor) { |
| | |
| | |
| | case 0: |
| | case 1: |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else if (size <= 132 * 1024) { |
| | *shMemSize = 132 * 1024; |
| | } |
| | else if (size <= 164 * 1024) { |
| | *shMemSize = 164 * 1024; |
| | } |
| | else if (size <= 196 * 1024) { |
| | *shMemSize = 196 * 1024; |
| | } |
| | else if (size <= 228 * 1024) { |
| | *shMemSize = 228 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| | break; |
| | } |
| | case 12: { |
| | switch (properties->computeMinor) { |
| | case 0: |
| | if (size == 0) { |
| | *shMemSize = 0; |
| | } |
| | else if (size <= 8 * 1024) { |
| | *shMemSize = 8 * 1024; |
| | } |
| | else if (size <= 16 * 1024) { |
| | *shMemSize = 16 * 1024; |
| | } |
| | else if (size <= 32 * 1024) { |
| | *shMemSize = 32 * 1024; |
| | } |
| | else if (size <= 64 * 1024) { |
| | *shMemSize = 64 * 1024; |
| | } |
| | else if (size <= 100 * 1024) { |
| | *shMemSize = 100 * 1024; |
| | } |
| | else { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| | break; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | size_t preferenceShmemSize; |
| |
|
| | |
| | |
| | |
| | int effectivePreference = state->carveoutConfig; |
| | if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| | |
| | if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { |
| | switch (state->cacheConfig) |
| | { |
| | case CACHE_PREFER_L1: |
| | effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1; |
| | break; |
| | case CACHE_PREFER_SHARED: |
| | effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED; |
| | break; |
| | case CACHE_PREFER_EQUAL: |
| | effectivePreference = SHAREDMEM_CARVEOUT_HALF; |
| | break; |
| | default: |
| | effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT; |
| | break; |
| | } |
| | } |
| |
|
| | if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { |
| | preferenceShmemSize = properties->sharedMemPerMultiprocessor; |
| | } |
| | else { |
| | preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100; |
| | } |
| |
|
| | status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties); |
| | *limit = preferenceShmemSize; |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) |
| | { |
| | size_t bytes = 0; |
| | size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor; |
| | cudaOccCacheConfig cacheConfig = state->cacheConfig; |
| |
|
| | |
| | |
| | |
| | |
| | |
| | size_t minCacheSize = 16384; |
| | size_t maxCacheSize = 49152; |
| | size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize; |
| | size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize; |
| |
|
| | switch (properties->computeMajor) { |
| | case 3: |
| | |
| | |
| | |
| | switch (cacheConfig) { |
| | default : |
| | case CACHE_PREFER_NONE: |
| | case CACHE_PREFER_SHARED: |
| | bytes = sharedMemPerMultiprocessorHigh; |
| | break; |
| | case CACHE_PREFER_L1: |
| | bytes = sharedMemPerMultiprocessorLow; |
| | break; |
| | case CACHE_PREFER_EQUAL: |
| | |
| | |
| | |
| | bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2; |
| | break; |
| | } |
| | break; |
| | case 5: |
| | case 6: |
| | |
| | |
| | bytes = sharedMemPerMultiprocessorHigh; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | *limit = bytes; |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) |
| | { |
| | |
| | |
| | if (properties->computeMajor >= 7) { |
| | return cudaOccSMemPreferenceVoltaPlus(limit, properties, state); |
| | } |
| | return cudaOccSMemPreference(limit, properties, state); |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta) |
| | { |
| | switch (properties->computeMajor) { |
| | case 2: |
| | case 3: |
| | case 4: |
| | case 5: |
| | case 6: |
| | *limit = properties->sharedMemPerBlock; |
| | break; |
| | case 7: |
| | case 8: |
| | case 9: |
| | case 10: |
| | case 12: |
| | switch (shmemLimitConfig) { |
| | default: |
| | case FUNC_SHMEM_LIMIT_DEFAULT: |
| | *limit = properties->sharedMemPerBlock; |
| | break; |
| | case FUNC_SHMEM_LIMIT_OPTIN: |
| | if (smemPerCta > properties->sharedMemPerBlock) { |
| | *limit = properties->sharedMemPerBlockOptin; |
| | } |
| | else { |
| | *limit = properties->sharedMemPerBlock; |
| | } |
| | break; |
| | } |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | |
| | if (properties->computeMajor >= 8) { |
| | *limit += properties->reservedSharedMemPerBlock; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties) |
| | { |
| | *limit = PARTITIONED_GC_NOT_SUPPORTED; |
| |
|
| | if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) || |
| | properties->computeMajor == 6) { |
| | *limit = PARTITIONED_GC_SUPPORTED; |
| | } |
| |
|
| | if (properties->computeMajor == 6 && properties->computeMinor == 0) { |
| | *limit = PARTITIONED_GC_NOT_SUPPORTED; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties) |
| | { |
| | |
| | |
| | |
| | |
| | |
| | |
| | if (properties->maxThreadsPerBlock <= 0 || |
| | properties->maxThreadsPerMultiprocessor <= 0 || |
| | properties->regsPerBlock <= 0 || |
| | properties->regsPerMultiprocessor <= 0 || |
| | properties->warpSize <= 0 || |
| | properties->sharedMemPerBlock <= 0 || |
| | properties->sharedMemPerMultiprocessor <= 0 || |
| | properties->numSms <= 0) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes) |
| | { |
| | |
| | |
| | if (attributes->maxThreadsPerBlock <= 0 || |
| | attributes->numRegs < 0) { |
| | |
| | |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state) |
| | { |
| | (void)state; |
| | |
| | |
| |
|
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE cudaOccError cudaOccInputCheck( |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| |
|
| | status = cudaOccDevicePropCheck(properties); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | status = cudaOccFuncAttributesCheck(attributes); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | status = cudaOccDeviceStateCheck(state); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected( |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes) |
| | { |
| | cudaOccPartitionedGCSupport gcSupport; |
| | cudaOccPartitionedGCConfig gcConfig; |
| |
|
| | cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties); |
| |
|
| | gcConfig = attributes->partitionedGCConfig; |
| |
|
| | if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) { |
| | gcConfig = PARTITIONED_GC_OFF; |
| | } |
| |
|
| | return gcConfig; |
| | } |
| |
|
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit( |
| | int *limit, |
| | cudaOccPartitionedGCConfig gcConfig, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | int blockSize) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | int maxWarpsPerSm; |
| | int warpsAllocatedPerCTA; |
| | int maxBlocks; |
| | (void)attributes; |
| |
|
| | if (blockSize > properties->maxThreadsPerBlock) { |
| | maxBlocks = 0; |
| | } |
| | else { |
| | maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize; |
| | warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize); |
| | maxBlocks = 0; |
| |
|
| | if (gcConfig != PARTITIONED_GC_OFF) { |
| | int maxBlocksPerSmPartition; |
| | int maxWarpsPerSmPartition; |
| |
|
| | |
| | |
| | |
| | |
| | maxWarpsPerSmPartition = maxWarpsPerSm / 2; |
| | maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA; |
| | maxBlocks = maxBlocksPerSmPartition * 2; |
| | } |
| | |
| | |
| | |
| | |
| | |
| | else { |
| | maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA; |
| | } |
| | } |
| |
|
| | *limit = maxBlocks; |
| |
|
| | return status; |
| | } |
| |
|
| | |
| | |
// Shared memory limit on concurrent blocks per multiprocessor.
//
// Computes how many copies of the kernel's (rounded-up) shared memory
// footprint fit into the multiprocessor's shared memory under the user's
// carveout / cache preference, and records the rounded per-block allocation
// in result->allocatedSharedMemPerBlock.
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
    int *limit,
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int blockSize,
    size_t dynamicSmemSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int allocationGranularity;
    size_t userSmemPreference = 0;
    size_t totalSmemUsagePerCTA;
    size_t maxSmemUsagePerCTA;
    size_t smemAllocatedPerCTA;
    size_t staticSmemSize;
    size_t sharedMemPerMultiprocessor;
    size_t smemLimitPerCTA;
    int maxBlocks;
    int dynamicSmemSizeExceeded = 0;
    int totalSmemSizeExceeded = 0;
    (void)blockSize;  // block size does not affect the shared memory limit

    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Per-SM shared memory available under the configured carveout / cache
    // preference.
    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Static usage includes the system-reserved portion; the launch's
    // dynamic request is added on top, then the total is rounded up to the
    // allocation granularity.
    staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
    totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
    smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);

    // Worst case the kernel could request, used to choose between the
    // default and the opt-in per-block limit.
    maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;

    dynamicSmemSizeExceeded = 0;
    totalSmemSizeExceeded = 0;

    // With a non-default limit configured, the dynamic request must stay
    // within the kernel's opted-in maximum.
    if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
        dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
        dynamicSmemSizeExceeded = 1;
    }

    status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    if (smemAllocatedPerCTA > smemLimitPerCTA) {
        totalSmemSizeExceeded = 1;
    }

    if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
        // The kernel cannot launch with this configuration at all.
        maxBlocks = 0;
    }
    else {
        // If the preferred carveout already fits at least one block, divide
        // it up; otherwise fall back to a larger supported configuration.
        if (userSmemPreference >= smemAllocatedPerCTA) {
            sharedMemPerMultiprocessor = userSmemPreference;
        }
        else {
            // On compute 7.0+ round the block's own footprint up to the
            // nearest configuration the hardware supports.
            if (properties->computeMajor >= 7) {
                sharedMemPerMultiprocessor = smemAllocatedPerCTA;
                status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
                if (status != CUDA_OCC_SUCCESS) {
                    return status;
                }
            }
            else {
                sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
            }
        }

        if (smemAllocatedPerCTA > 0) {
            maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
        }
        else {
            // A kernel using no shared memory is not limited by it.
            maxBlocks = INT_MAX;
        }
    }

    result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;

    *limit = maxBlocks;

    return status;
}
| |
|
// Register file limit on concurrent blocks per multiprocessor.
//
// Registers are allocated warp-by-warp, in fixed granules, out of the SM's
// sub-partitions. May downgrade *gcConfig to PARTITIONED_GC_OFF when
// partitioned global caching would make the kernel unschedulable and strict
// mode was not requested. Records the per-block register allocation in
// result->allocatedRegistersPerBlock.
static __OCC_INLINE
cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
    int *limit,
    cudaOccPartitionedGCConfig *gcConfig,
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    int blockSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int allocationGranularity;
    int warpsAllocatedPerCTA;
    int regsAllocatedPerCTA;
    int regsAssumedPerCTA;
    int regsPerWarp;
    int regsAllocatedPerWarp;
    int numSubPartitions;
    int numRegsPerSubPartition;
    int numWarpsPerSubPartition;
    int numWarpsPerSM;
    int maxBlocks;
    int maxRegsPerThread;

    status = cudaOccRegAllocationGranularity(
        &allocationGranularity,
        properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccRegAllocationMaxPerThread(
        &maxRegsPerThread,
        properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);

    // Registers are allocated per warp, rounded up to the allocation
    // granularity.
    regsPerWarp = attributes->numRegs * properties->warpSize;
    regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
    regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;

    // For the per-block check, assume the block's warps are distributed
    // round-robin across sub-partitions, i.e. charge for the warp count
    // rounded up to a multiple of the sub-partition count.
    regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);

    // The kernel cannot launch when the (assumed or exact) per-block
    // allocation exceeds the per-block register budget, or when a thread
    // exceeds the per-thread register cap.
    if (properties->regsPerBlock < regsAssumedPerCTA ||
        properties->regsPerBlock < regsAllocatedPerCTA ||
        attributes->numRegs > maxRegsPerThread) {
        maxBlocks = 0;
    }
    else {
        if (regsAllocatedPerWarp > 0) {
            // How many warps each sub-partition's slice of the register file
            // can hold.
            numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
            numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;

            maxBlocks = 0;

            if (*gcConfig != PARTITIONED_GC_OFF) {
                int numSubPartitionsPerSmPartition;
                int numWarpsPerSmPartition;
                int maxBlocksPerSmPartition;

                // Partitioned global caching splits the SM's sub-partitions
                // into two halves; a block must fit entirely in one half.
                numSubPartitionsPerSmPartition = numSubPartitions / 2;
                numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
                maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
                maxBlocks = maxBlocksPerSmPartition * 2;
            }

            // If no block fits under partitioned caching and strict mode was
            // not requested, fall back to the unpartitioned layout.
            if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
                *gcConfig = PARTITIONED_GC_OFF;
                numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
                maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
            }
        }
        else {
            // A kernel using no registers is not limited by the register file.
            maxBlocks = INT_MAX;
        }
    }


    result->allocatedRegistersPerBlock = regsAllocatedPerCTA;

    *limit = maxBlocks;

    return status;
}
| |
|
| | |
| | |
| | static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit( |
| | int *limit, |
| | int ctaLimitBlocks, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | int numBarriersAvailable = 0; |
| | int numBarriersUsed = attributes->numBlockBarriers; |
| | int maxBlocks = INT_MAX; |
| |
|
| | switch(properties->computeMajor) { |
| | case 5: |
| | case 6: |
| | case 7: |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | break; |
| | case 8: |
| | if (properties->computeMinor == 0) { |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | } |
| | else { |
| | numBarriersAvailable = ctaLimitBlocks; |
| | } |
| | break; |
| | case 9: |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | break; |
| | case 10: |
| | switch(properties->computeMinor) { |
| | case 1 : |
| | numBarriersAvailable = ctaLimitBlocks; |
| | break; |
| | case 0 : |
| | default : |
| | numBarriersAvailable = ctaLimitBlocks * 2; |
| | } |
| |
|
| | break; |
| | case 12: |
| | numBarriersAvailable = ctaLimitBlocks; |
| | break; |
| | default: |
| | return CUDA_OCC_ERROR_UNKNOWN_DEVICE; |
| | } |
| |
|
| | if (numBarriersUsed) { |
| | maxBlocks = numBarriersAvailable / numBarriersUsed; |
| | } |
| |
|
| | *limit = maxBlocks; |
| |
|
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
/*
 * Core occupancy calculator.
 *
 * Computes the maximum number of CTAs of `blockSize` threads, each using
 * `dynamicSmemSize` bytes of dynamic shared memory, that can be resident on
 * one multiprocessor. Fills *result with the per-resource block limits
 * (registers, shared memory, warps, CTA slots, barriers), the bitmask of
 * limiting factors, the selected partitioned-GC configuration, and the final
 * activeBlocksPerMultiprocessor value.
 *
 * Returns CUDA_OCC_ERROR_INVALID_INPUT on NULL pointers or non-positive
 * blockSize; otherwise propagates the status of the helper calculators.
 */
static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult *result,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    int blockSize,
    size_t dynamicSmemSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int ctaLimitWarps = 0;          // limit imposed by warp slots
    int ctaLimitBlocks = 0;         // hardware CTA-slot limit
    int ctaLimitSMem = 0;           // limit imposed by shared memory
    int ctaLimitRegs = 0;           // limit imposed by registers
    int ctaLimitBars = 0;           // limit imposed by block barriers
    int ctaLimit = 0;               // combined (minimum) limit
    unsigned int limitingFactors = 0;

    cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;

    if (!result || !properties || !attributes || !state || blockSize <= 0) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    ///////////////////////////
    // Check user input
    ///////////////////////////

    status = cudaOccInputCheck(properties, attributes, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    ///////////////////////////
    // Initialization
    ///////////////////////////

    // Start from the partitioned global-caching mode expected for this
    // kernel; the register-limit computation below may revise it.
    gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

    ///////////////////////////
    // Compute occupancy limits
    ///////////////////////////

    // Limit 1: registers
    status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Special case for compute 6.0: recompute the register limit as if the
    // device were compute 6.1 and, if the kernel would not fit under those
    // allocation rules, report zero occupancy.
    // NOTE(review): rationale (portability across the 6.x family?) inferred
    // from this recompute — confirm against the helper's definition.
    if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
        cudaOccDeviceProp propertiesGP10x;
        cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
        int ctaLimitRegsGP10x = 0;

        // Clone the device properties, overriding only the minor version
        memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
        propertiesGP10x.computeMinor = 1;

        status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        if (ctaLimitRegsGP10x == 0) {
            ctaLimitRegs = 0;
        }
    }

    // Limit 2: warp slots (uses the gcConfig possibly revised above)
    status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Limit 3: hardware CTA slots per multiprocessor
    status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Limit 4: shared memory (static + dynamic, honoring device state)
    status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    ///////////////////////////
    // Overall occupancy
    ///////////////////////////

    // The achievable occupancy is the most restrictive of the four limits
    ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

    // Record every resource that is exactly at the binding limit
    // (several factors can tie)
    if (ctaLimit == ctaLimitWarps) {
        limitingFactors |= OCC_LIMIT_WARPS;
    }
    if (ctaLimit == ctaLimitRegs) {
        limitingFactors |= OCC_LIMIT_REGISTERS;
    }
    if (ctaLimit == ctaLimitSMem) {
        limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
    }
    if (ctaLimit == ctaLimitBlocks) {
        limitingFactors |= OCC_LIMIT_BLOCKS;
    }

    // Block barriers are only a potential limiter on compute 9.0+ and only
    // when the kernel actually uses them
    if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
        // Limit 5: block barriers (scaled off the CTA-slot limit)
        status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, properties, attributes);
        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        // Fold the barrier limit into the overall limit
        ctaLimit = __occMin(ctaLimitBars, ctaLimit);

        // And record it if it is (now) the binding factor
        if (ctaLimit == ctaLimitBars) {
            limitingFactors |= OCC_LIMIT_BARRIERS;
        }
    }
    else {
        ctaLimitBars = INT_MAX;
    }

    ///////////////////////////
    // Fill in the result
    ///////////////////////////
    result->limitingFactors = limitingFactors;

    result->blockLimitRegs = ctaLimitRegs;
    result->blockLimitSharedMem = ctaLimitSMem;
    result->blockLimitWarps = ctaLimitWarps;
    result->blockLimitBlocks = ctaLimitBlocks;
    result->blockLimitBarriers = ctaLimitBars;
    result->partitionedGCConfig = gcConfig;

    // Final occupancy in blocks per multiprocessor
    result->activeBlocksPerMultiprocessor = ctaLimit;

    return CUDA_OCC_SUCCESS;
}
| |
|
| | static __OCC_INLINE |
| | cudaOccError cudaOccAvailableDynamicSMemPerBlock( |
| | size_t *bytesAvailable, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state, |
| | int numBlocks, |
| | int blockSize) |
| | { |
| | int allocationGranularity; |
| | size_t smemLimitPerBlock; |
| | size_t smemAvailableForDynamic; |
| | size_t userSmemPreference = 0; |
| | size_t sharedMemPerMultiprocessor; |
| | cudaOccResult result; |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| |
|
| | if (numBlocks <= 0) |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| |
|
| | |
| | |
| | status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| | |
| | |
| | if (result.activeBlocksPerMultiprocessor < numBlocks) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state); |
| | if (numBlocks == 1) { |
| | sharedMemPerMultiprocessor = smemLimitPerBlock; |
| | } |
| | else { |
| | if (!userSmemPreference) { |
| | userSmemPreference = 1 ; |
| | status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| | } |
| | sharedMemPerMultiprocessor = userSmemPreference; |
| | } |
| |
|
| | |
| | |
| | smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks; |
| | smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity; |
| |
|
| | |
| | |
| | if (smemAvailableForDynamic > smemLimitPerBlock) { |
| | smemAvailableForDynamic = smemLimitPerBlock; |
| | } |
| |
|
| | |
| | smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes; |
| |
|
| | |
| | |
| | if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes) |
| | smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes; |
| |
|
| | *bytesAvailable = smemAvailableForDynamic; |
| | return CUDA_OCC_SUCCESS; |
| | } |
| |
|
| | static __OCC_INLINE |
| | cudaOccError cudaOccMaxPotentialOccupancyBlockSize( |
| | int *minGridSize, |
| | int *blockSize, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state, |
| | size_t (*blockSizeToDynamicSMemSize)(int), |
| | size_t dynamicSMemSize) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | cudaOccResult result; |
| |
|
| | |
| | int occupancyLimit; |
| | int granularity; |
| | int blockSizeLimit; |
| |
|
| | |
| | int maxBlockSize = 0; |
| | int numBlocks = 0; |
| | int maxOccupancy = 0; |
| |
|
| | |
| | int blockSizeToTryAligned; |
| | int blockSizeToTry; |
| | int blockSizeLimitAligned; |
| | int occupancyInBlocks; |
| | int occupancyInThreads; |
| |
|
| | |
| | |
| | |
| |
|
| | if (!minGridSize || !blockSize || !properties || !attributes || !state) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | status = cudaOccInputCheck(properties, attributes, state); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | occupancyLimit = properties->maxThreadsPerMultiprocessor; |
| | granularity = properties->warpSize; |
| |
|
| | blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); |
| | blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); |
| |
|
| | for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { |
| | blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); |
| |
|
| | |
| | |
| | if (blockSizeToDynamicSMemSize) { |
| | dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry); |
| | } |
| |
|
| | status = cudaOccMaxActiveBlocksPerMultiprocessor( |
| | &result, |
| | properties, |
| | attributes, |
| | state, |
| | blockSizeToTry, |
| | dynamicSMemSize); |
| |
|
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | occupancyInBlocks = result.activeBlocksPerMultiprocessor; |
| | occupancyInThreads = blockSizeToTry * occupancyInBlocks; |
| |
|
| | if (occupancyInThreads > maxOccupancy) { |
| | maxBlockSize = blockSizeToTry; |
| | numBlocks = occupancyInBlocks; |
| | maxOccupancy = occupancyInThreads; |
| | } |
| |
|
| | |
| | |
| | if (occupancyLimit == maxOccupancy) { |
| | break; |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | *minGridSize = numBlocks * properties->numSms; |
| | *blockSize = maxBlockSize; |
| |
|
| | return status; |
| | } |
| |
|
| |
|
| | #if defined(__cplusplus) |
| |
|
| | namespace { |
| |
|
// C++ convenience overload: fixed dynamic shared memory requirement.
// Forwards to the full entry point with a NULL per-block-size callback,
// so `dynamicSMemSize` is applied unchanged to every candidate block size.
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int *minGridSize,
    int *blockSize,
    const cudaOccDeviceProp *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState *state,
    size_t dynamicSMemSize)
{
    return cudaOccMaxPotentialOccupancyBlockSize(
        minGridSize,
        blockSize,
        properties,
        attributes,
        state,
        NULL,
        dynamicSMemSize);
}
| |
|
| | template <typename UnaryFunction> |
| | __OCC_INLINE |
| | cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem( |
| | int *minGridSize, |
| | int *blockSize, |
| | const cudaOccDeviceProp *properties, |
| | const cudaOccFuncAttributes *attributes, |
| | const cudaOccDeviceState *state, |
| | UnaryFunction blockSizeToDynamicSMemSize) |
| | { |
| | cudaOccError status = CUDA_OCC_SUCCESS; |
| | cudaOccResult result; |
| |
|
| | |
| | int occupancyLimit; |
| | int granularity; |
| | int blockSizeLimit; |
| |
|
| | |
| | int maxBlockSize = 0; |
| | int numBlocks = 0; |
| | int maxOccupancy = 0; |
| |
|
| | |
| | int blockSizeToTryAligned; |
| | int blockSizeToTry; |
| | int blockSizeLimitAligned; |
| | int occupancyInBlocks; |
| | int occupancyInThreads; |
| | size_t dynamicSMemSize; |
| |
|
| | |
| | |
| | |
| |
|
| | if (!minGridSize || !blockSize || !properties || !attributes || !state) { |
| | return CUDA_OCC_ERROR_INVALID_INPUT; |
| | } |
| |
|
| | status = cudaOccInputCheck(properties, attributes, state); |
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | occupancyLimit = properties->maxThreadsPerMultiprocessor; |
| | granularity = properties->warpSize; |
| | blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); |
| | blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); |
| |
|
| | for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { |
| | blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); |
| |
|
| | dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); |
| |
|
| | status = cudaOccMaxActiveBlocksPerMultiprocessor( |
| | &result, |
| | properties, |
| | attributes, |
| | state, |
| | blockSizeToTry, |
| | dynamicSMemSize); |
| |
|
| | if (status != CUDA_OCC_SUCCESS) { |
| | return status; |
| | } |
| |
|
| | occupancyInBlocks = result.activeBlocksPerMultiprocessor; |
| |
|
| | occupancyInThreads = blockSizeToTry * occupancyInBlocks; |
| |
|
| | if (occupancyInThreads > maxOccupancy) { |
| | maxBlockSize = blockSizeToTry; |
| | numBlocks = occupancyInBlocks; |
| | maxOccupancy = occupancyInThreads; |
| | } |
| |
|
| | |
| | |
| | if (occupancyLimit == maxOccupancy) { |
| | break; |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | *minGridSize = numBlocks * properties->numSms; |
| | *blockSize = maxBlockSize; |
| |
|
| | return status; |
| | } |
| |
|
| | } |
| |
|
| | #endif |
| |
|
| | #undef __OCC_INLINE |
| |
|
| | #endif |
| |
|