/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */

#pragma once

#include <cub/detail/detect_cuda_runtime.cuh>
#include <cub/detail/device_synchronize.cuh>
#include <cub/util_type.cuh>
#include <cub/util_arch.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_cpp_dialect.cuh>
#include <cub/util_namespace.cuh>
#include <cub/util_macro.cuh>

#include <nv/target>

#include <atomic>
#include <array>
#include <cassert>

CUB_NAMESPACE_BEGIN

/**
 * \addtogroup UtilMgmt
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }
    bytes_needed += ALIGN_BYTES - 1;

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
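
// Usage sketch (illustrative only, not part of the public API): the typical two-phase
// pattern used by CUB's dispatch layers. On the first call the caller passes
// d_temp_storage == NULL, so AliasTemporaries only reports the required size; on the
// second call it carves the caller's buffer into aligned sub-allocations. The two
// allocation sizes below are hypothetical.
//
//     void*  allocations[2]      = {};
//     size_t allocation_sizes[2] = {num_tiles * sizeof(int),      // e.g. tile status
//                                   num_items * sizeof(double)};  // e.g. partial results
//
//     cudaError_t error = AliasTemporaries(d_temp_storage, temp_storage_bytes,
//                                          allocations, allocation_sizes);
//     if (error != cudaSuccess) return error;
//
//     // First pass: only the required size was requested
//     if (d_temp_storage == NULL) return cudaSuccess;
//
//     // Second pass: allocations[0] and allocations[1] now point into d_temp_storage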

/**
 * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }

#endif  // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Returns the current device or -1 if an error occurred.
 */
CUB_RUNTIME_FUNCTION inline int CurrentDevice()
{
    int device = -1;
    if (CubDebug(cudaGetDevice(&device))) return -1;
    return device;
}

/**
 * \brief RAII helper which saves the current device and switches to the
 *        specified device on construction and switches to the saved device on
 *        destruction.
 */
struct SwitchDevice
{
private:
    int const old_device;
    bool const needs_reset;

public:
    __host__ inline SwitchDevice(int new_device)
      : old_device(CurrentDevice()), needs_reset(old_device != new_device)
    {
        if (needs_reset)
            CubDebug(cudaSetDevice(new_device));
    }

    __host__ inline ~SwitchDevice()
    {
        if (needs_reset)
            CubDebug(cudaSetDevice(old_device));
    }
};
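
// Usage sketch (illustrative only): SwitchDevice scopes a temporary change of the current
// device. The device ordinal below is hypothetical.
//
//     {
//         SwitchDevice sd(peer_device);   // no-op if peer_device is already current
//         // ... CUDA runtime calls that must target peer_device ...
//     }   // destructor restores the previously current device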

/**
 * \brief Returns the number of CUDA devices available or -1 if an error
 *        occurred.
 */
CUB_RUNTIME_FUNCTION inline int DeviceCountUncached()
{
    int count = -1;
    if (CubDebug(cudaGetDeviceCount(&count)))
        // CUDA makes no guarantees about the state of the output parameter if
        // `cudaGetDeviceCount` fails; in practice it is left untouched, but out
        // of paranoia we'll reset `count` to `-1`.
        count = -1;
    return count;
}

/**
 * \brief Cache for an arbitrary value produced by a nullary function.
 */
template <typename T, T(*Function)()>
struct ValueCache
{
    T const value;

    /**
     * \brief Call the nullary function to produce the value and construct the
     *        cache.
     */
    __host__ inline ValueCache() : value(Function()) {}
};

// Host code, only safely usable in C++11 or newer, where thread-safe
// initialization of static locals is guaranteed. This is a separate function
// to avoid defining a local static in a host/device function.
__host__ inline int DeviceCountCachedValue()
{
    static ValueCache<int, DeviceCountUncached> cache;
    return cache.value;
}

/**
 * \brief Returns the number of CUDA devices available.
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION inline int DeviceCount()
{
    int result = -1;

    NV_IF_TARGET(NV_IS_HOST,
                 (result = DeviceCountCachedValue();),
                 (result = DeviceCountUncached();));

    return result;
}

/**
 * \brief Per-device cache for a CUDA attribute value; the attribute is queried
 *        and stored for each device upon construction.
 */
struct PerDeviceAttributeCache
{
    struct DevicePayload
    {
        int         attribute;
        cudaError_t error;
    };

    // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
    // `DeviceEntryInitializing` state, and then proceeds to the
    // `DeviceEntryReady` state. These are the only state transitions allowed;
    // i.e. the transitions form a linear sequence.
    enum DeviceEntryStatus
    {
        DeviceEntryEmpty = 0,
        DeviceEntryInitializing,
        DeviceEntryReady
    };

    struct DeviceEntry
    {
        std::atomic<DeviceEntryStatus> flag;
        DevicePayload                  payload;
    };

private:
    std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;

public:
    /**
     * \brief Construct the cache.
     */
    __host__ inline PerDeviceAttributeCache() : entries_()
    {
        assert(DeviceCount() <= CUB_MAX_DEVICES);
    }

    /**
     * \brief Retrieves the payload of the cached function \p f for \p device.
     *
     * \note You must pass a morally equivalent function in to every call or
     *       this function has undefined behavior.
     */
    template <typename Invocable>
    __host__ DevicePayload operator()(Invocable&& f, int device)
    {
        if (device >= DeviceCount())
            return DevicePayload{0, cudaErrorInvalidDevice};

        auto& entry   = entries_[device];
        auto& flag    = entry.flag;
        auto& payload = entry.payload;

        DeviceEntryStatus old_status = DeviceEntryEmpty;

        // First, check for the common case of the entry being ready.
        if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
        {
            // Assume the entry is empty and attempt to lock it so we can fill
            // it by trying to set the state from `DeviceEntryEmpty` to
            // `DeviceEntryInitializing`.
            if (flag.compare_exchange_strong(old_status, DeviceEntryInitializing,
                                             std::memory_order_acq_rel,
                                             std::memory_order_acquire))
            {
                // We successfully set the state to `DeviceEntryInitializing`;
                // we have the lock and it's our job to initialize this entry
                // and then release it.

                // We don't use `CubDebug` here because we let the user code
                // decide whether or not errors are hard errors.
                payload.error = ::cuda::std::forward<Invocable>(f)(payload.attribute);
                if (payload.error)
                    // Clear the global CUDA error state which may have been
                    // set by the last call. Otherwise, errors may "leak" to
                    // unrelated kernel launches.
                    cudaGetLastError();

                // Release the lock by setting the state to `DeviceEntryReady`.
                flag.store(DeviceEntryReady, std::memory_order_release);
            }

            // If the `compare_exchange_strong` failed, then `old_status` has
            // been updated with the value of `flag` that it observed.

            else if (old_status == DeviceEntryInitializing)
            {
                // Another execution agent is initializing this entry; we need
                // to wait for them to finish; we'll know they're done when we
                // observe the entry status as `DeviceEntryReady`.
                do { old_status = flag.load(std::memory_order_acquire); }
                while (old_status != DeviceEntryReady);
                // FIXME: Use `atomic::wait` instead when we have access to
                // host-side C++20 atomics. We could use libcu++, but it only
                // supports atomics for SM60 and up, even if you're only using
                // them in host code.
            }
        }

        // We now know that the state of our entry is `DeviceEntryReady`, so
        // just return the entry's payload.
        return entry.payload;
    }
};
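
// Usage sketch (illustrative only): one cache instance per attribute, with the same query
// functor passed on every call (see the "morally equivalent" note above). The attribute
// queried here is hypothetical; PtxVersion/SmVersion below are the real users.
//
//     // Typically a function-local static, as in GetPerDeviceAttributeCache below.
//     static PerDeviceAttributeCache warp_size_cache;
//
//     auto payload = warp_size_cache(
//         [] (int &value)
//         { return cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, 0); },
//         /* device = */ 0);
//
//     if (payload.error == cudaSuccess)
//     {
//         int warp_size = payload.attribute;   // served from the cache on later calls
//     }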

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
 */
CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version)
{
    // Instantiate `EmptyKernel<void>` in both host and device code to ensure
    // it can be called.
    typedef void (*EmptyKernelPtr)();
    EmptyKernelPtr empty_kernel = EmptyKernel<void>;

    // This is necessary for unused variable warnings in host compilers. The
    // usual syntax of (void)empty_kernel; was not sufficient on MSVC2015.
    (void)reinterpret_cast<void*>(empty_kernel);

    // Define a temporary macro that expands to the current target ptx version
    // in device code.
    // <nv/target> may provide an abstraction for this eventually. For now,
    // we have to keep this usage of __CUDA_ARCH__.
#if defined(_NVHPC_CUDA)
#define CUB_TEMP_GET_PTX __builtin_current_device_sm()
#else
#define CUB_TEMP_GET_PTX __CUDA_ARCH__
#endif

    cudaError_t result = cudaSuccess;
    NV_IF_TARGET(
      NV_IS_HOST,
      (
        cudaFuncAttributes empty_kernel_attrs;

        result = cudaFuncGetAttributes(&empty_kernel_attrs,
                                       reinterpret_cast<void*>(empty_kernel));
        CubDebug(result);

        ptx_version = empty_kernel_attrs.ptxVersion * 10;
      ),
      // NV_IS_DEVICE
      (
        // This is necessary to ensure instantiation of EmptyKernel in device
        // code. The `reinterpret_cast` is necessary to suppress a
        // set-but-unused warning. This is a meme now:
        // https://twitter.com/blelbach/status/1222391615576100864
        (void)reinterpret_cast<void*>(empty_kernel);

        ptx_version = CUB_TEMP_GET_PTX;
      ));

#undef CUB_TEMP_GET_PTX

    return result;
}

/**
 * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
 */
__host__ inline cudaError_t PtxVersionUncached(int& ptx_version, int device)
{
    SwitchDevice sd(device);
    (void)sd;
    return PtxVersionUncached(ptx_version);
}

template <typename Tag>
__host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache()
{
    // C++11 guarantees that initialization of static locals is thread safe.
    static PerDeviceAttributeCache cache;
    return cache;
}

struct PtxVersionCacheTag {};
struct SmVersionCacheTag {};

/**
 * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
__host__ inline cudaError_t PtxVersion(int& ptx_version, int device)
{
    auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
      // If this call fails, then we get the error code back in the payload,
      // which we check with `CubDebug` below.
      [=] (int& pv) { return PtxVersionUncached(pv, device); },
      device);

    if (!CubDebug(payload.error))
        ptx_version = payload.attribute;

    return payload.error;
}

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version)
{
    cudaError_t result = cudaErrorUnknown;
    NV_IF_TARGET(
      NV_IS_HOST,
      (
        auto const device  = CurrentDevice();
        auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
          // If this call fails, then we get the error code back in the
          // payload, which we check with `CubDebug` below.
          [=](int &pv) { return PtxVersionUncached(pv, device); },
          device);

        if (!CubDebug(payload.error))
        {
          ptx_version = payload.attribute;
        }

        result = payload.error;
      ),
      ( // NV_IS_DEVICE:
        result = PtxVersionUncached(ptx_version);
      ));

    return result;
}
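
// Usage sketch (illustrative only): dispatch layers typically query the cached PTX version
// once and branch on it when selecting tuning parameters. The threshold below is hypothetical.
//
//     int ptx_version = 0;
//     cudaError_t error = PtxVersion(ptx_version);
//     if (error != cudaSuccess) return error;
//
//     if (ptx_version >= 600) { /* use an SM60+ tuning */ }
//     else                    { /* use the fallback tuning */ }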

/**
 * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
{
    cudaError_t error = cudaSuccess;
    do
    {
        int major = 0, minor = 0;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;
}

/**
 * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version, int device = CurrentDevice())
{
    cudaError_t result = cudaErrorUnknown;

    NV_IF_TARGET(
      NV_IS_HOST,
      (
        auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
          // If this call fails, then we get the error code back in
          // the payload, which we check with `CubDebug` below.
          [=](int &pv) { return SmVersionUncached(pv, device); },
          device);

        if (!CubDebug(payload.error))
        {
          sm_version = payload.attribute;
        }

        result = payload.error;
      ),
      ( // NV_IS_DEVICE
        result = SmVersionUncached(sm_version, device);
      ));

    return result;
}

/**
 * Synchronize the specified \p stream.
 */
CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
{
    cudaError_t result = cudaErrorNotSupported;

    NV_IF_TARGET(NV_IS_HOST,
                 (result = CubDebug(cudaStreamSynchronize(stream));),
                 ((void)stream;
                  result = CubDebug(cub::detail::device_synchronize());));

    return result;
}

namespace detail
{

/**
 * Same as SyncStream, but intended for use with the debug_synchronous flags
 * in device algorithms. This should not be used if synchronization is required
 * for correctness.
 *
 * If `debug_synchronous` is false, this function will immediately return
 * cudaSuccess. If true, one of the following will occur:
 *
 * If synchronization is supported by the current compilation target and
 * settings, the sync is performed and the sync result is returned.
 *
 * If syncs are not supported then no sync is performed, but a message is
 * logged via _CubLog and cudaSuccess is returned.
 */
CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
{
#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC

  (void)stream;
  return cudaSuccess;

#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:

#define CUB_TMP_SYNC_AVAILABLE                                                 \
  _CubLog("%s\n", "Synchronizing...");                                         \
  return SyncStream(stream)

#define CUB_TMP_DEVICE_SYNC_UNAVAILABLE                                        \
  (void)stream;                                                                \
  _CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
          "device-side sync requires <sm_90, RDC, and CDPv1>");                \
  return cudaSuccess

  NV_IF_TARGET(NV_IS_HOST,
               (CUB_TMP_SYNC_AVAILABLE;),
               (CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));

#undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
#undef CUB_TMP_SYNC_AVAILABLE

#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
}

} // namespace detail
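
// Usage sketch (illustrative only): device algorithms typically launch a kernel and then,
// when `debug_synchronous` is requested, call detail::DebugSyncStream to surface launch
// and runtime errors early. The kernel and its arguments below are hypothetical.
//
//     example_kernel<<<grid_size, block_threads, 0, stream>>>(d_in, d_out, num_items);
//
//     // Check for failed launch
//     cudaError_t error = CubDebug(cudaPeekAtLastError());
//     if (error != cudaSuccess) return error;
//
//     // Optionally sync the stream for debugging
//     error = detail::DebugSyncStream(stream);
//     if (error != cudaSuccess) return error;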

/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
 *
 * \par Snippet
 * The code snippet below illustrates the use of the MaxSmOccupancy function.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
 *
 * template <typename T>
 * __global__ void ExampleKernel()
 * {
 *     // Allocate shared memory for BlockScan
 *     __shared__ volatile T buffer[4096];
 *
 *     ...
 * }
 *
 * ...
 *
 * // Determine SM occupancy for ExampleKernel specialized for unsigned char
 * int max_sm_occupancy;
 * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
 *
 * // max_sm_occupancy  <-- 4 on SM10
 * // max_sm_occupancy  <-- 8 on SM20
 * // max_sm_occupancy  <-- 12 on SM35
 *
 * \endcode
 *
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION inline
cudaError_t MaxSmOccupancy(
    int&        max_sm_occupancy,           ///< [out] maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads,              ///< [in] Number of threads per thread block
    int         dynamic_smem_bytes = 0)     ///< [in] Dynamically allocated shared memory in bytes. Default is 0.
{
    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes));
}


/******************************************************************************
 * Policy management
 ******************************************************************************/

/**
 * Kernel dispatch configuration
 */
struct KernelConfig
{
    int block_threads;
    int items_per_thread;
    int tile_size;
    int sm_occupancy;

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads      = AgentPolicyT::BLOCK_THREADS;
        items_per_thread   = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size          = block_threads * items_per_thread;
        cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
        return retval;
    }
};

/// Helper for dispatching into a policy chain
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    using ActivePolicy =
      cub::detail::conditional_t<(CUB_PTX_ARCH < PTX_VERSION),
                                 typename PrevPolicyT::ActivePolicy,
                                 PolicyT>;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT& op)
    {
        if (ptx_version < PTX_VERSION) {
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};

/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op)
    {
        return op.template Invoke<PolicyT>();
    }
};


/** @} */  // end group UtilMgmt

CUB_NAMESPACE_END
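
// Usage sketch (illustrative only): a dispatch layer typically declares a chain of tuning
// policies keyed by PTX version and walks it with ChainedPolicy::Invoke. The policy types,
// thresholds, and functor below are hypothetical.
//
//     struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>   // end of chain
//     {
//         struct AgentPolicy { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 }; };
//     };
//     struct Policy600 : cub::ChainedPolicy<600, Policy600, Policy350>
//     {
//         struct AgentPolicy { enum { BLOCK_THREADS = 256, ITEMS_PER_THREAD = 8 }; };
//     };
//     using MaxPolicy = Policy600;
//
//     // At dispatch time: query the PTX version, then invoke the functor with the
//     // newest policy whose PTX_VERSION does not exceed it.
//     int ptx_version = 0;
//     cudaError_t error = cub::PtxVersion(ptx_version);
//     if (error != cudaSuccess) return error;
//
//     MyDispatchFunctor op(d_in, d_out, num_items, stream);  // provides op.Invoke<ActivePolicyT>()
//     return MaxPolicy::Invoke(ptx_version, op);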