//===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 # error "CUDA synchronization primitives are only supported for sm_70 and up." #endif #ifndef _CUDA_BARRIER #define _CUDA_BARRIER #include "atomic" #include "cstddef" #include "detail/__config" #include "detail/__pragma_push" #include "detail/libcxx/include/barrier" _LIBCUDACXX_BEGIN_NAMESPACE_CUDA template struct aligned_size_t { static constexpr std::size_t align = _Alignment; std::size_t value; __host__ __device__ explicit aligned_size_t(size_t __s) : value(__s) { } __host__ __device__ operator size_t() const { return value; } }; template class barrier : public std::__barrier_base<_CompletionF, _Sco> { template friend class pipeline; using std::__barrier_base<_CompletionF, _Sco>::__try_wait; public: barrier() = default; barrier(const barrier &) = delete; barrier & operator=(const barrier &) = delete; _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR barrier(std::ptrdiff_t __expected, _CompletionF __completion = _CompletionF()) : std::__barrier_base<_CompletionF, _Sco>(__expected, __completion) { } _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, std::ptrdiff_t __expected) { new (__b) barrier(__expected); } _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, std::ptrdiff_t __expected, _CompletionF __completion) { new (__b) barrier(__expected, __completion); } }; struct __block_scope_barrier_base {}; _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE __device__ inline std::uint64_t * barrier_native_handle(barrier & b); _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA template<> class barrier : public __block_scope_barrier_base { using __barrier_base = std::__barrier_base; __barrier_base __barrier; __device__ friend inline std::uint64_t * device::_LIBCUDACXX_CUDA_ABI_NAMESPACE::barrier_native_handle(barrier & b); public: using arrival_token = typename __barrier_base::arrival_token; private: struct __poll_tester { barrier const* __this; arrival_token __phase; _LIBCUDACXX_INLINE_VISIBILITY __poll_tester(barrier const* __this_, arrival_token&& __phase_) : __this(__this_) , __phase(_CUDA_VSTD::move(__phase_)) {} inline _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { return __this->__try_wait(__phase); } }; _LIBCUDACXX_INLINE_VISIBILITY bool __try_wait(arrival_token __phase) const { #if __CUDA_ARCH__ >= 800 if (__isShared(&__barrier)) { int __ready = 0; asm volatile ("{\n\t" ".reg .pred p;\n\t" "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t" "selp.b32 %0, 1, 0, p;\n\t" "}" : "=r"(__ready) : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "l"(__phase) : "memory"); return __ready; } else #endif { return __barrier.__try_wait(std::move(__phase)); } } template friend class pipeline; public: barrier() = default; barrier(const barrier &) = delete; barrier & operator=(const barrier &) = delete; _LIBCUDACXX_INLINE_VISIBILITY barrier(std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) { static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier, __barrier), "fatal error: bad barrier layout"); init(this, __expected, __completion); } _LIBCUDACXX_INLINE_VISIBILITY ~barrier() { #if __CUDA_ARCH__ >= 800 if (__isShared(&__barrier)) { asm volatile ("mbarrier.inval.shared.b64 [%0];" :: "r"(static_cast(__cvta_generic_to_shared(&__barrier))) : "memory"); } #endif } _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) { #if __CUDA_ARCH__ >= 800 if (__isShared(&__b->__barrier)) { asm volatile ("mbarrier.init.shared.b64 [%0], %1;" :: "r"(static_cast(__cvta_generic_to_shared(&__b->__barrier))), "r"(static_cast(__expected)) : "memory"); } else #endif { new (&__b->__barrier) __barrier_base(__expected); } } _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY arrival_token arrive(std::ptrdiff_t __update = 1) { #if __CUDA_ARCH__ if (__isShared(&__barrier)) { arrival_token __token; #if __CUDA_ARCH__ >= 800 if (__update > 1) { asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" : "=l"(__token) : "r"(static_cast(__cvta_generic_to_shared(&__barrier))), "r"(static_cast(__update - 1)) : "memory"); } asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" : "=l"(__token) : "r"(static_cast(__cvta_generic_to_shared(&__barrier))) : "memory"); #else unsigned int __activeA = __match_any_sync(__activemask(), __update); unsigned int __activeB = __match_any_sync(__activemask(), reinterpret_cast(&__barrier)); unsigned int __active = __activeA & __activeB; int __inc = __popc(__active) * __update; unsigned __laneid; asm volatile ("mov.u32 %0, %laneid;" : "=r"(__laneid)); int __leader = __ffs(__active) - 1; if(__leader == __laneid) { __token = __barrier.arrive(__inc); } __token = __shfl_sync(__active, __token, __leader); #endif return __token; } else #endif { return __barrier.arrive(__update); } } _LIBCUDACXX_INLINE_VISIBILITY void wait(arrival_token && __phase) const { _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase))); } inline _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_wait() { wait(arrive()); } _LIBCUDACXX_INLINE_VISIBILITY void arrive_and_drop() { #if __CUDA_ARCH__ >= 800 if (__isShared(&__barrier)) { asm volatile ("mbarrier.arrive_drop.shared.b64 _, [%0];" :: "r"(static_cast(__cvta_generic_to_shared(&__barrier))) : "memory"); } else #endif { __barrier.arrive_and_drop(); } } _LIBCUDACXX_INLINE_VISIBILITY static constexpr ptrdiff_t max() noexcept { return (1 << 20) - 1; } }; _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE __device__ inline std::uint64_t * barrier_native_handle(barrier & b) { return reinterpret_cast(&b.__barrier); } _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA template<> class barrier : private barrier { using __base = barrier; public: using __base::__base; _LIBCUDACXX_INLINE_VISIBILITY friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) { init(static_cast<__base *>(__b), __expected, __completion); } using __base::arrive; using __base::wait; using __base::arrive_and_wait; using __base::arrive_and_drop; using __base::max; }; template _LIBCUDACXX_INLINE_VISIBILITY inline void __strided_memcpy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride = 1) { if (__stride == 1) { memcpy(__destination, __source, __total_size); } else { for (std::size_t __offset = __rank * _Alignment; __offset < __total_size; __offset += __stride * _Alignment) { memcpy(__destination + __offset, __source + __offset, _Alignment); } } } #if __CUDA_ARCH__ >= 800 template 16)> struct __memcpy_async_impl { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { __strided_memcpy<_Alignment>(__destination, __source, __total_size, __rank, __stride); return false; } }; template<> struct __memcpy_async_impl<4, false> { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) { asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;" :: "r"(static_cast(__cvta_generic_to_shared(__destination + __offset))), "l"(__source + __offset) : "memory"); } return true; } }; template<> struct __memcpy_async_impl<8, false> { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) { asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;" :: "r"(static_cast(__cvta_generic_to_shared(__destination + __offset))), "l"(__source + __offset) : "memory"); } return true; } }; template<> struct __memcpy_async_impl<16, false> { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) { asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;" :: "r"(static_cast(__cvta_generic_to_shared(__destination + __offset))), "l"(__source + __offset) : "memory"); } return true; } }; template struct __memcpy_async_impl<_Alignment, true> : public __memcpy_async_impl<16, false> { }; #endif template= thread_scope_block) && std::is_same<_CompF, std::__empty_completion>::value> _LIBCUDACXX_INLINE_VISIBILITY inline void __memcpy_async_synchronize(barrier<_Sco, _CompF> & __barrier, bool __is_async) { #if __CUDA_ARCH__ >= 800 if (__is_async) { if (_Is_mbarrier && __isShared(&__barrier)) { asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];" :: "r"(static_cast(__cvta_generic_to_shared(&__barrier))) : "memory"); } else { asm volatile ("cp.async.wait_all;" ::: "memory"); } } #endif } template _LIBCUDACXX_INLINE_VISIBILITY void inline __memcpy_async(_Group const & __group, char * __destination, char const * __source, std::size_t __size, _Sync & __sync) { bool __is_async = false; #if __CUDA_ARCH__ >= 800 __is_async = __isShared(__destination) && __isGlobal(__source); if (__is_async) { if (_Native_alignment < 4) { auto __source_address = reinterpret_cast(__source); auto __destination_address = reinterpret_cast(__destination); // Lowest bit set will tell us what the common alignment of the three values is. auto _Alignment = __ffs(__source_address | __destination_address | __size); switch (_Alignment) { default: __is_async = __memcpy_async_impl<16>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break; case 4: __is_async = __memcpy_async_impl<8>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break; case 3: __is_async = __memcpy_async_impl<4>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break; case 2: // fallthrough case 1: __is_async = __memcpy_async_impl<1>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break; } } else { __is_async = __memcpy_async_impl<_Native_alignment>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); } } else #endif { __strided_memcpy<_Native_alignment>(__destination, __source, __size, __group.thread_rank(), __group.size()); } __memcpy_async_synchronize(__sync, __is_async); } struct __single_thread_group { _LIBCUDACXX_INLINE_VISIBILITY void sync() const {} _LIBCUDACXX_INLINE_VISIBILITY constexpr std::size_t size() const { return 1; }; _LIBCUDACXX_INLINE_VISIBILITY constexpr std::size_t thread_rank() const { return 0; }; }; template _LIBCUDACXX_INLINE_VISIBILITY void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) { // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with // memcpy_async when compiling with GCC 4.8. // FIXME: remove the #if once GCC 4.8 is no longer supported. #if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408 static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type"); #endif __memcpy_async(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } template _Alignment) ? alignof(_Tp) : _Alignment> _LIBCUDACXX_INLINE_VISIBILITY void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) { // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with // memcpy_async when compiling with GCC 4.8. // FIXME: remove the #if once GCC 4.8 is no longer supported. #if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408 static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type"); #endif __memcpy_async<_Larger_alignment>(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } template _LIBCUDACXX_INLINE_VISIBILITY void memcpy_async(_Tp * __destination, _Tp const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) { memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier); } template _LIBCUDACXX_INLINE_VISIBILITY void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) { __memcpy_async<1>(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } template _LIBCUDACXX_INLINE_VISIBILITY void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) { __memcpy_async<_Alignment>(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __barrier); } template _LIBCUDACXX_INLINE_VISIBILITY void memcpy_async(void * __destination, void const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) { memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier); } _LIBCUDACXX_END_NAMESPACE_CUDA #include "detail/__pragma_pop" #endif //_CUDA_BARRIER