| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 |
| # error "CUDA synchronization primitives are only supported for sm_70 and up." |
| #endif |
|
|
| #ifndef _CUDA_BARRIER |
| #define _CUDA_BARRIER |
|
|
| #include "atomic" |
| #include "cstddef" |
|
|
| #include "detail/__config" |
|
|
| #include "detail/__pragma_push" |
|
|
| #include "detail/libcxx/include/barrier" |
|
|
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
|
|
// Size wrapper carrying a compile-time alignment guarantee of _Alignment bytes.
// The memcpy_async overloads below use the wrapped alignment to select a wider
// native copy path (see the aligned_size_t overloads of memcpy_async).
// Implicitly converts back to std::size_t so it can be used wherever a plain
// byte count is expected.
//
// Fix: the original mixed unqualified `size_t` and `std::size_t` in the same
// struct; uses are now consistently `std::size_t`.
template<std::size_t _Alignment>
struct aligned_size_t {
    static constexpr std::size_t align = _Alignment;
    std::size_t value;  // the wrapped byte count
    __host__ __device__
    explicit aligned_size_t(std::size_t __s) : value(__s) { }
    __host__ __device__
    operator std::size_t() const { return value; }
};
|
|
// cuda::barrier: a std::barrier-like synchronization object with an explicit
// thread scope (_Sco) and optional completion function. All arrival/wait logic
// is inherited from the libcu++ software implementation std::__barrier_base.
template<thread_scope _Sco, class _CompletionF = std::__empty_completion>
class barrier : public std::__barrier_base<_CompletionF, _Sco> {
    // cuda::pipeline is granted access to the barrier internals (__try_wait).
    template<thread_scope>
    friend class pipeline;

    using std::__barrier_base<_CompletionF, _Sco>::__try_wait;

public:
    // Trivially default-constructible; use the init() friends below to set the
    // expected arrival count afterwards (placement-new based initialization).
    barrier() = default;

    // Non-copyable, like std::barrier.
    barrier(const barrier &) = delete;
    barrier & operator=(const barrier &) = delete;

    // Construct with the number of expected arrivals per phase and an optional
    // completion functor invoked when a phase completes.
    _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
    barrier(std::ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
        : std::__barrier_base<_CompletionF, _Sco>(__expected, __completion) {
    }

    // In-place initialization of a default-constructed barrier.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected) {
        new (__b) barrier(__expected);
    }

    // In-place initialization with an explicit completion functor.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, _CompletionF __completion) {
        new (__b) barrier(__expected, __completion);
    }
};
|
|
// Empty tag type; the block-scope barrier specialization below derives from it
// so block-scope barriers can be recognized by type elsewhere in the library.
struct __block_scope_barrier_base {};
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA |
| |
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE |
| |
// Forward declaration: returns a pointer to the raw 64-bit word backing a
// block-scope barrier, for use with hand-written PTX (definition below).
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b);
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE |
|
|
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
|
|
// Block-scope barrier with the default (empty) completion. On SM80+ devices,
// when the object lives in shared memory, it is backed by the hardware
// mbarrier and driven with inline PTX; in all other cases it falls back to the
// software std::__barrier_base. The sole data member must sit at offset zero
// (asserted in the constructor) so the object aliases the native 64-bit
// mbarrier word exposed by barrier_native_handle().
template<>
class barrier<thread_scope_block, std::__empty_completion> : public __block_scope_barrier_base {
    using __barrier_base = std::__barrier_base<std::__empty_completion, (int)thread_scope_block>;
    __barrier_base __barrier;

    // Grants cuda::device::barrier_native_handle access to the raw storage.
    __device__
    friend inline std::uint64_t * device::_LIBCUDACXX_CUDA_ABI_NAMESPACE::barrier_native_handle(barrier<thread_scope_block> & b);

public:
    using arrival_token = typename __barrier_base::arrival_token;

private:
    // Callable polled by __libcpp_thread_poll_with_backoff in wait(): returns
    // true once the captured phase token has completed.
    struct __poll_tester {
        barrier const* __this;
        arrival_token __phase;

        _LIBCUDACXX_INLINE_VISIBILITY
        __poll_tester(barrier const* __this_, arrival_token&& __phase_)
          : __this(__this_)
          , __phase(_CUDA_VSTD::move(__phase_))
        {}

        inline _LIBCUDACXX_INLINE_VISIBILITY
        bool operator()() const
        {
            return __this->__try_wait(__phase);
        }
    };

    // Non-blocking test for whether the phase identified by __phase is done.
    _LIBCUDACXX_INLINE_VISIBILITY
    bool __try_wait(arrival_token __phase) const {
#if __CUDA_ARCH__ >= 800
        // SM80+ and in shared memory: query the hardware mbarrier directly.
        if (__isShared(&__barrier)) {
            int __ready = 0;
            asm volatile ("{\n\t"
                    ".reg .pred p;\n\t"
                    "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t"
                    "selp.b32 %0, 1, 0, p;\n\t"
                    "}"
                : "=r"(__ready)
                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__phase)
                : "memory");
            return __ready;
        }
        else
#endif
        {
            // Host, pre-SM80, or non-shared object: software fallback.
            return __barrier.__try_wait(std::move(__phase));
        }
    }

    template<thread_scope>
    friend class pipeline;

public:
    // Trivially default-constructible; use the friend init() to initialize.
    barrier() = default;

    barrier(const barrier &) = delete;
    barrier & operator=(const barrier &) = delete;

    _LIBCUDACXX_INLINE_VISIBILITY
    barrier(std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
        // init() and barrier_native_handle() rely on __barrier being the sole
        // member at offset zero of this type.
        static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier<thread_scope_block>, __barrier), "fatal error: bad barrier layout");
        init(this, __expected, __completion);
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    ~barrier() {
#if __CUDA_ARCH__ >= 800
        // Invalidate the hardware mbarrier so stale state cannot be observed
        // if this shared-memory slot is later reused.
        if (__isShared(&__barrier)) {
            asm volatile ("mbarrier.inval.shared.b64 [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
#endif
    }

    // In-place initialization with the expected arrival count. The completion
    // parameter exists for interface symmetry; it carries no state here.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
#if __CUDA_ARCH__ >= 800
        // SM80+ shared memory: initialize the hardware mbarrier with the
        // expected arrival count.
        if (__isShared(&__b->__barrier)) {
            asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))),
                    "r"(static_cast<std::uint32_t>(__expected))
                : "memory");
        }
        else
#endif
        {
            // Fallback: construct the software barrier in place.
            new (&__b->__barrier) __barrier_base(__expected);
        }
    }

    // Registers __update arrivals on the current phase and returns a token
    // identifying that phase, to be consumed by wait().
    _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
    arrival_token arrive(std::ptrdiff_t __update = 1)
    {
#if __CUDA_ARCH__
        if (__isShared(&__barrier)) {
            arrival_token __token;
#if __CUDA_ARCH__ >= 800
            // SM80+: account for __update - 1 arrivals without completing the
            // phase, then perform the final (potentially completing) arrival.
            if (__update > 1) {
                asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
                    : "=l"(__token)
                    : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))),
                        "r"(static_cast<std::uint32_t>(__update - 1))
                    : "memory");
            }
            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
                : "=l"(__token)
                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
#else
            // Pre-SM80: aggregate arrivals across the currently-active lanes
            // that target this same barrier with this same __update, so only
            // one lane performs the (comparatively expensive) arrive.
            unsigned int __activeA = __match_any_sync(__activemask(), __update);
            unsigned int __activeB = __match_any_sync(__activemask(), reinterpret_cast<std::uintptr_t>(&__barrier));
            unsigned int __active = __activeA & __activeB;
            int __inc = __popc(__active) * __update;

            unsigned __laneid;
            asm volatile ("mov.u32 %0, %laneid;" : "=r"(__laneid));
            int __leader = __ffs(__active) - 1;  // lowest active lane leads

            // Leader performs the combined arrival for the whole group...
            if(__leader == __laneid)
            {
                __token = __barrier.arrive(__inc);
            }
            // ...and broadcasts the resulting token to the other lanes.
            __token = __shfl_sync(__active, __token, __leader);
#endif
            return __token;
        }
        else
#endif
        {
            // Host code or non-shared object: software barrier.
            return __barrier.arrive(__update);
        }
    }

    // Blocks (polling with backoff) until the phase named by __phase completes.
    _LIBCUDACXX_INLINE_VISIBILITY
    void wait(arrival_token && __phase) const
    {
        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
    }

    inline _LIBCUDACXX_INLINE_VISIBILITY
    void arrive_and_wait()
    {
        wait(arrive());
    }

    // Arrives once and permanently decrements the expected count used by
    // subsequent phases.
    _LIBCUDACXX_INLINE_VISIBILITY
    void arrive_and_drop()
    {
#if __CUDA_ARCH__ >= 800
        if (__isShared(&__barrier)) {
            asm volatile ("mbarrier.arrive_drop.shared.b64 _, [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
        else
#endif
        {
            __barrier.arrive_and_drop();
        }
    }

    // Maximum supported expected-arrival count. (1 << 20) - 1 appears to match
    // the hardware mbarrier count limit — NOTE(review): confirm against the
    // PTX ISA mbarrier documentation.
    _LIBCUDACXX_INLINE_VISIBILITY
    static constexpr ptrdiff_t max() noexcept
    {
        return (1 << 20) - 1;
    }
};
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA |
| |
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE |
| |
// Exposes the storage of a block-scope barrier as a raw 64-bit word (the
// hardware mbarrier word when the object lives in shared memory on SM80+),
// allowing interoperation with hand-written PTX.
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b) {
    return reinterpret_cast<std::uint64_t *>(&b.__barrier);
}
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE |
|
|
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
|
|
// Thread-scope barrier: privately reuses the block-scope implementation and
// re-exposes only its public API, so a single-thread barrier shares the same
// storage/layout and code paths as the block-scope one.
template<>
class barrier<thread_scope_thread, std::__empty_completion> : private barrier<thread_scope_block> {
    using __base = barrier<thread_scope_block>;

public:
    using __base::__base;

    // Forwards to the block-scope init(), treating this object as its base.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
        init(static_cast<__base *>(__b), __expected, __completion);
    }

    using __base::arrive;
    using __base::wait;
    using __base::arrive_and_wait;
    using __base::arrive_and_drop;
    using __base::max;
};
|
|
// Cooperative synchronous copy of __total_size bytes, partitioned in
// _Alignment-byte chunks across __stride participants: participant __rank
// copies chunks __rank, __rank + __stride, __rank + 2*__stride, ...
// With a single participant (__stride == 1) the whole range is copied in one
// memcpy call.
template<std::size_t _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __strided_memcpy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride = 1) {
    // Fast path: a lone participant copies everything at once.
    if (__stride == 1) {
        memcpy(__destination, __source, __total_size);
        return;
    }
    // Strided path: each participant handles every __stride-th chunk.
    std::size_t __i = __rank * _Alignment;
    while (__i < __total_size) {
        memcpy(__destination + __i, __source + __i, _Alignment);
        __i += __stride * _Alignment;
    }
}
|
|
| #if __CUDA_ARCH__ >= 800 |
// Primary template: alignments with no cp.async instruction (< 4 bytes) fall
// back to a synchronous strided copy. Returns false to signal that no
// asynchronous copy was issued.
template<std::size_t _Alignment, bool _Large = (_Alignment > 16)>
struct __memcpy_async_impl {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        __strided_memcpy<_Alignment>(__destination, __source, __total_size, __rank, __stride);
        return false;
    }
};
|
|
// 4-byte-aligned path: issues one cp.async.ca (cache-all) 4-byte asynchronous
// global-to-shared copy per chunk, strided across participants. Returns true
// to signal that asynchronous copies are in flight.
template<>
struct __memcpy_async_impl<4, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                    "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
|
|
// 8-byte-aligned path: cp.async.ca 8-byte asynchronous global-to-shared
// copies, strided across participants. Returns true (async copies in flight).
template<>
struct __memcpy_async_impl<8, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                    "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
|
|
// 16-byte-aligned path: cp.async.cg (cache-global, L1-bypassing) 16-byte
// asynchronous copies, strided across participants. Returns true (async
// copies in flight).
template<>
struct __memcpy_async_impl<16, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
            asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                    "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
|
|
// Alignments greater than 16 bytes reuse the 16-byte cp.async path (16 is the
// widest cp.async copy size used here).
template<std::size_t _Alignment>
struct __memcpy_async_impl<_Alignment, true> : public __memcpy_async_impl<16, false> { };
| #endif |
|
|
// Completes a (possibly asynchronous) memcpy_async against a barrier.
// _Is_mbarrier is true when the barrier can be backed by a hardware mbarrier:
// at least block scope with the default empty completion.
// If cp.async copies were issued (__is_async) and the barrier is a shared
// mbarrier, their completion is chained to the barrier via
// cp.async.mbarrier.arrive; otherwise this thread waits for all of its
// outstanding cp.async operations. On pre-SM80 (or host) compilations this is
// a no-op — the copy was performed synchronously.
template<thread_scope _Sco, typename _CompF, bool _Is_mbarrier = (_Sco >= thread_scope_block) && std::is_same<_CompF, std::__empty_completion>::value>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __memcpy_async_synchronize(barrier<_Sco, _CompF> & __barrier, bool __is_async) {
#if __CUDA_ARCH__ >= 800
    if (__is_async) {
        if (_Is_mbarrier && __isShared(&__barrier)) {
            // Hand the async copies' completion to the hardware mbarrier.
            asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
        else {
            // No mbarrier available: block until all cp.async ops complete.
            asm volatile ("cp.async.wait_all;"
                ::: "memory");
        }
    }
#endif
}
|
|
// Core memcpy_async implementation: copies __size bytes from __source to
// __destination, work-partitioned across __group, then synchronizes the copy
// with __sync (a cuda::barrier). _Native_alignment is the statically known
// common alignment of the operands.
// On SM80+, a shared-memory destination fed from global memory uses cp.async;
// everywhere else the copy degrades to a synchronous strided memcpy.
template<std::size_t _Native_alignment, typename _Group, typename _Sync>
_LIBCUDACXX_INLINE_VISIBILITY
void inline __memcpy_async(_Group const & __group, char * __destination, char const * __source, std::size_t __size, _Sync & __sync) {
    // Tracks whether asynchronous (cp.async) copies were actually issued, so
    // __memcpy_async_synchronize knows whether it must wait/arrive.
    bool __is_async = false;

#if __CUDA_ARCH__ >= 800
    // cp.async only supports global -> shared transfers.
    __is_async = __isShared(__destination) && __isGlobal(__source);

    if (__is_async) {
        if (_Native_alignment < 4) {
            // Static alignment is too small for cp.async; probe the dynamic
            // alignment common to both addresses and the size.
            auto __source_address = reinterpret_cast<std::uintptr_t>(__source);
            auto __destination_address = reinterpret_cast<std::uintptr_t>(__destination);

            // __ffs gives the 1-based position of the lowest set bit, i.e. the
            // largest power-of-two dividing all three values:
            //   1 -> 1-byte, 2 -> 2-byte, 3 -> 4-byte, 4 -> 8-byte, >=5 -> 16+.
            auto _Alignment = __ffs(__source_address | __destination_address | __size);

            switch (_Alignment) {
                default: __is_async = __memcpy_async_impl<16>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 4: __is_async = __memcpy_async_impl<8>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 3: __is_async = __memcpy_async_impl<4>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 2:  // 2-byte alignment still has no cp.async path
                case 1: __is_async = __memcpy_async_impl<1>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
            }
        }
        else {
            // Static alignment suffices to pick the implementation directly.
            __is_async = __memcpy_async_impl<_Native_alignment>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size());
        }
    }
    else
#endif
    {
        // Host, pre-SM80, or unsupported address spaces: synchronous copy.
        __strided_memcpy<_Native_alignment>(__destination, __source, __size, __group.thread_rank(), __group.size());
    }

    __memcpy_async_synchronize(__sync, __is_async);
}
|
|
// Degenerate group used by the group-less memcpy_async overloads: exactly one
// participant with rank 0, whose sync() is a no-op.
struct __single_thread_group {
    _LIBCUDACXX_INLINE_VISIBILITY
    constexpr std::size_t thread_rank() const { return 0; }
    _LIBCUDACXX_INLINE_VISIBILITY
    constexpr std::size_t size() const { return 1; }
    _LIBCUDACXX_INLINE_VISIBILITY
    void sync() const {}
};
|
|
// Group-cooperative asynchronous copy of __size BYTES (not elements) of
// trivially copyable _Tp objects, synchronized through __barrier. The static
// alignment used for the copy is alignof(_Tp).
template<typename _Group, class _Tp, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
    // GCC <= 4.8 does not provide std::is_trivially_copyable, so the check is
    // skipped there.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<alignof(_Tp)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// As above, but the caller supplies an aligned_size_t carrying an alignment
// guarantee; the copy uses the larger of that guarantee and alignof(_Tp).
template<typename _Group, class _Tp, std::size_t _Alignment, thread_scope _Sco, typename _CompF, std::size_t _Larger_alignment = (alignof(_Tp) > _Alignment) ? alignof(_Tp) : _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
    // GCC <= 4.8 does not provide std::is_trivially_copyable, so the check is
    // skipped there.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// Single-thread convenience overload: forwards to the group overload with a
// one-thread group. _Size may be std::size_t or an aligned_size_t.
template<class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Tp * __destination, _Tp const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
|
|
// Untyped overload: no alignment information is available, so the copy
// assumes 1-byte alignment (the dynamic-alignment probe in __memcpy_async may
// still select a wider path on SM80+).
template<typename _Group, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// Untyped overload with a caller-supplied alignment guarantee carried by the
// aligned_size_t size argument.
template<typename _Group, std::size_t _Alignment, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// Untyped single-thread convenience overload: forwards to the group overload
// with a one-thread group. _Size may be std::size_t or an aligned_size_t.
template<typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(void * __destination, void const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA |
|
|
| #include "detail/__pragma_pop" |
|
|
| #endif |
|
|