| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
# error "CUDA synchronization primitives are only supported for sm_70 and up."
#endif
| |
|
#ifndef _CUDA_BARRIER
#define _CUDA_BARRIER

#include "atomic"
#include "cstddef"

#include "detail/__config"

#include "detail/__pragma_push"

#include "detail/libcxx/include/barrier"
| |
|
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
| |
|
| | template<std::size_t _Alignment> |
| | struct aligned_size_t { |
| | static constexpr std::size_t align = _Alignment; |
| | std::size_t value; |
| | __host__ __device__ |
| | explicit aligned_size_t(size_t __s) : value(__s) { } |
| | __host__ __device__ |
| | operator size_t() const { return value; } |
| | }; |
| |
|
// Scoped barrier: std::barrier extended with a thread_scope template
// parameter (_Sco) bounding which threads may participate, in the same way
// cuda::atomic scopes its operations. All behavior lives in the
// std::__barrier_base base class; this wrapper only adds the scope and the
// init() helpers.
template<thread_scope _Sco, class _CompletionF = std::__empty_completion>
class barrier : public std::__barrier_base<_CompletionF, _Sco> {
    // cuda::pipeline needs the non-blocking __try_wait for its own protocol.
    template<thread_scope>
    friend class pipeline;

    using std::__barrier_base<_CompletionF, _Sco>::__try_wait;

public:
    barrier() = default;

    // Synchronization object: neither copyable nor assignable.
    barrier(const barrier &) = delete;
    barrier & operator=(const barrier &) = delete;

    _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
    barrier(std::ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
        : std::__barrier_base<_CompletionF, _Sco>(__expected, __completion) {
    }

    // init(): in-place (re)initialization via placement new, for objects
    // whose constructor could not run (e.g. storage in __shared__ memory).
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected) {
        new (__b) barrier(__expected);
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, _CompletionF __completion) {
        new (__b) barrier(__expected, __completion);
    }
};
| |
|
| | struct __block_scope_barrier_base {}; |
| |
|
_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE
| | |
// Forward declaration: returns a pointer to the raw 64-bit word backing a
// block-scoped barrier (the mbarrier object on SM 8.0+), for use with
// inline PTX. Defined after the specialization it befriends.
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b);
| |
|
_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
| |
|
| | template<> |
| | class barrier<thread_scope_block, std::__empty_completion> : public __block_scope_barrier_base { |
| | using __barrier_base = std::__barrier_base<std::__empty_completion, (int)thread_scope_block>; |
| | __barrier_base __barrier; |
| |
|
| | __device__ |
| | friend inline std::uint64_t * device::_LIBCUDACXX_CUDA_ABI_NAMESPACE::barrier_native_handle(barrier<thread_scope_block> & b); |
| |
|
| | public: |
| | using arrival_token = typename __barrier_base::arrival_token; |
| |
|
| | private: |
| | struct __poll_tester { |
| | barrier const* __this; |
| | arrival_token __phase; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | __poll_tester(barrier const* __this_, arrival_token&& __phase_) |
| | : __this(__this_) |
| | , __phase(_CUDA_VSTD::move(__phase_)) |
| | {} |
| |
|
| | inline _LIBCUDACXX_INLINE_VISIBILITY |
| | bool operator()() const |
| | { |
| | return __this->__try_wait(__phase); |
| | } |
| | }; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool __try_wait(arrival_token __phase) const { |
| | #if __CUDA_ARCH__ >= 800 |
| | if (__isShared(&__barrier)) { |
| | int __ready = 0; |
| | asm volatile ("{\n\t" |
| | ".reg .pred p;\n\t" |
| | "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t" |
| | "selp.b32 %0, 1, 0, p;\n\t" |
| | "}" |
| | : "=r"(__ready) |
| | : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__phase) |
| | : "memory"); |
| | return __ready; |
| | } |
| | else |
| | #endif |
| | { |
| | return __barrier.__try_wait(std::move(__phase)); |
| | } |
| | } |
| |
|
| | template<thread_scope> |
| | friend class pipeline; |
| |
|
| | public: |
| | barrier() = default; |
| |
|
| | barrier(const barrier &) = delete; |
| | barrier & operator=(const barrier &) = delete; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | barrier(std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) { |
| | static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier<thread_scope_block>, __barrier), "fatal error: bad barrier layout"); |
| | init(this, __expected, __completion); |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | ~barrier() { |
| | #if __CUDA_ARCH__ >= 800 |
| | if (__isShared(&__barrier)) { |
| | asm volatile ("mbarrier.inval.shared.b64 [%0];" |
| | :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))) |
| | : "memory"); |
| | } |
| | #endif |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) { |
| | #if __CUDA_ARCH__ >= 800 |
| | if (__isShared(&__b->__barrier)) { |
| | asm volatile ("mbarrier.init.shared.b64 [%0], %1;" |
| | :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))), |
| | "r"(static_cast<std::uint32_t>(__expected)) |
| | : "memory"); |
| | } |
| | else |
| | #endif |
| | { |
| | new (&__b->__barrier) __barrier_base(__expected); |
| | } |
| | } |
| |
|
| | _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY |
| | arrival_token arrive(std::ptrdiff_t __update = 1) |
| | { |
| | #if __CUDA_ARCH__ |
| | if (__isShared(&__barrier)) { |
| | arrival_token __token; |
| | #if __CUDA_ARCH__ >= 800 |
| | if (__update > 1) { |
| | asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" |
| | : "=l"(__token) |
| | : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), |
| | "r"(static_cast<std::uint32_t>(__update - 1)) |
| | : "memory"); |
| | } |
| | asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" |
| | : "=l"(__token) |
| | : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))) |
| | : "memory"); |
| | #else |
| | unsigned int __activeA = __match_any_sync(__activemask(), __update); |
| | unsigned int __activeB = __match_any_sync(__activemask(), reinterpret_cast<std::uintptr_t>(&__barrier)); |
| | unsigned int __active = __activeA & __activeB; |
| | int __inc = __popc(__active) * __update; |
| |
|
| | unsigned __laneid; |
| | asm volatile ("mov.u32 %0, %laneid;" : "=r"(__laneid)); |
| | int __leader = __ffs(__active) - 1; |
| |
|
| | if(__leader == __laneid) |
| | { |
| | __token = __barrier.arrive(__inc); |
| | } |
| | __token = __shfl_sync(__active, __token, __leader); |
| | #endif |
| | return __token; |
| | } |
| | else |
| | #endif |
| | { |
| | return __barrier.arrive(__update); |
| | } |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void wait(arrival_token && __phase) const |
| | { |
| | _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase))); |
| | } |
| |
|
| | inline _LIBCUDACXX_INLINE_VISIBILITY |
| | void arrive_and_wait() |
| | { |
| | wait(arrive()); |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void arrive_and_drop() |
| | { |
| | #if __CUDA_ARCH__ >= 800 |
| | if (__isShared(&__barrier)) { |
| | asm volatile ("mbarrier.arrive_drop.shared.b64 _, [%0];" |
| | :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))) |
| | : "memory"); |
| | } |
| | else |
| | #endif |
| | { |
| | __barrier.arrive_and_drop(); |
| | } |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | static constexpr ptrdiff_t max() noexcept |
| | { |
| | return (1 << 20) - 1; |
| | } |
| | }; |
| |
|
_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE
| | |
// Definition of the native-handle accessor declared earlier: the barrier's
// storage sits at offset zero (enforced by a static_assert in the barrier
// constructor), so its address is exactly the 64-bit mbarrier word.
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b) {
    return reinterpret_cast<std::uint64_t *>(&b.__barrier);
}
| |
|
_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
| |
|
// Thread-scope specialization: a barrier for a single thread. Implemented
// by privately reusing the block-scope barrier and re-exporting its
// interface via using-declarations.
template<>
class barrier<thread_scope_thread, std::__empty_completion> : private barrier<thread_scope_block> {
    using __base = barrier<thread_scope_block>;

public:
    using __base::__base;

    // Forward init() to the base so in-place initialization works here too.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
        init(static_cast<__base *>(__b), __expected, __completion);
    }

    using __base::arrive;
    using __base::wait;
    using __base::arrive_and_wait;
    using __base::arrive_and_drop;
    using __base::max;
};
| |
|
| | template<std::size_t _Alignment> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | inline void __strided_memcpy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride = 1) { |
| | if (__stride == 1) { |
| | memcpy(__destination, __source, __total_size); |
| | } |
| | else { |
| | for (std::size_t __offset = __rank * _Alignment; __offset < __total_size; __offset += __stride * _Alignment) { |
| | memcpy(__destination + __offset, __source + __offset, _Alignment); |
| | } |
| | } |
| | } |
| |
|
#if __CUDA_ARCH__ >= 800
// Fallback copy strategy for alignments with no cp.async form. Copies
// synchronously via __strided_memcpy and returns false to report that no
// asynchronous copy was issued (so __memcpy_async_synchronize can skip the
// async completion handshake).
template<std::size_t _Alignment, bool _Large = (_Alignment > 16)>
struct __memcpy_async_impl {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        __strided_memcpy<_Alignment>(__destination, __source, __total_size, __rank, __stride);
        return false;
    }
};
| |
|
// 4-byte strategy: one cp.async.ca (cache-at-all-levels) instruction per
// 4-byte chunk, global -> shared. Returns true: completion must later be
// observed through __memcpy_async_synchronize.
template<>
struct __memcpy_async_impl<4, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        // Each rank issues every __stride-th 4-byte chunk.
        for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                   "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
| |
|
// 8-byte strategy: identical to the 4-byte path but with 8-byte
// cp.async.ca transfers.
template<>
struct __memcpy_async_impl<8, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                   "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
| |
|
// 16-byte strategy: uses cp.async.cg (cache-global) rather than .ca for the
// widest transfer size.
template<>
struct __memcpy_async_impl<16, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
            asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                   "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
| |
|
// Alignments greater than 16 have no wider cp.async form: reuse the
// 16-byte strategy.
template<std::size_t _Alignment>
struct __memcpy_async_impl<_Alignment, true> : public __memcpy_async_impl<16, false> { };
#endif
| |
|
// Makes a preceding memcpy_async observable through __barrier.
// __is_async reports whether any cp.async traffic was issued. If so, and
// the barrier is a shared-memory mbarrier (block-or-wider scope with the
// empty completion), the outstanding copies are handed to the hardware
// barrier via cp.async.mbarrier.arrive; otherwise they are drained
// synchronously with cp.async.wait_all. When nothing asynchronous was
// issued (or pre-SM80) this is a no-op and the plain barrier protocol is
// already sufficient.
template<thread_scope _Sco, typename _CompF, bool _Is_mbarrier = (_Sco >= thread_scope_block) && std::is_same<_CompF, std::__empty_completion>::value>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __memcpy_async_synchronize(barrier<_Sco, _CompF> & __barrier, bool __is_async) {
#if __CUDA_ARCH__ >= 800
    if (__is_async) {
        if (_Is_mbarrier && __isShared(&__barrier)) {
            asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
        else {
            asm volatile ("cp.async.wait_all;"
                ::: "memory");
        }
    }
#endif
}
| |
|
// Core memcpy_async over raw char pointers. _Native_alignment is the
// alignment the caller's types guarantee; the copy is divided across
// __group, and completion is reported to __sync via
// __memcpy_async_synchronize.
template<std::size_t _Native_alignment, typename _Group, typename _Sync>
_LIBCUDACXX_INLINE_VISIBILITY
void inline __memcpy_async(_Group const & __group, char * __destination, char const * __source, std::size_t __size, _Sync & __sync) {
    bool __is_async = false;

#if __CUDA_ARCH__ >= 800
    // cp.async hardware copies only go global -> shared.
    __is_async = __isShared(__destination) && __isGlobal(__source);

    if (__is_async) {
        if (_Native_alignment < 4) {
            auto __source_address = reinterpret_cast<std::uintptr_t>(__source);
            auto __destination_address = reinterpret_cast<std::uintptr_t>(__destination);

            // Runtime alignment probe: __ffs of (addresses | size) gives the
            // 1-based index of the lowest set bit, i.e. the largest power of
            // two dividing both addresses and the size. Index 1 -> 1-byte,
            // 2 -> 2-byte, 3 -> 4-byte, 4 -> 8-byte, 5+ -> at least 16-byte.
            auto _Alignment = __ffs(__source_address | __destination_address | __size);

            switch (_Alignment) {
                default: __is_async = __memcpy_async_impl<16>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 4: __is_async = __memcpy_async_impl<8>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 3: __is_async = __memcpy_async_impl<4>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 2: // 2-byte alignment has no cp.async form; use the sync fallback
                case 1: __is_async = __memcpy_async_impl<1>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
            }
        }
        else {
            // Static alignment is already wide enough; dispatch on it directly.
            __is_async = __memcpy_async_impl<_Native_alignment>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size());
        }
    }
    else
#endif
    {
        __strided_memcpy<_Native_alignment>(__destination, __source, __size, __group.thread_rank(), __group.size());
    }

    __memcpy_async_synchronize(__sync, __is_async);
}
| |
|
| | struct __single_thread_group { |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void sync() const {} |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | constexpr std::size_t size() const { return 1; }; |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | constexpr std::size_t thread_rank() const { return 0; }; |
| | }; |
| |
|
// Group copy of __size BYTES (not an element count) of trivially copyable
// _Tp objects, with completion reported through __barrier.
template<typename _Group, class _Tp, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
    // GCC <= 4.8 lacks std::is_trivially_copyable; skip the check there
    // rather than fail to compile.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    // alignof(_Tp) is the strongest alignment the pointers guarantee.
    __memcpy_async<alignof(_Tp)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
| |
|
// Group copy with an aligned_size_t byte count: the caller promises
// _Alignment, and _Larger_alignment picks the stronger of that promise and
// alignof(_Tp) for the dispatch.
template<typename _Group, class _Tp, std::size_t _Alignment, thread_scope _Sco, typename _CompF, std::size_t _Larger_alignment = (alignof(_Tp) > _Alignment) ? alignof(_Tp) : _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
    // GCC <= 4.8 lacks std::is_trivially_copyable; skip the check there
    // rather than fail to compile.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
| |
|
// Single-thread overload: the calling thread performs the whole copy.
// _Size may be std::size_t or aligned_size_t<N>.
template<class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Tp * __destination, _Tp const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
| |
|
// void* overload: no type information, so only 1-byte alignment can be
// assumed for the dispatch.
template<typename _Group, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
| |
|
// void* overload with aligned_size_t: the caller's _Alignment promise is
// the only alignment information available.
template<typename _Group, std::size_t _Alignment, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
| |
|
// Single-thread void* overload: forwards through __single_thread_group.
// _Size may be std::size_t or aligned_size_t<N>.
template<typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(void * __destination, void const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
| |
|
_LIBCUDACXX_END_NAMESPACE_CUDA

#include "detail/__pragma_pop"

#endif // _CUDA_BARRIER
| |
|