| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 |
| # error "CUDA synchronization primitives are only supported for sm_70 and up." |
| #endif |
|
|
| #ifndef _CUDA_BARRIER |
| #define _CUDA_BARRIER |
|
|
| #include "atomic" |
| #include "cstddef" |
|
|
| #include "detail/__config" |
|
|
| #include "detail/__pragma_push" |
|
|
| #include "detail/libcxx/include/barrier" |
|
|
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
|
|
// Size wrapper carrying a compile-time alignment guarantee of _Alignment bytes.
// The memcpy_async overloads below use the wrapped alignment to select a wider
// native copy path (see the aligned_size_t overloads of memcpy_async).
// Implicitly converts back to std::size_t so it can be used wherever a plain
// byte count is expected.
//
// Fix: the original mixed unqualified `size_t` and `std::size_t` in the same
// struct; uses are now consistently `std::size_t`.
template<std::size_t _Alignment>
struct aligned_size_t {
    static constexpr std::size_t align = _Alignment;
    std::size_t value;  // the wrapped byte count
    __host__ __device__
    explicit aligned_size_t(std::size_t __s) : value(__s) { }
    __host__ __device__
    operator std::size_t() const { return value; }
};
|
|
// cuda::barrier: a std::barrier-like synchronization object with an explicit
// thread scope (_Sco) and optional completion function. All arrival/wait logic
// is inherited from the libcu++ software implementation std::__barrier_base.
template<thread_scope _Sco, class _CompletionF = std::__empty_completion>
class barrier : public std::__barrier_base<_CompletionF, _Sco> {
    // cuda::pipeline is granted access to the barrier internals (__try_wait).
    template<thread_scope>
    friend class pipeline;

    using std::__barrier_base<_CompletionF, _Sco>::__try_wait;

public:
    // Trivially default-constructible; use the init() friends below to set the
    // expected arrival count afterwards (placement-new based initialization).
    barrier() = default;

    // Non-copyable, like std::barrier.
    barrier(const barrier &) = delete;
    barrier & operator=(const barrier &) = delete;

    // Construct with the number of expected arrivals per phase and an optional
    // completion functor invoked when a phase completes.
    _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_CONSTEXPR
    barrier(std::ptrdiff_t __expected, _CompletionF __completion = _CompletionF())
        : std::__barrier_base<_CompletionF, _Sco>(__expected, __completion) {
    }

    // In-place initialization of a default-constructed barrier.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected) {
        new (__b) barrier(__expected);
    }

    // In-place initialization with an explicit completion functor.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, _CompletionF __completion) {
        new (__b) barrier(__expected, __completion);
    }
};
|
|
// Empty tag type; the block-scope barrier specialization below derives from it
// so block-scope barriers can be recognized by type elsewhere in the library.
struct __block_scope_barrier_base {};
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA |
| |
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE |
| |
// Forward declaration: returns a pointer to the raw 64-bit word backing a
// block-scope barrier, for use with hand-written PTX (definition below).
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b);
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE |
|
|
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
|
|
// Block-scope barrier with the default (empty) completion. On SM80+ devices,
// when the object lives in shared memory, it is backed by the hardware
// mbarrier and driven with inline PTX; in all other cases it falls back to the
// software std::__barrier_base. The sole data member must sit at offset zero
// (asserted in the constructor) so the object aliases the native 64-bit
// mbarrier word exposed by barrier_native_handle().
template<>
class barrier<thread_scope_block, std::__empty_completion> : public __block_scope_barrier_base {
    using __barrier_base = std::__barrier_base<std::__empty_completion, (int)thread_scope_block>;
    __barrier_base __barrier;

    // Grants cuda::device::barrier_native_handle access to the raw storage.
    __device__
    friend inline std::uint64_t * device::_LIBCUDACXX_CUDA_ABI_NAMESPACE::barrier_native_handle(barrier<thread_scope_block> & b);

public:
    using arrival_token = typename __barrier_base::arrival_token;

private:
    // Callable polled by __libcpp_thread_poll_with_backoff in wait(): returns
    // true once the captured phase token has completed.
    struct __poll_tester {
        barrier const* __this;
        arrival_token __phase;

        _LIBCUDACXX_INLINE_VISIBILITY
        __poll_tester(barrier const* __this_, arrival_token&& __phase_)
          : __this(__this_)
          , __phase(_CUDA_VSTD::move(__phase_))
        {}

        inline _LIBCUDACXX_INLINE_VISIBILITY
        bool operator()() const
        {
            return __this->__try_wait(__phase);
        }
    };

    // Non-blocking test for whether the phase identified by __phase is done.
    _LIBCUDACXX_INLINE_VISIBILITY
    bool __try_wait(arrival_token __phase) const {
#if __CUDA_ARCH__ >= 800
        // SM80+ and in shared memory: query the hardware mbarrier directly.
        if (__isShared(&__barrier)) {
            int __ready = 0;
            asm volatile ("{\n\t"
                    ".reg .pred p;\n\t"
                    "mbarrier.test_wait.shared.b64 p, [%1], %2;\n\t"
                    "selp.b32 %0, 1, 0, p;\n\t"
                    "}"
                : "=r"(__ready)
                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))), "l"(__phase)
                : "memory");
            return __ready;
        }
        else
#endif
        {
            // Host, pre-SM80, or non-shared object: software fallback.
            return __barrier.__try_wait(std::move(__phase));
        }
    }

    template<thread_scope>
    friend class pipeline;

public:
    // Trivially default-constructible; use the friend init() to initialize.
    barrier() = default;

    barrier(const barrier &) = delete;
    barrier & operator=(const barrier &) = delete;

    _LIBCUDACXX_INLINE_VISIBILITY
    barrier(std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
        // init() and barrier_native_handle() rely on __barrier being the sole
        // member at offset zero of this type.
        static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier<thread_scope_block>, __barrier), "fatal error: bad barrier layout");
        init(this, __expected, __completion);
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    ~barrier() {
#if __CUDA_ARCH__ >= 800
        // Invalidate the hardware mbarrier so stale state cannot be observed
        // if this shared-memory slot is later reused.
        if (__isShared(&__barrier)) {
            asm volatile ("mbarrier.inval.shared.b64 [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
#endif
    }

    // In-place initialization with the expected arrival count. The completion
    // parameter exists for interface symmetry; it carries no state here.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
#if __CUDA_ARCH__ >= 800
        // SM80+ shared memory: initialize the hardware mbarrier with the
        // expected arrival count.
        if (__isShared(&__b->__barrier)) {
            asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__b->__barrier))),
                    "r"(static_cast<std::uint32_t>(__expected))
                : "memory");
        }
        else
#endif
        {
            // Fallback: construct the software barrier in place.
            new (&__b->__barrier) __barrier_base(__expected);
        }
    }

    // Registers __update arrivals on the current phase and returns a token
    // identifying that phase, to be consumed by wait().
    _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_INLINE_VISIBILITY
    arrival_token arrive(std::ptrdiff_t __update = 1)
    {
#if __CUDA_ARCH__
        if (__isShared(&__barrier)) {
            arrival_token __token;
#if __CUDA_ARCH__ >= 800
            // SM80+: account for __update - 1 arrivals without completing the
            // phase, then perform the final (potentially completing) arrival.
            if (__update > 1) {
                asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
                    : "=l"(__token)
                    : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier))),
                        "r"(static_cast<std::uint32_t>(__update - 1))
                    : "memory");
            }
            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
                : "=l"(__token)
                : "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
#else
            // Pre-SM80: aggregate arrivals across the currently-active lanes
            // that target this same barrier with this same __update, so only
            // one lane performs the (comparatively expensive) arrive.
            unsigned int __activeA = __match_any_sync(__activemask(), __update);
            unsigned int __activeB = __match_any_sync(__activemask(), reinterpret_cast<std::uintptr_t>(&__barrier));
            unsigned int __active = __activeA & __activeB;
            int __inc = __popc(__active) * __update;

            unsigned __laneid;
            asm volatile ("mov.u32 %0, %laneid;" : "=r"(__laneid));
            int __leader = __ffs(__active) - 1;  // lowest active lane leads

            // Leader performs the combined arrival for the whole group...
            if(__leader == __laneid)
            {
                __token = __barrier.arrive(__inc);
            }
            // ...and broadcasts the resulting token to the other lanes.
            __token = __shfl_sync(__active, __token, __leader);
#endif
            return __token;
        }
        else
#endif
        {
            // Host code or non-shared object: software barrier.
            return __barrier.arrive(__update);
        }
    }

    // Blocks (polling with backoff) until the phase named by __phase completes.
    _LIBCUDACXX_INLINE_VISIBILITY
    void wait(arrival_token && __phase) const
    {
        _CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(this, _CUDA_VSTD::move(__phase)));
    }

    inline _LIBCUDACXX_INLINE_VISIBILITY
    void arrive_and_wait()
    {
        wait(arrive());
    }

    // Arrives once and permanently decrements the expected count used by
    // subsequent phases.
    _LIBCUDACXX_INLINE_VISIBILITY
    void arrive_and_drop()
    {
#if __CUDA_ARCH__ >= 800
        if (__isShared(&__barrier)) {
            asm volatile ("mbarrier.arrive_drop.shared.b64 _, [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
        else
#endif
        {
            __barrier.arrive_and_drop();
        }
    }

    // Maximum supported expected-arrival count. (1 << 20) - 1 appears to match
    // the hardware mbarrier count limit — NOTE(review): confirm against the
    // PTX ISA mbarrier documentation.
    _LIBCUDACXX_INLINE_VISIBILITY
    static constexpr ptrdiff_t max() noexcept
    {
        return (1 << 20) - 1;
    }
};
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA |
| |
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE |
| |
// Exposes the storage of a block-scope barrier as a raw 64-bit word (the
// hardware mbarrier word when the object lives in shared memory on SM80+),
// allowing interoperation with hand-written PTX.
__device__
inline std::uint64_t * barrier_native_handle(barrier<thread_scope_block> & b) {
    return reinterpret_cast<std::uint64_t *>(&b.__barrier);
}
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE |
|
|
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
|
|
// Thread-scope barrier: privately reuses the block-scope implementation and
// re-exposes only its public API, so a single-thread barrier shares the same
// storage/layout and code paths as the block-scope one.
template<>
class barrier<thread_scope_thread, std::__empty_completion> : private barrier<thread_scope_block> {
    using __base = barrier<thread_scope_block>;

public:
    using __base::__base;

    // Forwards to the block-scope init(), treating this object as its base.
    _LIBCUDACXX_INLINE_VISIBILITY
    friend void init(barrier * __b, std::ptrdiff_t __expected, std::__empty_completion __completion = std::__empty_completion()) {
        init(static_cast<__base *>(__b), __expected, __completion);
    }

    using __base::arrive;
    using __base::wait;
    using __base::arrive_and_wait;
    using __base::arrive_and_drop;
    using __base::max;
};
|
|
// Cooperative synchronous copy of __total_size bytes, partitioned in
// _Alignment-byte chunks across __stride participants: participant __rank
// copies chunks __rank, __rank + __stride, __rank + 2*__stride, ...
// With a single participant (__stride == 1) the whole range is copied in one
// memcpy call.
template<std::size_t _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __strided_memcpy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride = 1) {
    // Fast path: a lone participant copies everything at once.
    if (__stride == 1) {
        memcpy(__destination, __source, __total_size);
        return;
    }
    // Strided path: each participant handles every __stride-th chunk.
    std::size_t __i = __rank * _Alignment;
    while (__i < __total_size) {
        memcpy(__destination + __i, __source + __i, _Alignment);
        __i += __stride * _Alignment;
    }
}
|
|
| #if __CUDA_ARCH__ >= 800 |
// Primary template: alignments with no cp.async instruction (< 4 bytes) fall
// back to a synchronous strided copy. Returns false to signal that no
// asynchronous copy was issued.
template<std::size_t _Alignment, bool _Large = (_Alignment > 16)>
struct __memcpy_async_impl {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        __strided_memcpy<_Alignment>(__destination, __source, __total_size, __rank, __stride);
        return false;
    }
};
|
|
// 4-byte-aligned path: issues one cp.async.ca (cache-all) 4-byte asynchronous
// global-to-shared copy per chunk, strided across participants. Returns true
// to signal that asynchronous copies are in flight.
template<>
struct __memcpy_async_impl<4, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                    "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
|
|
// 8-byte-aligned path: cp.async.ca 8-byte asynchronous global-to-shared
// copies, strided across participants. Returns true (async copies in flight).
template<>
struct __memcpy_async_impl<8, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                    "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
|
|
// 16-byte-aligned path: cp.async.cg (cache-global, L1-bypassing) 16-byte
// asynchronous copies, strided across participants. Returns true (async
// copies in flight).
template<>
struct __memcpy_async_impl<16, false> {
    __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
        for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
            asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                    "l"(__source + __offset)
                : "memory");
        }
        return true;
    }
};
|
|
// Alignments greater than 16 bytes reuse the 16-byte cp.async path (16 is the
// widest cp.async copy size used here).
template<std::size_t _Alignment>
struct __memcpy_async_impl<_Alignment, true> : public __memcpy_async_impl<16, false> { };
| #endif |
|
|
// Completes a (possibly asynchronous) memcpy_async against a barrier.
// _Is_mbarrier is true when the barrier can be backed by a hardware mbarrier:
// at least block scope with the default empty completion.
// If cp.async copies were issued (__is_async) and the barrier is a shared
// mbarrier, their completion is chained to the barrier via
// cp.async.mbarrier.arrive; otherwise this thread waits for all of its
// outstanding cp.async operations. On pre-SM80 (or host) compilations this is
// a no-op — the copy was performed synchronously.
template<thread_scope _Sco, typename _CompF, bool _Is_mbarrier = (_Sco >= thread_scope_block) && std::is_same<_CompF, std::__empty_completion>::value>
_LIBCUDACXX_INLINE_VISIBILITY
inline void __memcpy_async_synchronize(barrier<_Sco, _CompF> & __barrier, bool __is_async) {
#if __CUDA_ARCH__ >= 800
    if (__is_async) {
        if (_Is_mbarrier && __isShared(&__barrier)) {
            // Hand the async copies' completion to the hardware mbarrier.
            asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
                :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(&__barrier)))
                : "memory");
        }
        else {
            // No mbarrier available: block until all cp.async ops complete.
            asm volatile ("cp.async.wait_all;"
                ::: "memory");
        }
    }
#endif
}
|
|
// Core memcpy_async implementation: copies __size bytes from __source to
// __destination, work-partitioned across __group, then synchronizes the copy
// with __sync (a cuda::barrier). _Native_alignment is the statically known
// common alignment of the operands.
// On SM80+, a shared-memory destination fed from global memory uses cp.async;
// everywhere else the copy degrades to a synchronous strided memcpy.
template<std::size_t _Native_alignment, typename _Group, typename _Sync>
_LIBCUDACXX_INLINE_VISIBILITY
void inline __memcpy_async(_Group const & __group, char * __destination, char const * __source, std::size_t __size, _Sync & __sync) {
    // Tracks whether asynchronous (cp.async) copies were actually issued, so
    // __memcpy_async_synchronize knows whether it must wait/arrive.
    bool __is_async = false;

#if __CUDA_ARCH__ >= 800
    // cp.async only supports global -> shared transfers.
    __is_async = __isShared(__destination) && __isGlobal(__source);

    if (__is_async) {
        if (_Native_alignment < 4) {
            // Static alignment is too small for cp.async; probe the dynamic
            // alignment common to both addresses and the size.
            auto __source_address = reinterpret_cast<std::uintptr_t>(__source);
            auto __destination_address = reinterpret_cast<std::uintptr_t>(__destination);

            // __ffs gives the 1-based position of the lowest set bit, i.e. the
            // largest power-of-two dividing all three values:
            //   1 -> 1-byte, 2 -> 2-byte, 3 -> 4-byte, 4 -> 8-byte, >=5 -> 16+.
            auto _Alignment = __ffs(__source_address | __destination_address | __size);

            switch (_Alignment) {
                default: __is_async = __memcpy_async_impl<16>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 4: __is_async = __memcpy_async_impl<8>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 3: __is_async = __memcpy_async_impl<4>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
                case 2:  // 2-byte alignment still has no cp.async path
                case 1: __is_async = __memcpy_async_impl<1>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size()); break;
            }
        }
        else {
            // Static alignment suffices to pick the implementation directly.
            __is_async = __memcpy_async_impl<_Native_alignment>::__copy(__destination, __source, __size, __group.thread_rank(), __group.size());
        }
    }
    else
#endif
    {
        // Host, pre-SM80, or unsupported address spaces: synchronous copy.
        __strided_memcpy<_Native_alignment>(__destination, __source, __size, __group.thread_rank(), __group.size());
    }

    __memcpy_async_synchronize(__sync, __is_async);
}
|
|
// Degenerate group used by the group-less memcpy_async overloads: exactly one
// participant with rank 0, whose sync() is a no-op.
struct __single_thread_group {
    _LIBCUDACXX_INLINE_VISIBILITY
    constexpr std::size_t thread_rank() const { return 0; }
    _LIBCUDACXX_INLINE_VISIBILITY
    constexpr std::size_t size() const { return 1; }
    _LIBCUDACXX_INLINE_VISIBILITY
    void sync() const {}
};
|
|
// Group-cooperative asynchronous copy of __size BYTES (not elements) of
// trivially copyable _Tp objects, synchronized through __barrier. The static
// alignment used for the copy is alignof(_Tp).
template<typename _Group, class _Tp, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
    // GCC <= 4.8 does not provide std::is_trivially_copyable, so the check is
    // skipped there.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<alignof(_Tp)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// As above, but the caller supplies an aligned_size_t carrying an alignment
// guarantee; the copy uses the larger of that guarantee and alignof(_Tp).
template<typename _Group, class _Tp, std::size_t _Alignment, thread_scope _Sco, typename _CompF, std::size_t _Larger_alignment = (alignof(_Tp) > _Alignment) ? alignof(_Tp) : _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Tp * __destination, _Tp const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
    // GCC <= 4.8 does not provide std::is_trivially_copyable, so the check is
    // skipped there.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(std::is_trivially_copyable<_Tp>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// Single-thread convenience overload: forwards to the group overload with a
// one-thread group. _Size may be std::size_t or an aligned_size_t.
template<class _Tp, typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Tp * __destination, _Tp const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
|
|
// Untyped overload: no alignment information is available, so the copy
// assumes 1-byte alignment (the dynamic-alignment probe in __memcpy_async may
// still select a wider path on SM80+).
template<typename _Group, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// Untyped overload with a caller-supplied alignment guarantee carried by the
// aligned_size_t size argument.
template<typename _Group, std::size_t _Alignment, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, barrier<_Sco, _CompF> & __barrier) {
    __memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __barrier);
}
|
|
// Untyped single-thread convenience overload: forwards to the group overload
// with a one-thread group. _Size may be std::size_t or an aligned_size_t.
template<typename _Size, thread_scope _Sco, typename _CompF>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(void * __destination, void const * __source, _Size __size, barrier<_Sco, _CompF> & __barrier) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __barrier);
}
|
|
| _LIBCUDACXX_END_NAMESPACE_CUDA |
|
|
| #include "detail/__pragma_pop" |
|
|
| #endif |
|
|