| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | #ifndef _CUDA_PIPELINE |
| | #define _CUDA_PIPELINE |
| |
|
| | #include <nv/target> |
| | #include <cuda/barrier> |
| | #include <cuda/atomic> |
| |
|
| | _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
| |
|
| | template<thread_scope _Scope> |
| | class pipeline; |
| |
|
| | enum class pipeline_role { |
| | producer, |
| | consumer |
| | }; |
| |
|
| | template<thread_scope _Scope> |
| | struct __pipeline_stage { |
| | barrier<_Scope> __produced; |
| | barrier<_Scope> __consumed; |
| | }; |
| |
|
| | template<thread_scope _Scope, uint8_t _Stages_count> |
| | class pipeline_shared_state { |
| | public: |
| | pipeline_shared_state() = default; |
| | pipeline_shared_state(const pipeline_shared_state &) = delete; |
| | pipeline_shared_state(pipeline_shared_state &&) = delete; |
| | pipeline_shared_state & operator=(pipeline_shared_state &&) = delete; |
| | pipeline_shared_state & operator=(const pipeline_shared_state &) = delete; |
| |
|
| | private: |
| | __pipeline_stage<_Scope> __stages[_Stages_count]; |
| | atomic<uint32_t, _Scope> __refcount; |
| |
|
| | template<thread_scope _Pipeline_scope> |
| | friend class pipeline; |
| |
|
| | template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count> |
| | friend _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); |
| |
|
| | template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count> |
| | friend _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count); |
| | |
| | template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count> |
| | friend _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role); |
| | }; |
| |
|
| | struct __pipeline_asm_helper { |
| | __device__ |
| | static inline uint32_t __lane_id() |
| | { |
| | uint32_t __lane_id; |
| | asm volatile ("mov.u32 %0, %%laneid;" : "=r"(__lane_id)); |
| | return __lane_id; |
| | } |
| | }; |
| |
|
| | template<thread_scope _Scope> |
| | class pipeline { |
| | public: |
| | pipeline(pipeline &&) = default; |
| | pipeline(const pipeline &) = delete; |
| | pipeline & operator=(pipeline &&) = delete; |
| | pipeline & operator=(const pipeline &) = delete; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | ~pipeline() |
| | { |
| | if (__active) { |
| | (void)quit(); |
| | } |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool quit() |
| | { |
| | bool __elected; |
| | uint32_t __sub_count; |
| | NV_IF_TARGET(NV_IS_DEVICE, |
| | const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount())); |
| | const uint32_t __elected_id = __ffs(__match_mask) - 1; |
| | __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); |
| | __sub_count = __popc(__match_mask); |
| | , |
| | __elected = true; |
| | __sub_count = 1; |
| | ) |
| | bool __released = false; |
| | if (__elected) { |
| | const uint32_t __old = __shared_state_get_refcount()->fetch_sub(__sub_count); |
| | const bool __last = (__old == __sub_count); |
| | if (__last) { |
| | for (uint8_t __stage = 0; __stage < __stages_count; ++__stage) { |
| | __shared_state_get_stage(__stage)->__produced.~barrier(); |
| | if (__partitioned) { |
| | __shared_state_get_stage(__stage)->__consumed.~barrier(); |
| | } |
| | } |
| | __released = true; |
| | } |
| | } |
| | __active = false; |
| | return __released; |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void producer_acquire() |
| | { |
| | if (__partitioned) { |
| | barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__consumed; |
| | (void)_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(__stage_barrier, __consumed_phase_parity)); |
| | } |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void producer_commit() |
| | { |
| | barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__produced; |
| | __memcpy_async_synchronize(__stage_barrier, true); |
| | (void)__stage_barrier.arrive(); |
| | if (++__head == __stages_count) { |
| | __head = 0; |
| | if (__partitioned) { |
| | __consumed_phase_parity = !__consumed_phase_parity; |
| | } |
| | } |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void consumer_wait() |
| | { |
| | barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced; |
| | (void)_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(__stage_barrier, __produced_phase_parity)); |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void consumer_release() |
| | { |
| | if (__partitioned) { |
| | (void)__shared_state_get_stage(__tail)->__consumed.arrive(); |
| | } |
| | if (++__tail == __stages_count) { |
| | __tail = 0; |
| | __produced_phase_parity = !__produced_phase_parity; |
| | } |
| | } |
| |
|
| | template<class _Rep, class _Period> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration) |
| | { |
| | barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced; |
| | return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( |
| | __poll_tester(__stage_barrier, __produced_phase_parity), |
| | _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration) |
| | ); |
| | } |
| |
|
| | template<class _Clock, class _Duration> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point) |
| | { |
| | return consumer_wait_for(__time_point - _Clock::now()); |
| | } |
| |
|
| | private: |
| | uint8_t __head : 8; |
| | uint8_t __tail : 8; |
| | const uint8_t __stages_count : 8; |
| | bool __consumed_phase_parity : 1; |
| | bool __produced_phase_parity : 1; |
| | bool __active : 1; |
| | const bool __partitioned : 1; |
| | char * const __shared_state; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline(char * __shared_state, uint8_t __stages_count, bool __partitioned) |
| | : __head(0) |
| | , __tail(0) |
| | , __stages_count(__stages_count) |
| | , __consumed_phase_parity(true) |
| | , __produced_phase_parity(false) |
| | , __active(true) |
| | , __partitioned(__partitioned) |
| | , __shared_state(__shared_state) |
| | {} |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | static bool __barrier_try_wait_parity_impl(barrier<_Scope> & __barrier, bool __phase_parity) |
| | { |
| | typename barrier<_Scope>::arrival_token __synthesized_token = (__phase_parity ? 1ull : 0ull) << 63; |
| | return __barrier.__try_wait(_CUDA_VSTD::move(__synthesized_token)); |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | static bool __barrier_try_wait_parity(barrier<_Scope> & __barrier, bool __phase_parity) |
| | { |
| | return __barrier_try_wait_parity_impl(__barrier, __phase_parity); |
| | } |
| |
|
| | struct __poll_tester { |
| | barrier<_Scope> & __barrier; |
| | bool __phase_parity; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | __poll_tester(barrier<_Scope> & __barrier, bool __phase_parity) |
| | : __barrier(__barrier) |
| | , __phase_parity(__phase_parity) |
| | {} |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool operator()() const |
| | { |
| | return __barrier_try_wait_parity(__barrier, __phase_parity); |
| | } |
| | }; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | __pipeline_stage<_Scope> * __shared_state_get_stage(uint8_t __stage) |
| | { |
| | ptrdiff_t __stage_offset = __stage * sizeof(__pipeline_stage<_Scope>); |
| | return reinterpret_cast<__pipeline_stage<_Scope>*>(__shared_state + __stage_offset); |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | atomic<uint32_t, _Scope> * __shared_state_get_refcount() |
| | { |
| | ptrdiff_t __refcount_offset = __stages_count * sizeof(__pipeline_stage<_Scope>); |
| | return reinterpret_cast<atomic<uint32_t, _Scope>*>(__shared_state + __refcount_offset); |
| | } |
| |
|
| | template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count> |
| | friend _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); |
| |
|
| | template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count> |
| | friend _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count); |
| | |
| | template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count> |
| | friend _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role); |
| | }; |
| |
|
| | template<> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | inline bool pipeline<thread_scope_block>::__barrier_try_wait_parity(barrier<thread_scope_block> & __barrier, bool __phase_parity) |
| | { |
| | NV_IF_TARGET(NV_PROVIDES_SM_80, |
| | if (__isShared(&__barrier)) { |
| | uint64_t * __mbarrier = device::barrier_native_handle(__barrier); |
| | uint16_t __wait_complete; |
| |
|
| | asm volatile ("{" |
| | " .reg .pred %p;" |
| | " mbarrier.test_wait.parity.shared.b64 %p, [%1], %2;" |
| | " selp.u16 %0, 1, 0, %p;" |
| | "}" |
| | : "=h"(__wait_complete) |
| | : "r"(static_cast<uint32_t>(__cvta_generic_to_shared(__mbarrier))), "r"(static_cast<uint32_t>(__phase_parity)) |
| | : "memory"); |
| |
|
| | return bool(__wait_complete); |
| | } else |
| | { |
| | return __barrier_try_wait_parity_impl(__barrier, __phase_parity); |
| | } |
| | , |
| | return __barrier_try_wait_parity_impl(__barrier, __phase_parity); |
| | ) |
| | } |
| |
|
| | template<class _Group, thread_scope _Scope, uint8_t _Stages_count> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state) |
| | { |
| | const uint32_t __group_size = static_cast<uint32_t>(__group.size()); |
| | const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank()); |
| |
|
| | if (__thread_rank == 0) { |
| | for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { |
| | init(&__shared_state->__stages[__stage].__produced, __group_size); |
| | } |
| | __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); |
| | } |
| | __group.sync(); |
| |
|
| | return pipeline<_Scope>(reinterpret_cast<char*>(__shared_state->__stages), _Stages_count, false); |
| | } |
| |
|
| | template<class _Group, thread_scope _Scope, uint8_t _Stages_count> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, size_t __producer_count) |
| | { |
| | const uint32_t __group_size = static_cast<uint32_t>(__group.size()); |
| | const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank()); |
| |
|
| | if (__thread_rank == 0) { |
| | const size_t __consumer_count = __group_size - __producer_count; |
| | for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { |
| | init(&__shared_state->__stages[__stage].__consumed, __consumer_count); |
| | init(&__shared_state->__stages[__stage].__produced, __producer_count); |
| | } |
| | __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); |
| | } |
| | __group.sync(); |
| |
|
| | return pipeline<_Scope>(reinterpret_cast<char*>(__shared_state->__stages), _Stages_count, true); |
| | } |
| | |
| | template<class _Group, thread_scope _Scope, uint8_t _Stages_count> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, pipeline_role __role) |
| | { |
| | const uint32_t __group_size = static_cast<uint32_t>(__group.size()); |
| | const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank()); |
| |
|
| | if (__thread_rank == 0) { |
| | __shared_state->__refcount.store(0, std::memory_order_relaxed); |
| | } |
| | __group.sync(); |
| |
|
| | if (__role == pipeline_role::producer) { |
| | bool __elected; |
| | uint32_t __add_count; |
| | NV_IF_TARGET(NV_IS_DEVICE, |
| | const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount)); |
| | const uint32_t __elected_id = __ffs(__match_mask) - 1; |
| | __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); |
| | __add_count = __popc(__match_mask); |
| | , |
| | __elected = true; |
| | __add_count = 1; |
| | ) |
| | if (__elected) { |
| | (void)__shared_state->__refcount.fetch_add(__add_count, std::memory_order_relaxed); |
| | } |
| | } |
| | __group.sync(); |
| |
|
| | if (__thread_rank == 0) { |
| | const uint32_t __producer_count = __shared_state->__refcount.load(std::memory_order_relaxed); |
| | const uint32_t __consumer_count = __group_size - __producer_count; |
| | for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { |
| | init(&__shared_state->__stages[__stage].__consumed, __consumer_count); |
| | init(&__shared_state->__stages[__stage].__produced, __producer_count); |
| | } |
| | __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); |
| | } |
| | __group.sync(); |
| |
|
| | return pipeline<_Scope>(reinterpret_cast<char*>(__shared_state->__stages), _Stages_count, true); |
| | } |
| |
|
| | _LIBCUDACXX_END_NAMESPACE_CUDA |
| |
|
| | _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE |
| |
|
| | template<uint8_t _Prior> |
| | __device__ |
| | void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline); |
| |
|
| | __device__ |
| | inline void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline, uint8_t __prior); |
| |
|
| | _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE |
| |
|
| | _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
| |
|
| | template<> |
| | class pipeline<thread_scope_thread> { |
| | public: |
| | pipeline(pipeline &&) = default; |
| | pipeline(const pipeline &) = delete; |
| | pipeline & operator=(pipeline &&) = delete; |
| | pipeline & operator=(const pipeline &) = delete; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | ~pipeline() {} |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool quit() |
| | { |
| | return true; |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void producer_acquire() {} |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void producer_commit() |
| | { |
| | NV_IF_TARGET(NV_PROVIDES_SM_80, |
| | asm volatile ("cp.async.commit_group;"); |
| | ++__head; |
| | ) |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void consumer_wait() |
| | { |
| | NV_IF_TARGET(NV_PROVIDES_SM_80, |
| | if (__head == __tail) { |
| | return; |
| | } |
| |
|
| | const uint8_t __prior = __head - __tail - 1; |
| | device::__pipeline_consumer_wait(*this, __prior); |
| | ++__tail; |
| | ) |
| | } |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void consumer_release() {} |
| | |
| | template<class _Rep, class _Period> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration) |
| | { |
| | (void)__duration; |
| | consumer_wait(); |
| | return true; |
| | } |
| |
|
| | template<class _Clock, class _Duration> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point) |
| | { |
| | (void)__time_point; |
| | consumer_wait(); |
| | return true; |
| | } |
| |
|
| | private: |
| | uint8_t __head; |
| | uint8_t __tail; |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | pipeline() |
| | : __head(0) |
| | , __tail(0) |
| | {} |
| |
|
| | friend _LIBCUDACXX_INLINE_VISIBILITY inline pipeline<thread_scope_thread> make_pipeline(); |
| |
|
| | template<uint8_t _Prior> |
| | friend _LIBCUDACXX_INLINE_VISIBILITY |
| | void pipeline_consumer_wait_prior(pipeline<thread_scope_thread> & __pipeline); |
| | }; |
| |
|
| | _LIBCUDACXX_END_NAMESPACE_CUDA |
| |
|
| | _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE |
| |
|
| | template<uint8_t _Prior> |
| | __device__ |
| | void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline) |
| | { |
| | (void)__pipeline; |
| | NV_IF_TARGET(NV_PROVIDES_SM_80, |
| | constexpr uint8_t __max_prior = 8; |
| |
|
| | asm volatile ("cp.async.wait_group %0;" |
| | : |
| | : "n"(_Prior < __max_prior ? _Prior : __max_prior)); |
| | ) |
| | } |
| |
|
| | __device__ |
| | inline void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline, uint8_t __prior) |
| | { |
| | switch (__prior) { |
| | case 0: device::__pipeline_consumer_wait<0>(__pipeline); break; |
| | case 1: device::__pipeline_consumer_wait<1>(__pipeline); break; |
| | case 2: device::__pipeline_consumer_wait<2>(__pipeline); break; |
| | case 3: device::__pipeline_consumer_wait<3>(__pipeline); break; |
| | case 4: device::__pipeline_consumer_wait<4>(__pipeline); break; |
| | case 5: device::__pipeline_consumer_wait<5>(__pipeline); break; |
| | case 6: device::__pipeline_consumer_wait<6>(__pipeline); break; |
| | case 7: device::__pipeline_consumer_wait<7>(__pipeline); break; |
| | default: device::__pipeline_consumer_wait<8>(__pipeline); break; |
| | } |
| | } |
| |
|
| | _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE |
| |
|
| | _LIBCUDACXX_BEGIN_NAMESPACE_CUDA |
| |
|
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | inline pipeline<thread_scope_thread> make_pipeline() |
| | { |
| | return pipeline<thread_scope_thread>(); |
| | } |
| |
|
| | template<uint8_t _Prior> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void pipeline_consumer_wait_prior(pipeline<thread_scope_thread> & __pipeline) |
| | { |
| | NV_IF_TARGET(NV_PROVIDES_SM_80, |
| | device::__pipeline_consumer_wait<_Prior>(__pipeline); |
| | __pipeline.__tail = __pipeline.__head - _Prior; |
| | ) |
| | } |
| |
|
| | template<thread_scope _Scope> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void pipeline_producer_commit(pipeline<thread_scope_thread> & __pipeline, barrier<_Scope> & __barrier) |
| | { |
| | (void)__pipeline; |
| | NV_IF_TARGET(NV_PROVIDES_SM_80, |
| | __memcpy_async_synchronize(__barrier, true); |
| | ) |
| | } |
| |
|
| | template<thread_scope _Scope> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void __memcpy_async_synchronize(pipeline<_Scope> & __pipeline, bool __is_async) { |
| | |
| | (void)__pipeline; |
| | (void)__is_async; |
| | } |
| |
|
| | template<typename _Group, class _Type, thread_scope _Scope> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) |
| | { |
| | |
| | |
| | |
| | |
| | #if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408 |
| | static_assert(_CUDA_VSTD::is_trivially_copyable<_Type>::value, "memcpy_async requires a trivially copyable type"); |
| | #endif |
| |
|
| | __memcpy_async<alignof(_Type)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline); |
| | } |
| |
|
| | template<typename _Group, class _Type, std::size_t _Alignment, thread_scope _Scope, std::size_t _Larger_alignment = (alignof(_Type) > _Alignment) ? alignof(_Type) : _Alignment> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) { |
| | |
| | |
| | |
| | |
| | #if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408 |
| | static_assert(_CUDA_VSTD::is_trivially_copyable<_Type>::value, "memcpy_async requires a trivially copyable type"); |
| | #endif |
| |
|
| | __memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline); |
| | } |
| |
|
| | template<class _Type, typename _Size, thread_scope _Scope> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void memcpy_async(_Type * __destination, _Type const * __source, _Size __size, pipeline<_Scope> & __pipeline) { |
| | memcpy_async(__single_thread_group{}, __destination, __source, __size, __pipeline); |
| | } |
| |
|
| | template<typename _Group, thread_scope _Scope> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) { |
| | __memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline); |
| | } |
| |
|
| | template<typename _Group, std::size_t _Alignment, thread_scope _Scope> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) { |
| | __memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline); |
| | } |
| |
|
| | template<typename _Size, thread_scope _Scope> |
| | _LIBCUDACXX_INLINE_VISIBILITY |
| | void memcpy_async(void * __destination, void const * __source, _Size __size, pipeline<_Scope> & __pipeline) { |
| | memcpy_async(__single_thread_group{}, __destination, __source, __size, __pipeline); |
| | } |
| |
|
| | _LIBCUDACXX_END_NAMESPACE_CUDA |
| |
|
| | #endif |
| |
|