grand_parent: Extended API
parent: Asynchronous Operations
cuda::memcpy_async
Defined in header <cuda/barrier>:
// (1)
template <typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
// (2)
template <typename Group,
typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
Defined in header <cuda/pipeline>:
// (3)
template <typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
// (4)
template <typename Group, typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
Defined in header <cuda/annotated_ptr>:
// (5)
template <typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (6)
template<typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (7)
template<typename Group, typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (8)
template<typename Group, typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
cuda::memcpy_async asynchronously copies size bytes from the memory
location pointed to by source to the memory location pointed to by
destination.
Both objects are reinterpreted as arrays of unsigned char.
- Binds the asynchronous copy completion to
cuda::barrierand issues the copy in the current thread. - Binds the asynchronous copy completion to
cuda::barrierand cooperatively issues the copy across all threads ingroup. - Binds the asynchronous copy completion to
cuda::pipelineand issues the copy in the current thread. - Binds the asynchronous copy completion to
cuda::pipelineand cooperatively issues the copy across all threads ingroup. - 5-8: convenience wrappers using
cuda::annotated_ptrwhereSyncis eithercuda::barrierorcuda::pipeline.
Notes
cuda::memcpy_async have similar constraints to std::memcpy, namely:
- If the objects overlap, the behavior is undefined.
- If either
destinationorsourceis an invalid or null pointer, the behavior is undefined (even ifcountis zero). - If the objects are potentially-overlapping the behavior is undefined.
- If the objects are not of TriviallyCopyable type the program is ill-formed, no diagnostic required.
If Shape is cuda::aligned_size_t, source and destination are both
required to be aligned on cuda::aligned_size_t::align, else the behavior is
undefined.
If cuda::pipeline is in a quitted state (see cuda::pipeline::quit), the
behavior is undefined.
For cooperative variants, if the parameters are not the same across all threads
in group, the behavior is undefined.
Template Parameters
| Group | A type satisfying the [Group] concept. |
| Shape | Either cuda::std::size_t or cuda::aligned_size_t. |
Parameters
| group | The group of threads. |
| destination | Pointer to the memory location to copy to. |
| source | Pointer to the memory location to copy from. |
| size | The number of bytes to copy. |
| barrier | The barrier object used to wait on the copy completion. |
| pipeline | The pipeline object used to wait on the copy completion. |
Examples
#include <cuda/barrier>
__global__ void example_kernel(char* dst, char* src) {
cuda::barrier<cuda::thread_scope_system> bar;
init(&bar, 1);
cuda::memcpy_async(dst, src, 1, bar);
cuda::memcpy_async(dst + 1, src + 8, 1, bar);
cuda::memcpy_async(dst + 2, src + 16, 1, bar);
cuda::memcpy_async(dst + 3, src + 24, 1, bar);
bar.arrive_and_wait();
}
See it on Godbolt{: .btn }