thrust / install /include /cub /device /dispatch /dispatch_adjacent_difference.cuh

thanks to nvidia ❤

0dc1b04 over 2 years ago

14.3 kB

	/******************************************************************************
	* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	#pragma once

	#include <cub/agent/agent_adjacent_difference.cuh>
	#include <cub/config.cuh>
	#include <cub/detail/type_traits.cuh>
	#include <cub/util_debug.cuh>
	#include <cub/util_deprecated.cuh>
	#include <cub/util_device.cuh>
	#include <cub/util_math.cuh>
	#include <cub/util_namespace.cuh>

	#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

	#include <iterator>

	CUB_NAMESPACE_BEGIN


	template <typename AgentDifferenceInitT,
	typename InputIteratorT,
	typename InputT,
	typename OffsetT>
	void __global__ DeviceAdjacentDifferenceInitKernel(InputIteratorT first,
	InputT *result,
	OffsetT num_tiles,
	int items_per_tile)
	{
	const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
	AgentDifferenceInitT::Process(tile_idx,
	first,
	result,
	num_tiles,
	items_per_tile);
	}

	template <typename ChainedPolicyT,
	typename InputIteratorT,
	typename OutputIteratorT,
	typename DifferenceOpT,
	typename OffsetT,
	typename InputT,
	bool MayAlias,
	bool ReadLeft>
	void __global__
	DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input,
	InputT *first_tile_previous,
	OutputIteratorT result,
	DifferenceOpT difference_op,
	OffsetT num_items)
	{
	using ActivePolicyT =
	typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy;

	// It is OK to introspect the return type or parameter types of the
	// `operator()` function of `__device__` extended lambda within device code.
	using OutputT = detail::invoke_result_t<DifferenceOpT, InputT, InputT>;

	using Agent = AgentDifference<ActivePolicyT,
	InputIteratorT,
	OutputIteratorT,
	DifferenceOpT,
	OffsetT,
	InputT,
	OutputT,
	MayAlias,
	ReadLeft>;

	__shared__ typename Agent::TempStorage storage;

	Agent agent(storage,
	input,
	first_tile_previous,
	result,
	difference_op,
	num_items);

	int tile_idx = static_cast<int>(blockIdx.x);
	OffsetT tile_base = static_cast<OffsetT>(tile_idx)
	* ActivePolicyT::ITEMS_PER_TILE;

	agent.Process(tile_idx, tile_base);
	}

	template <typename InputIteratorT, bool MayAlias = true>
	struct DeviceAdjacentDifferencePolicy
	{
	using ValueT = typename std::iterator_traits<InputIteratorT>::value_type;

	//------------------------------------------------------------------------------
	// Architecture-specific tuning policies
	//------------------------------------------------------------------------------

	struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
	{
	using AdjacentDifferencePolicy =
	AgentAdjacentDifferencePolicy<128,
	Nominal8BItemsToItems<ValueT>(7),
	BLOCK_LOAD_WARP_TRANSPOSE,
	LOAD_DEFAULT,
	BLOCK_STORE_WARP_TRANSPOSE>;
	};

	struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
	{
	using AdjacentDifferencePolicy =
	AgentAdjacentDifferencePolicy<128,
	Nominal8BItemsToItems<ValueT>(7),
	BLOCK_LOAD_WARP_TRANSPOSE,
	MayAlias ? LOAD_CA : LOAD_LDG,
	BLOCK_STORE_WARP_TRANSPOSE>;
	};

	using MaxPolicy = Policy350;
	};

	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename DifferenceOpT,
	typename OffsetT,
	bool MayAlias,
	bool ReadLeft,
	typename SelectedPolicy =
	DeviceAdjacentDifferencePolicy<InputIteratorT, MayAlias>>
	struct DispatchAdjacentDifference : public SelectedPolicy
	{
	using InputT = typename std::iterator_traits<InputIteratorT>::value_type;

	void *d_temp_storage;
	std::size_t &temp_storage_bytes;
	InputIteratorT d_input;
	OutputIteratorT d_output;
	OffsetT num_items;
	DifferenceOpT difference_op;
	cudaStream_t stream;

	CUB_RUNTIME_FUNCTION __forceinline__
	DispatchAdjacentDifference(void *d_temp_storage,
	std::size_t &temp_storage_bytes,
	InputIteratorT d_input,
	OutputIteratorT d_output,
	OffsetT num_items,
	DifferenceOpT difference_op,
	cudaStream_t stream)
	: d_temp_storage(d_temp_storage)
	, temp_storage_bytes(temp_storage_bytes)
	, d_input(d_input)
	, d_output(d_output)
	, num_items(num_items)
	, difference_op(difference_op)
	, stream(stream)
	{}

	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_DEPRECATED CUB_RUNTIME_FUNCTION __forceinline__
	DispatchAdjacentDifference(void *d_temp_storage,
	std::size_t &temp_storage_bytes,
	InputIteratorT d_input,
	OutputIteratorT d_output,
	OffsetT num_items,
	DifferenceOpT difference_op,
	cudaStream_t stream,
	bool debug_synchronous)
	: d_temp_storage(d_temp_storage)
	, temp_storage_bytes(temp_storage_bytes)
	, d_input(d_input)
	, d_output(d_output)
	, num_items(num_items)
	, difference_op(difference_op)
	, stream(stream)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
	}

	/// Invocation
	template <typename ActivePolicyT>
	CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
	{
	using AdjacentDifferencePolicyT =
	typename ActivePolicyT::AdjacentDifferencePolicy;

	using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy;

	cudaError error = cudaSuccess;

	do
	{
	const int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE;
	const int num_tiles =
	static_cast<int>(DivideAndRoundUp(num_items, tile_size));

	std::size_t first_tile_previous_size = MayAlias * num_tiles *
	sizeof(InputT);

	void *allocations[1] = {nullptr};
	std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size};

	if (CubDebug(error = AliasTemporaries(d_temp_storage,
	temp_storage_bytes,
	allocations,
	allocation_sizes)))
	{
	break;
	}

	if (d_temp_storage == nullptr)
	{
	// Return if the caller is simply requesting the size of the storage
	// allocation

	if (temp_storage_bytes == 0)
	{
	temp_storage_bytes = 1;
	}

	break;
	}

	if (num_items == OffsetT{})
	{
	break;
	}

	auto first_tile_previous = reinterpret_cast<InputT*>(allocations[0]);

	if (MayAlias)
	{
	using AgentDifferenceInitT =
	AgentDifferenceInit<InputIteratorT, InputT, OffsetT, ReadLeft>;

	const int init_block_size = AgentDifferenceInitT::BLOCK_THREADS;
	const int init_grid_size = DivideAndRoundUp(num_tiles, init_block_size);

	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking DeviceAdjacentDifferenceInitKernel"
	"<<<%d, %d, 0, %lld>>>()\n",
	init_grid_size,
	init_block_size,
	reinterpret_cast<long long>(stream));
	#endif

	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size,
	init_block_size,
	0,
	stream)
	.doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT,
	InputIteratorT,
	InputT,
	OffsetT>,
	d_input,
	first_tile_previous,
	num_tiles,
	tile_size);

	error = detail::DebugSyncStream(stream);

	if (CubDebug(error))
	{
	break;
	}

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError()))
	{
	break;
	}
	}

	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel"
	"<<<%d, %d, 0, %lld>>>()\n",
	num_tiles,
	AdjacentDifferencePolicyT::BLOCK_THREADS,
	reinterpret_cast<long long>(stream));
	#endif

	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	num_tiles,
	AdjacentDifferencePolicyT::BLOCK_THREADS,
	0,
	stream)
	.doit(DeviceAdjacentDifferenceDifferenceKernel<MaxPolicyT,
	InputIteratorT,
	OutputIteratorT,
	DifferenceOpT,
	OffsetT,
	InputT,
	MayAlias,
	ReadLeft>,
	d_input,
	first_tile_previous,
	d_output,
	difference_op,
	num_items);

	error = detail::DebugSyncStream(stream);

	if (CubDebug(error))
	{
	break;
	}

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError()))
	{
	break;
	}
	} while (0);

	return error;
	}

	CUB_RUNTIME_FUNCTION
	static cudaError_t Dispatch(void *d_temp_storage,
	std::size_t &temp_storage_bytes,
	InputIteratorT d_input,
	OutputIteratorT d_output,
	OffsetT num_items,
	DifferenceOpT difference_op,
	cudaStream_t stream)
	{
	using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy;

	cudaError error = cudaSuccess;
	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version)))
	{
	break;
	}

	// Create dispatch functor
	DispatchAdjacentDifference dispatch(d_temp_storage,
	temp_storage_bytes,
	d_input,
	d_output,
	num_items,
	difference_op,
	stream);

	// Dispatch to chained policy
	if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
	{
	break;
	}
	} while (0);

	return error;
	}

	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION
	static cudaError_t Dispatch(void *d_temp_storage,
	std::size_t &temp_storage_bytes,
	InputIteratorT d_input,
	OutputIteratorT d_output,
	OffsetT num_items,
	DifferenceOpT difference_op,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_input,
	d_output,
	num_items,
	difference_op,
	stream);
	}
	};


	CUB_NAMESPACE_END