thrust / install /include /cub /agent /agent_adjacent_difference.cuh

thanks to nvidia ❤

0dc1b04 over 2 years ago

9.73 kB

	/******************************************************************************
	* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	#pragma once

	#include "../config.cuh"
	#include "../util_type.cuh"
	#include "../util_namespace.cuh"
	#include "../block/block_load.cuh"
	#include "../block/block_store.cuh"
	#include "../block/block_adjacent_difference.cuh"

	#include <thrust/system/cuda/detail/core/util.h>


	CUB_NAMESPACE_BEGIN


	template <
	int _BLOCK_THREADS,
	int _ITEMS_PER_THREAD = 1,
	cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
	cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG,
	cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
	struct AgentAdjacentDifferencePolicy
	{
	static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
	static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
	static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;

	static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
	static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
	static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
	};

	template <typename Policy,
	typename InputIteratorT,
	typename OutputIteratorT,
	typename DifferenceOpT,
	typename OffsetT,
	typename InputT,
	typename OutputT,
	bool MayAlias,
	bool ReadLeft>
	struct AgentDifference
	{
	using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;

	using BlockLoad = typename cub::BlockLoadType<Policy, LoadIt>::type;
	using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;

	using BlockAdjacentDifferenceT =
	cub::BlockAdjacentDifference<InputT, Policy::BLOCK_THREADS>;

	union _TempStorage
	{
	typename BlockLoad::TempStorage load;
	typename BlockStore::TempStorage store;
	typename BlockAdjacentDifferenceT::TempStorage adjacent_difference;
	};

	/// Alias wrapper allowing storage to be unioned
	struct TempStorage : Uninitialized<_TempStorage> {};

	static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS;
	static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
	static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
	static constexpr int SHARED_MEMORY_SIZE = static_cast<int>(sizeof(TempStorage));

	_TempStorage &temp_storage;
	InputIteratorT input_it;
	LoadIt load_it;
	InputT *first_tile_previous;
	OutputIteratorT result;
	DifferenceOpT difference_op;
	OffsetT num_items;

	__device__ __forceinline__ AgentDifference(TempStorage &temp_storage,
	InputIteratorT input_it,
	InputT *first_tile_previous,
	OutputIteratorT result,
	DifferenceOpT difference_op,
	OffsetT num_items)
	: temp_storage(temp_storage.Alias())
	, input_it(input_it)
	, load_it(
	THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(),
	input_it))
	, first_tile_previous(first_tile_previous)
	, result(result)
	, difference_op(difference_op)
	, num_items(num_items)
	{}

	template <bool IS_LAST_TILE,
	bool IS_FIRST_TILE>
	__device__ __forceinline__ void consume_tile_impl(int num_remaining,
	int tile_idx,
	OffsetT tile_base)
	{
	InputT input[ITEMS_PER_THREAD];
	OutputT output[ITEMS_PER_THREAD];

	if (IS_LAST_TILE)
	{
	// Fill last elements with the first element
	// because collectives are not suffix guarded
	BlockLoad(temp_storage.load)
	.Load(load_it + tile_base, input, num_remaining, *(load_it + tile_base));
	}
	else
	{
	BlockLoad(temp_storage.load).Load(load_it + tile_base, input);
	}

	CTA_SYNC();

	if (ReadLeft)
	{
	if (IS_FIRST_TILE)
	{
	if (IS_LAST_TILE)
	{
	BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
	.SubtractLeftPartialTile(input,
	output,
	difference_op,
	num_remaining);
	}
	else
	{
	BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
	.SubtractLeft(input, output, difference_op);
	}
	}
	else
	{
	InputT tile_prev_input = MayAlias
	? first_tile_previous[tile_idx]
	: *(input_it + tile_base - 1);

	if (IS_LAST_TILE)
	{
	BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
	.SubtractLeftPartialTile(input,
	output,
	difference_op,
	num_remaining,
	tile_prev_input);
	}
	else
	{
	BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
	.SubtractLeft(input, output, difference_op, tile_prev_input);
	}
	}
	}
	else
	{
	if (IS_LAST_TILE)
	{
	BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
	.SubtractRightPartialTile(input, output, difference_op, num_remaining);
	}
	else
	{
	InputT tile_next_input = MayAlias
	? first_tile_previous[tile_idx]
	: *(input_it + tile_base + ITEMS_PER_TILE);

	BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
	.SubtractRight(input, output, difference_op, tile_next_input);
	}
	}

	CTA_SYNC();

	if (IS_LAST_TILE)
	{
	BlockStore(temp_storage.store)
	.Store(result + tile_base, output, num_remaining);
	}
	else
	{
	BlockStore(temp_storage.store).Store(result + tile_base, output);
	}
	}

	template <bool IS_LAST_TILE>
	__device__ __forceinline__ void consume_tile(int num_remaining,
	int tile_idx,
	OffsetT tile_base)
	{
	if (tile_idx == 0)
	{
	consume_tile_impl<IS_LAST_TILE, true>(num_remaining,
	tile_idx,
	tile_base);
	}
	else
	{
	consume_tile_impl<IS_LAST_TILE, false>(num_remaining,
	tile_idx,
	tile_base);
	}
	}

	__device__ __forceinline__ void Process(int tile_idx,
	OffsetT tile_base)
	{
	OffsetT num_remaining = num_items - tile_base;

	if (num_remaining > ITEMS_PER_TILE) // not a last tile
	{
	consume_tile<false>(num_remaining, tile_idx, tile_base);
	}
	else
	{
	consume_tile<true>(num_remaining, tile_idx, tile_base);
	}
	}
	};

	template <typename InputIteratorT,
	typename InputT,
	typename OffsetT,
	bool ReadLeft>
	struct AgentDifferenceInit
	{
	static constexpr int BLOCK_THREADS = 128;

	static __device__ __forceinline__ void Process(int tile_idx,
	InputIteratorT first,
	InputT *result,
	OffsetT num_tiles,
	int items_per_tile)
	{
	OffsetT tile_base = static_cast<OffsetT>(tile_idx) * items_per_tile;

	if (tile_base > 0 && tile_idx < num_tiles)
	{
	if (ReadLeft)
	{
	result[tile_idx] = first[tile_base - 1];
	}
	else
	{
	result[tile_idx - 1] = first[tile_base];
	}
	}
	}
	};


	CUB_NAMESPACE_END