thrust / install /include /cub /agent /agent_adjacent_difference.cuh
camenduru's picture
thanks to nvidia ❤
0dc1b04
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include "../config.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_adjacent_difference.cuh"
#include <thrust/system/cuda/detail/core/util.h>
CUB_NAMESPACE_BEGIN
template <
int _BLOCK_THREADS,
int _ITEMS_PER_THREAD = 1,
cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG,
cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
struct AgentAdjacentDifferencePolicy
{
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
static constexpr cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
};
template <typename Policy,
typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT,
typename OffsetT,
typename InputT,
typename OutputT,
bool MayAlias,
bool ReadLeft>
struct AgentDifference
{
using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;
using BlockLoad = typename cub::BlockLoadType<Policy, LoadIt>::type;
using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;
using BlockAdjacentDifferenceT =
cub::BlockAdjacentDifference<InputT, Policy::BLOCK_THREADS>;
union _TempStorage
{
typename BlockLoad::TempStorage load;
typename BlockStore::TempStorage store;
typename BlockAdjacentDifferenceT::TempStorage adjacent_difference;
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
static constexpr int BLOCK_THREADS = Policy::BLOCK_THREADS;
static constexpr int ITEMS_PER_THREAD = Policy::ITEMS_PER_THREAD;
static constexpr int ITEMS_PER_TILE = Policy::ITEMS_PER_TILE;
static constexpr int SHARED_MEMORY_SIZE = static_cast<int>(sizeof(TempStorage));
_TempStorage &temp_storage;
InputIteratorT input_it;
LoadIt load_it;
InputT *first_tile_previous;
OutputIteratorT result;
DifferenceOpT difference_op;
OffsetT num_items;
__device__ __forceinline__ AgentDifference(TempStorage &temp_storage,
InputIteratorT input_it,
InputT *first_tile_previous,
OutputIteratorT result,
DifferenceOpT difference_op,
OffsetT num_items)
: temp_storage(temp_storage.Alias())
, input_it(input_it)
, load_it(
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(),
input_it))
, first_tile_previous(first_tile_previous)
, result(result)
, difference_op(difference_op)
, num_items(num_items)
{}
template <bool IS_LAST_TILE,
bool IS_FIRST_TILE>
__device__ __forceinline__ void consume_tile_impl(int num_remaining,
int tile_idx,
OffsetT tile_base)
{
InputT input[ITEMS_PER_THREAD];
OutputT output[ITEMS_PER_THREAD];
if (IS_LAST_TILE)
{
// Fill last elements with the first element
// because collectives are not suffix guarded
BlockLoad(temp_storage.load)
.Load(load_it + tile_base, input, num_remaining, *(load_it + tile_base));
}
else
{
BlockLoad(temp_storage.load).Load(load_it + tile_base, input);
}
CTA_SYNC();
if (ReadLeft)
{
if (IS_FIRST_TILE)
{
if (IS_LAST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeftPartialTile(input,
output,
difference_op,
num_remaining);
}
else
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op);
}
}
else
{
InputT tile_prev_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);
if (IS_LAST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeftPartialTile(input,
output,
difference_op,
num_remaining,
tile_prev_input);
}
else
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractLeft(input, output, difference_op, tile_prev_input);
}
}
}
else
{
if (IS_LAST_TILE)
{
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractRightPartialTile(input, output, difference_op, num_remaining);
}
else
{
InputT tile_next_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base + ITEMS_PER_TILE);
BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
.SubtractRight(input, output, difference_op, tile_next_input);
}
}
CTA_SYNC();
if (IS_LAST_TILE)
{
BlockStore(temp_storage.store)
.Store(result + tile_base, output, num_remaining);
}
else
{
BlockStore(temp_storage.store).Store(result + tile_base, output);
}
}
template <bool IS_LAST_TILE>
__device__ __forceinline__ void consume_tile(int num_remaining,
int tile_idx,
OffsetT tile_base)
{
if (tile_idx == 0)
{
consume_tile_impl<IS_LAST_TILE, true>(num_remaining,
tile_idx,
tile_base);
}
else
{
consume_tile_impl<IS_LAST_TILE, false>(num_remaining,
tile_idx,
tile_base);
}
}
__device__ __forceinline__ void Process(int tile_idx,
OffsetT tile_base)
{
OffsetT num_remaining = num_items - tile_base;
if (num_remaining > ITEMS_PER_TILE) // not a last tile
{
consume_tile<false>(num_remaining, tile_idx, tile_base);
}
else
{
consume_tile<true>(num_remaining, tile_idx, tile_base);
}
}
};
template <typename InputIteratorT,
typename InputT,
typename OffsetT,
bool ReadLeft>
struct AgentDifferenceInit
{
static constexpr int BLOCK_THREADS = 128;
static __device__ __forceinline__ void Process(int tile_idx,
InputIteratorT first,
InputT *result,
OffsetT num_tiles,
int items_per_tile)
{
OffsetT tile_base = static_cast<OffsetT>(tile_idx) * items_per_tile;
if (tile_base > 0 && tile_idx < num_tiles)
{
if (ReadLeft)
{
result[tile_idx] = first[tile_base - 1];
}
else
{
result[tile_idx - 1] = first[tile_base];
}
}
}
};
CUB_NAMESPACE_END