| /****************************************************************************** |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of the NVIDIA CORPORATION nor the |
| * names of its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| ******************************************************************************/ |
| |
| /** |
| * \file |
| * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram. |
| */ |
| |
| #pragma once |
| |
| #include <iterator> |
| |
| #include "../util_type.cuh" |
| #include "../block/block_load.cuh" |
| #include "../config.cuh" |
| #include "../grid/grid_queue.cuh" |
| #include "../iterator/cache_modified_input_iterator.cuh" |
| |
| CUB_NAMESPACE_BEGIN |
| |
| |
| /****************************************************************************** |
| * Tuning policy |
| ******************************************************************************/ |
| |
| /** |
| * Memory preference for privatized histogram bin counters |
| */ |
| enum BlockHistogramMemoryPreference |
| { |
| GMEM, |
| SMEM, |
| BLEND |
| }; |
| |
| |
| /** |
| * Parameterizable tuning policy type for AgentHistogram |
| */ |
| template < |
| int _BLOCK_THREADS, ///< Threads per thread block |
| int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) |
| BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use |
| CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements |
| bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming |
| BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) |
| bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue |
| struct AgentHistogramPolicy |
| { |
| enum |
| { |
| BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block |
| PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) |
| IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming |
| MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) |
| IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue |
| }; |
| |
| static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use |
| static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements |
| }; |
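| |
| // A hypothetical instantiation (illustrative only; the concrete values are |
| // tuning decisions normally made by the dispatch layer): |
| // |
| //     using HistogramPolicy = AgentHistogramPolicy< |
| //         256,                // _BLOCK_THREADS: threads per block |
| //         8,                  // _PIXELS_PER_THREAD: pixels per thread per tile |
| //         BLOCK_LOAD_DIRECT,  // _LOAD_ALGORITHM |
| //         LOAD_LDG,           // _LOAD_MODIFIER: cached loads via the read-only path |
| //         true,               // _RLE_COMPRESS: run-length compress samples |
| //         BLEND,              // _MEM_PREFERENCE: alternate smem/gmem by block |
| //         true>;              // _WORK_STEALING: dequeue tiles from a queue |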
| |
| |
| /****************************************************************************** |
| * Thread block abstractions |
| ******************************************************************************/ |
| |
| /** |
| * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram. |
| */ |
| template < |
| typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type |
| int PRIVATIZED_SMEM_BINS, ///< Maximum number of privatized shared-memory histogram bins for any channel. Zero indicates that privatized counters are maintained in device-accessible (global) memory instead. |
| int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. |
| int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed |
| typename SampleIteratorT, ///< Random-access input iterator type for reading samples |
| typename CounterT, ///< Integer type for counting sample occurrences per histogram bin |
| typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel |
| typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel |
| typename OffsetT, ///< Signed integer type for global offsets |
| int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) |
| struct AgentHistogram |
| { |
| //--------------------------------------------------------------------- |
| // Types and constants |
| //--------------------------------------------------------------------- |
| |
| /// The sample type of the input iterator |
| using SampleT = cub::detail::value_t<SampleIteratorT>; |
| |
| /// The pixel type of SampleT |
| using PixelT = typename CubVector<SampleT, NUM_CHANNELS>::Type; |
| |
| /// The quad type of SampleT |
| using QuadT = typename CubVector<SampleT, 4>::Type; |
| |
| /// Constants |
| enum |
| { |
| BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, |
| |
| PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, |
| SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, |
| QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, |
| |
| TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, |
| TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, |
| |
| IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, |
| |
| MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? |
| AgentHistogramPolicyT::MEM_PREFERENCE : |
| GMEM, |
| |
| IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, |
| }; |
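| |
| // Worked constants (illustrative): with BLOCK_THREADS = 256, |
| // PIXELS_PER_THREAD = 8, and NUM_CHANNELS = 4 (e.g., RGBA data): |
| //   SAMPLES_PER_THREAD = 8 * 4    = 32 |
| //   QUADS_PER_THREAD   = 32 / 4   = 8 |
| //   TILE_PIXELS        = 8 * 256  = 2048 |
| //   TILE_SAMPLES       = 32 * 256 = 8192 |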
| |
| /// Cache load modifier for reading input elements |
| static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; |
| |
|
|
| /// Input iterator wrapper type (for applying cache modifier) |
| // Wrap the native input pointer with CacheModifiedInputIterator |
| // or directly use the supplied input iterator type |
| using WrappedSampleIteratorT = cub::detail::conditional_t< |
| std::is_pointer<SampleIteratorT>::value, |
| CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>, |
| SampleIteratorT>; |
| |
| /// Pixel input iterator type (for applying cache modifier) |
| typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT> |
| WrappedPixelIteratorT; |
| |
| /// Quad input iterator type (for applying cache modifier) |
| typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT> |
| WrappedQuadIteratorT; |
| |
| /// Parameterized BlockLoad type for samples |
| typedef BlockLoad< |
| SampleT, |
| BLOCK_THREADS, |
| SAMPLES_PER_THREAD, |
| AgentHistogramPolicyT::LOAD_ALGORITHM> |
| BlockLoadSampleT; |
| |
| /// Parameterized BlockLoad type for pixels |
| typedef BlockLoad< |
| PixelT, |
| BLOCK_THREADS, |
| PIXELS_PER_THREAD, |
| AgentHistogramPolicyT::LOAD_ALGORITHM> |
| BlockLoadPixelT; |
| |
| /// Parameterized BlockLoad type for quads |
| typedef BlockLoad< |
| QuadT, |
| BLOCK_THREADS, |
| QUADS_PER_THREAD, |
| AgentHistogramPolicyT::LOAD_ALGORITHM> |
| BlockLoadQuadT; |
| |
| /// Shared memory type required by this thread block |
| struct _TempStorage |
| { |
| CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) |
| |
| int tile_idx; |
| |
| // Aliasable storage layout |
| union Aliasable |
| { |
| typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples |
| typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels |
| typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads |
| |
| } aliasable; |
| }; |
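| |
| // Illustrative footprint: with NUM_ACTIVE_CHANNELS = 3, PRIVATIZED_SMEM_BINS = 256, |
| // and CounterT = int, the histograms member occupies 3 * (256 + 1) * 4 = 3084 bytes |
| // of shared memory (the "+ 1" being the word of padding noted above). |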
| |
|
|
| /// Temporary storage type (unionable) |
| struct TempStorage : Uninitialized<_TempStorage> {}; |
| |
|
|
| //--------------------------------------------------------------------- |
| // Per-thread fields |
| //--------------------------------------------------------------------- |
| |
| /// Reference to temp_storage |
| _TempStorage &temp_storage; |
| |
| /// Sample input iterator (with cache modifier applied, if possible) |
| WrappedSampleIteratorT d_wrapped_samples; |
| |
| /// Native pointer for input samples (possibly NULL if unavailable) |
| SampleT* d_native_samples; |
| |
| /// The number of output bins for each channel |
| int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; |
| |
| /// The number of privatized bins for each channel |
| int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; |
| |
| /// Reference to gmem privatized histograms for each channel |
| CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; |
| |
| /// Reference to final output histograms (gmem) |
| CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; |
| |
| /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel |
| OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; |
| |
| /// The transform operator for determining privatized counter indices from samples, one for each channel |
| PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; |
| |
| /// Whether to prefer privatized smem counters vs privatized global counters |
| bool prefer_smem; |
| |
|
|
| //--------------------------------------------------------------------- |
| // Initialize privatized bin counters |
| //--------------------------------------------------------------------- |
| |
| // Initialize privatized bin counters |
| __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) |
| { |
| // Initialize histogram bin counts to zeros |
| #pragma unroll |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| { |
| for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) |
| { |
| privatized_histograms[CHANNEL][privatized_bin] = 0; |
| } |
| } |
| |
| // Barrier to make sure all threads are done updating counters |
| CTA_SYNC(); |
| } |
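| |
| // For example (illustrative), with BLOCK_THREADS = 256 and 512 privatized bins |
| // in a channel, thread t zeroes bins t and t + 256; the block-strided loop |
| // covers any bin count without assumptions about divisibility. |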
| |
|
|
| // Initialize privatized bin counters. Specialized for privatized shared-memory counters |
| __device__ __forceinline__ void InitSmemBinCounters() |
| { |
| CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; |
| |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; |
| |
| InitBinCounters(privatized_histograms); |
| } |
| |
|
|
| // Initialize privatized bin counters. Specialized for privatized global-memory counters |
| __device__ __forceinline__ void InitGmemBinCounters() |
| { |
| InitBinCounters(d_privatized_histograms); |
| } |
| |
|
|
| //--------------------------------------------------------------------- |
| // Update final output histograms |
| //--------------------------------------------------------------------- |
| |
| // Update final output histograms from privatized histograms |
| __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) |
| { |
| // Barrier to make sure all threads are done updating counters |
| CTA_SYNC(); |
| |
| // Apply privatized bin counts to output bin counts |
| #pragma unroll |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| { |
| int channel_bins = num_privatized_bins[CHANNEL]; |
| for (int privatized_bin = threadIdx.x; |
| privatized_bin < channel_bins; |
| privatized_bin += BLOCK_THREADS) |
| { |
| int output_bin = -1; |
| CounterT count = privatized_histograms[CHANNEL][privatized_bin]; |
| bool is_valid = count > 0; |
| |
| output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid); |
| |
| if (output_bin >= 0) |
| { |
| atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); |
| } |
| |
| } |
| } |
| } |
| |
|
|
| // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters |
| __device__ __forceinline__ void StoreSmemOutput() |
| { |
| CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; |
| |
| StoreOutput(privatized_histograms); |
| } |
| |
|
|
| // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters |
| __device__ __forceinline__ void StoreGmemOutput() |
| { |
| StoreOutput(d_privatized_histograms); |
| } |
| |
|
|
| //--------------------------------------------------------------------- |
| // Tile accumulation |
| //--------------------------------------------------------------------- |
| |
| // Accumulate pixels. Specialized for RLE compression. |
| __device__ __forceinline__ void AccumulatePixels( |
| SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], |
| bool is_valid[PIXELS_PER_THREAD], |
| CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], |
| Int2Type<true> is_rle_compress) |
| { |
| #pragma unroll |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| { |
| // Bin pixels |
| int bins[PIXELS_PER_THREAD]; |
| |
| #pragma unroll |
| for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) |
| { |
| bins[PIXEL] = -1; |
| privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); |
| } |
| |
| CounterT accumulator = 1; |
| |
| #pragma unroll |
| for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) |
| { |
| if (bins[PIXEL] != bins[PIXEL + 1]) |
| { |
| if (bins[PIXEL] >= 0) |
| atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); |
| |
| accumulator = 0; |
| } |
| accumulator++; |
| } |
| |
| // Last pixel |
| if (bins[PIXELS_PER_THREAD - 1] >= 0) |
| atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); |
| } |
| } |
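| |
| // Worked example (illustrative): with PIXELS_PER_THREAD = 4 and a thread's |
| // decoded bins {5, 5, 5, 7}, the sweep above issues just two atomics |
| // (+3 to bin 5, +1 to bin 7) instead of four, reducing atomic contention on |
| // inputs with runs of identical samples. |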
| |
|
|
| // Accumulate pixels. Specialized for individual accumulation of each pixel. |
| __device__ __forceinline__ void AccumulatePixels( |
| SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], |
| bool is_valid[PIXELS_PER_THREAD], |
| CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], |
| Int2Type<false> is_rle_compress) |
| { |
| #pragma unroll |
| for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) |
| { |
| #pragma unroll |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| { |
| int bin = -1; |
| privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); |
| if (bin >= 0) |
| atomicAdd(privatized_histograms[CHANNEL] + bin, 1); |
| } |
| } |
| } |
| |
|
|
| /** |
| * Accumulate pixel, specialized for smem privatized histogram |
| */ |
| __device__ __forceinline__ void AccumulateSmemPixels( |
| SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], |
| bool is_valid[PIXELS_PER_THREAD]) |
| { |
| CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; |
| |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; |
| |
| AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>()); |
| } |
| |
|
|
| /** |
| * Accumulate pixel, specialized for gmem privatized histogram |
| */ |
| __device__ __forceinline__ void AccumulateGmemPixels( |
| SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], |
| bool is_valid[PIXELS_PER_THREAD]) |
| { |
| AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>()); |
| } |
| |
|
|
|
|
| //--------------------------------------------------------------------- |
| // Tile loading |
| //--------------------------------------------------------------------- |
| |
| // Load full, aligned tile using pixel iterator (multi-channel) |
| template <int _NUM_ACTIVE_CHANNELS> |
| __device__ __forceinline__ void LoadFullAlignedTile( |
| OffsetT block_offset, |
| int valid_samples, |
| SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], |
| Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) |
| { |
| typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; |
| |
| WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); |
| |
| // Load using a wrapped pixel iterator |
| BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( |
| d_wrapped_pixels, |
| reinterpret_cast<AliasedPixels&>(samples)); |
| } |
| |
| // Load full, aligned tile using quad iterator (single-channel) |
| __device__ __forceinline__ void LoadFullAlignedTile( |
| OffsetT block_offset, |
| int valid_samples, |
| SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], |
| Int2Type<1> num_active_channels) |
| { |
| typedef QuadT AliasedQuads[QUADS_PER_THREAD]; |
| |
| WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); |
| |
| // Load using a wrapped quad iterator |
| BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( |
| d_wrapped_quads, |
| reinterpret_cast<AliasedQuads&>(samples)); |
| } |
| |
| // Load full, aligned tile |
| __device__ __forceinline__ void LoadTile( |
| OffsetT block_offset, |
| int valid_samples, |
| SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], |
| Int2Type<true> is_full_tile, |
| Int2Type<true> is_aligned) |
| { |
| LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>()); |
| } |
| |
| // Load full, mis-aligned tile using sample iterator |
| __device__ __forceinline__ void LoadTile( |
| OffsetT block_offset, |
| int valid_samples, |
| SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], |
| Int2Type<true> is_full_tile, |
| Int2Type<false> is_aligned) |
| { |
| typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; |
| |
| // Load using sample iterator |
| BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( |
| d_wrapped_samples + block_offset, |
| reinterpret_cast<AliasedSamples&>(samples)); |
| } |
| |
| // Load partially-full, aligned tile using the pixel iterator |
| __device__ __forceinline__ void LoadTile( |
| OffsetT block_offset, |
| int valid_samples, |
| SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], |
| Int2Type<false> is_full_tile, |
| Int2Type<true> is_aligned) |
| { |
| typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; |
| |
| WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); |
| |
| int valid_pixels = valid_samples / NUM_CHANNELS; |
| |
| // Load using a wrapped pixel iterator |
| BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( |
| d_wrapped_pixels, |
| reinterpret_cast<AliasedPixels&>(samples), |
| valid_pixels); |
| } |
| |
| // Load partially-full, mis-aligned tile using sample iterator |
| __device__ __forceinline__ void LoadTile( |
| OffsetT block_offset, |
| int valid_samples, |
| SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], |
| Int2Type<false> is_full_tile, |
| Int2Type<false> is_aligned) |
| { |
| typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; |
| |
| BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( |
| d_wrapped_samples + block_offset, |
| reinterpret_cast<AliasedSamples&>(samples), |
| valid_samples); |
| } |
| |
|
|
| //--------------------------------------------------------------------- |
| // Tile processing |
| //--------------------------------------------------------------------- |
| |
| // Consume a tile of data samples |
| template < |
| bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) |
| bool IS_FULL_TILE> // Whether the tile is full |
| __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) |
| { |
| SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; |
| bool is_valid[PIXELS_PER_THREAD]; |
| |
| // Load tile |
| LoadTile( |
| block_offset, |
| valid_samples, |
| samples, |
| Int2Type<IS_FULL_TILE>(), |
| Int2Type<IS_ALIGNED>()); |
| |
| // Set valid flags |
| #pragma unroll |
| for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) |
| is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); |
| |
| // Accumulate samples |
| if (prefer_smem) |
| AccumulateSmemPixels(samples, is_valid); |
| else |
| AccumulateGmemPixels(samples, is_valid); |
| } |
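| |
| // Validity example (illustrative): with PIXELS_PER_THREAD = 8, NUM_CHANNELS = 1, |
| // and valid_samples = 1000, thread 124 covers samples 992..999 (all valid), |
| // while thread 125's first pixel maps to sample 1000, so all of its flags |
| // are false for this partial tile. |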
| |
|
|
| // Consume row tiles. Specialized for work-stealing from queue |
| template <bool IS_ALIGNED> |
| __device__ __forceinline__ void ConsumeTiles( |
| OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest |
| OffsetT num_rows, ///< The number of rows in the region of interest |
| OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest |
| int tiles_per_row, ///< Number of image tiles per row |
| GridQueue<int> tile_queue, |
| Int2Type<true> is_work_stealing) |
| { |
| |
| int num_tiles = num_rows * tiles_per_row; |
| int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; |
| OffsetT num_even_share_tiles = gridDim.x * gridDim.y; |
| |
| while (tile_idx < num_tiles) |
| { |
| int row = tile_idx / tiles_per_row; |
| int col = tile_idx - (row * tiles_per_row); |
| OffsetT row_offset = row * row_stride_samples; |
| OffsetT col_offset = (col * TILE_SAMPLES); |
| OffsetT tile_offset = row_offset + col_offset; |
| |
| if (col == tiles_per_row - 1) |
| { |
| // Consume a partially-full tile at the end of the row |
| OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; |
| ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining); |
| } |
| else |
| { |
| // Consume full tile |
| ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES); |
| } |
| |
| CTA_SYNC(); |
| |
| // Get next tile |
| if (threadIdx.x == 0) |
| temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; |
| |
| CTA_SYNC(); |
| |
| tile_idx = temp_storage.tile_idx; |
| } |
| } |
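| |
| // Work-stealing example (illustrative): with an 8-block grid and num_tiles = 11, |
| // each block first consumes its even-share tile (indices 0..7), then blocks |
| // compete to drain the remainder; successive Drain(1) calls yield offsets |
| // 0, 1, 2, ..., i.e., stolen tile indices 8, 9, 10, until tile_idx >= num_tiles. |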
| |
|
|
| // Consume row tiles. Specialized for even-share (striped across thread blocks) |
| template <bool IS_ALIGNED> |
| __device__ __forceinline__ void ConsumeTiles( |
| OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest |
| OffsetT num_rows, ///< The number of rows in the region of interest |
| OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest |
| int tiles_per_row, ///< Number of image tiles per row |
| GridQueue<int> tile_queue, |
| Int2Type<false> is_work_stealing) |
| { |
| for (int row = blockIdx.y; row < num_rows; row += gridDim.y) |
| { |
| OffsetT row_begin = row * row_stride_samples; |
| OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); |
| OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); |
| |
| while (tile_offset < row_end) |
| { |
| OffsetT num_remaining = row_end - tile_offset; |
| |
| if (num_remaining < TILE_SAMPLES) |
| { |
| // Consume partial tile |
| ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining); |
| break; |
| } |
| |
| // Consume full tile |
| ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES); |
| tile_offset += gridDim.x * TILE_SAMPLES; |
| } |
| } |
| } |
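| |
| // Even-share example (illustrative): with gridDim.x = 4, the block with |
| // blockIdx.x = 0 consumes the tiles at sample offsets 0, 4 * TILE_SAMPLES, |
| // 8 * TILE_SAMPLES, ... within each row of its blockIdx.y stripe. |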
| |
|
|
| //--------------------------------------------------------------------- |
| // Parameter extraction |
| //--------------------------------------------------------------------- |
| |
| // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) |
| template < |
| CacheLoadModifier _MODIFIER, |
| typename _ValueT, |
| typename _OffsetT> |
| __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) |
| { |
| return itr.ptr; |
| } |
| |
| // Return a native pixel pointer (specialized for other types) |
| template <typename IteratorT> |
| __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) |
| { |
| return NULL; |
| } |
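| |
| // Dispatch example (illustrative): when SampleIteratorT is a native pointer, |
| // d_wrapped_samples is a CacheModifiedInputIterator and the first overload |
| // returns its underlying ptr member; for any other fancy iterator the |
| // fallback returns NULL, which disables the vectorized pixel/quad load paths. |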
| |
|
|
|
|
| //--------------------------------------------------------------------- |
| // Interface |
| //--------------------------------------------------------------------- |
| |
|
|
| /** |
| * Constructor |
| */ |
| __device__ __forceinline__ AgentHistogram( |
| TempStorage &temp_storage, ///< Reference to temp_storage |
| SampleIteratorT d_samples, ///< Input data to reduce |
| int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number of bins per final output histogram |
| int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number of bins per privatized histogram |
| CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms |
| CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms |
| OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel |
| PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel |
| : |
| temp_storage(temp_storage.Alias()), |
| d_wrapped_samples(d_samples), |
| d_native_samples(NativePointer(d_wrapped_samples)), |
| num_output_bins(num_output_bins), |
| num_privatized_bins(num_privatized_bins), |
| d_output_histograms(d_output_histograms), |
| output_decode_op(output_decode_op), |
| privatized_decode_op(privatized_decode_op), |
| prefer_smem((MEM_PREFERENCE == SMEM) ? |
| true : // prefer smem privatized histograms |
| (MEM_PREFERENCE == GMEM) ? |
| false : // prefer gmem privatized histograms |
| blockIdx.x & 1) // prefer blended privatized histograms |
| { |
| int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; |
| |
| // Initialize the locations of this block's privatized histograms |
| for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) |
| this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); |
| } |
| |
|
|
| /** |
| * Consume image |
| */ |
| __device__ __forceinline__ void ConsumeTiles( |
| OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest |
| OffsetT num_rows, ///< The number of rows in the region of interest |
| OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest |
| int tiles_per_row, ///< Number of image tiles per row |
| GridQueue<int> tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks |
| { |
| // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) |
| int quad_mask = AlignBytes<QuadT>::ALIGN_BYTES - 1; |
| int pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1; |
| size_t row_bytes = sizeof(SampleT) * row_stride_samples; |
| |
| bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel |
| ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned |
| ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // row stride in bytes is quad-aligned (or there is only a single row) |
| |
| bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel |
| ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned |
| ((row_bytes & pixel_mask) == 0)); // row stride in bytes is pixel-aligned |
| |
| // Whether rows are aligned and can be vectorized |
| if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) |
| ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>()); |
| else |
| ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>()); |
| } |
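| |
| // Alignment example (illustrative): for unsigned char samples with |
| // NUM_CHANNELS = 4, PixelT is uchar4 (4-byte alignment), so rows qualify for |
| // vectorized pixel loads when both the base pointer and the row stride in |
| // bytes are multiples of 4. |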
| |
|
|
| /** |
| * Initialize privatized bin counters (dispatches to the shared-memory or global-memory specialization per this block's preference) |
| */ |
| __device__ __forceinline__ void InitBinCounters() |
| { |
| if (prefer_smem) |
| InitSmemBinCounters(); |
| else |
| InitGmemBinCounters(); |
| } |
| |
|
|
| /** |
| * Store privatized histograms to the final output histograms in device-accessible memory (dispatches to the shared-memory or global-memory specialization per this block's preference) |
| */ |
| __device__ __forceinline__ void StoreOutput() |
| { |
| if (prefer_smem) |
| StoreSmemOutput(); |
| else |
| StoreGmemOutput(); |
| } |
| |
|
|
| }; |
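| |
| // Typical lifecycle (illustrative sketch, assuming a dispatch-layer kernel that |
| // has already computed bin counts, decode operators, and a tile queue; the |
| // kernel name and parameter list below are hypothetical): |
| // |
| //     template <typename AgentHistogramT, ...> |
| //     __global__ void ExampleHistogramSweepKernel(...) |
| //     { |
| //         __shared__ typename AgentHistogramT::TempStorage temp_storage; |
| // |
| //         AgentHistogramT agent( |
| //             temp_storage, d_samples, |
| //             num_output_bins, num_privatized_bins, |
| //             d_output_histograms, d_privatized_histograms, |
| //             output_decode_op, privatized_decode_op); |
| // |
| //         agent.InitBinCounters();    // zero this block's privatized counters |
| //         agent.ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, |
| //                            tiles_per_row, tile_queue); |
| //         agent.StoreOutput();        // fold privatized counts into the output |
| //     } |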
|
|
| CUB_NAMESPACE_END |
|
|