thrust / install /include /cub /device /dispatch /dispatch_spmv_orig.cuh

thanks to nvidia ❤

0dc1b04 over 2 years ago

35.9 kB


	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
	*/

	#pragma once

	#include <cub/agent/agent_segment_fixup.cuh>
	#include <cub/agent/agent_spmv_orig.cuh>
	#include <cub/agent/single_pass_scan_operators.cuh>
	#include <cub/config.cuh>
	#include <cub/detail/cpp_compatibility.cuh>
	#include <cub/grid/grid_queue.cuh>
	#include <cub/thread/thread_search.cuh>
	#include <cub/util_debug.cuh>
	#include <cub/util_deprecated.cuh>
	#include <cub/util_device.cuh>
	#include <cub/util_math.cuh>
	#include <cub/util_type.cuh>

	#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

	#include <cstdio>
	#include <iterator>

	#include <nv/target>

	CUB_NAMESPACE_BEGIN


	/******************************************************************************
	* SpMV kernel entry points
	*****************************************************************************/

	/**
	* Spmv search kernel. Identifies merge path starting coordinates for each tile.
	*/
	template <
	typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type
	typename ValueT, ///< Matrix and vector value type
	typename OffsetT> ///< Signed integer type for sequence offsets
	__global__ void DeviceSpmv1ColKernel(
	SpmvParams<ValueT, OffsetT> spmv_params) ///< [in] SpMV input parameter bundle
	{
	typedef CacheModifiedInputIterator<
	AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
	ValueT,
	OffsetT>
	VectorValueIteratorT;

	VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);

	int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
	if (row_idx < spmv_params.num_rows)
	{
	OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
	OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1];

	ValueT value = 0.0;
	if (end_nonzero_idx != nonzero_idx)
	{
	value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
	}

	spmv_params.d_vector_y[row_idx] = value;
	}
	}


	/**
	* Spmv search kernel. Identifies merge path starting coordinates for each tile.
	*/
	template <
	typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type
	typename OffsetT, ///< Signed integer type for sequence offsets
	typename CoordinateT, ///< Merge path coordinate type
	typename SpmvParamsT> ///< SpmvParams type
	__global__ void DeviceSpmvSearchKernel(
	int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size)
	CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates
	SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle
	{
	/// Constants
	enum
	{
	BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS,
	ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD,
	TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
	};

	typedef CacheModifiedInputIterator<
	SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
	OffsetT,
	OffsetT>
	RowOffsetsSearchIteratorT;

	// Find the starting coordinate for all tiles (plus the end coordinate of the last one)
	int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
	if (tile_idx < num_merge_tiles + 1)
	{
	OffsetT diagonal = (tile_idx * TILE_ITEMS);
	CoordinateT tile_coordinate;
	CountingInputIterator<OffsetT> nonzero_indices(0);

	// Search the merge path
	MergePathSearch(
	diagonal,
	RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
	nonzero_indices,
	spmv_params.num_rows,
	spmv_params.num_nonzeros,
	tile_coordinate);

	// Output starting offset
	d_tile_coordinates[tile_idx] = tile_coordinate;
	}
	}


	/**
	* Spmv agent entry point
	*/
	template <
	typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type
	typename ScanTileStateT, ///< Tile status interface type
	typename ValueT, ///< Matrix and vector value type
	typename OffsetT, ///< Signed integer type for sequence offsets
	typename CoordinateT, ///< Merge path coordinate type
	bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1
	bool HAS_BETA> ///< Whether the input parameter Beta is 0
	__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
	__global__ void DeviceSpmvKernel(
	SpmvParams<ValueT, OffsetT> spmv_params, ///< [in] SpMV input parameter bundle
	CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates
	KeyValuePair<OffsetT,ValueT>* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
	int num_tiles, ///< [in] Number of merge tiles
	ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel
	int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size)
	{
	// Spmv agent type specialization
	typedef AgentSpmv<
	SpmvPolicyT,
	ValueT,
	OffsetT,
	HAS_ALPHA,
	HAS_BETA>
	AgentSpmvT;

	// Shared memory for AgentSpmv
	__shared__ typename AgentSpmvT::TempStorage temp_storage;

	AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
	d_tile_coordinates,
	d_tile_carry_pairs,
	num_tiles);

	// Initialize fixup tile status
	tile_state.InitializeStatus(num_segment_fixup_tiles);

	}

	template <typename ValueT, ///< Matrix and vector value type
	typename OffsetT, ///< Signed integer type for sequence offsets
	bool HAS_BETA> ///< Whether the input parameter Beta is 0
	__global__ void DeviceSpmvEmptyMatrixKernel(SpmvParams<ValueT, OffsetT> spmv_params)
	{
	const int row = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);

	if (row < spmv_params.num_rows)
	{
	ValueT result = 0.0;

	CUB_IF_CONSTEXPR(HAS_BETA)
	{
	result += spmv_params.beta * spmv_params.d_vector_y[row];
	}

	spmv_params.d_vector_y[row] = result;
	}
	}

	/**
	* Multi-block reduce-by-key sweep kernel entry point
	*/
	template <
	typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type
	typename PairsInputIteratorT, ///< Random-access input iterator type for keys
	typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
	typename OffsetT, ///< Signed integer type for global offsets
	typename ScanTileStateT> ///< Tile status interface type
	__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
	__global__ void DeviceSegmentFixupKernel(
	PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
	AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates
	OffsetT num_items, ///< [in] Total number of items to select from
	int num_tiles, ///< [in] Total number of tiles for the entire problem
	ScanTileStateT tile_state) ///< [in] Tile status interface
	{
	// Thread block type for reducing tiles of value segments
	typedef AgentSegmentFixup<
	AgentSegmentFixupPolicyT,
	PairsInputIteratorT,
	AggregatesOutputIteratorT,
	cub::Equality,
	cub::Sum,
	OffsetT>
	AgentSegmentFixupT;

	// Shared memory for AgentSegmentFixup
	__shared__ typename AgentSegmentFixupT::TempStorage temp_storage;

	// Process tiles
	AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
	num_items,
	num_tiles,
	tile_state);
	}


	/******************************************************************************
	* Dispatch
	******************************************************************************/

	/**
	* Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
	*/
	template <
	typename ValueT, ///< Matrix and vector value type
	typename OffsetT> ///< Signed integer type for global offsets
	struct DispatchSpmv
	{
	//---------------------------------------------------------------------
	// Constants and Types
	//---------------------------------------------------------------------

	enum
	{
	INIT_KERNEL_THREADS = 128,
	EMPTY_MATRIX_KERNEL_THREADS = 128
	};

	// SpmvParams bundle type
	typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;

	// 2D merge path coordinate type
	typedef typename CubVector<OffsetT, 2>::Type CoordinateT;

	// Tile status descriptor interface type
	typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;

	// Tuple type for scanning (pairs accumulated segment-value with segment-index)
	typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;


	//---------------------------------------------------------------------
	// Tuning policies
	//---------------------------------------------------------------------

	/// SM35
	struct Policy350
	{
	typedef AgentSpmvPolicy<
	(sizeof(ValueT) > 4) ? 96 : 128,
	(sizeof(ValueT) > 4) ? 4 : 7,
	LOAD_LDG,
	LOAD_CA,
	LOAD_LDG,
	LOAD_LDG,
	LOAD_LDG,
	(sizeof(ValueT) > 4) ? true : false,
	BLOCK_SCAN_WARP_SCANS>
	SpmvPolicyT;

	typedef AgentSegmentFixupPolicy<
	128,
	3,
	BLOCK_LOAD_VECTORIZE,
	LOAD_LDG,
	BLOCK_SCAN_WARP_SCANS>
	SegmentFixupPolicyT;
	};


	/// SM37
	struct Policy370
	{

	typedef AgentSpmvPolicy<
	(sizeof(ValueT) > 4) ? 128 : 128,
	(sizeof(ValueT) > 4) ? 9 : 14,
	LOAD_LDG,
	LOAD_CA,
	LOAD_LDG,
	LOAD_LDG,
	LOAD_LDG,
	false,
	BLOCK_SCAN_WARP_SCANS>
	SpmvPolicyT;

	typedef AgentSegmentFixupPolicy<
	128,
	3,
	BLOCK_LOAD_VECTORIZE,
	LOAD_LDG,
	BLOCK_SCAN_WARP_SCANS>
	SegmentFixupPolicyT;
	};

	/// SM50
	struct Policy500
	{
	typedef AgentSpmvPolicy<
	(sizeof(ValueT) > 4) ? 64 : 128,
	(sizeof(ValueT) > 4) ? 6 : 7,
	LOAD_LDG,
	LOAD_DEFAULT,
	(sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
	(sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
	LOAD_LDG,
	(sizeof(ValueT) > 4) ? true : false,
	(sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
	SpmvPolicyT;


	typedef AgentSegmentFixupPolicy<
	128,
	3,
	BLOCK_LOAD_VECTORIZE,
	LOAD_LDG,
	BLOCK_SCAN_RAKING_MEMOIZE>
	SegmentFixupPolicyT;
	};


	/// SM60
	struct Policy600
	{
	typedef AgentSpmvPolicy<
	(sizeof(ValueT) > 4) ? 64 : 128,
	(sizeof(ValueT) > 4) ? 5 : 7,
	LOAD_DEFAULT,
	LOAD_DEFAULT,
	LOAD_DEFAULT,
	LOAD_DEFAULT,
	LOAD_DEFAULT,
	false,
	BLOCK_SCAN_WARP_SCANS>
	SpmvPolicyT;


	typedef AgentSegmentFixupPolicy<
	128,
	3,
	BLOCK_LOAD_DIRECT,
	LOAD_LDG,
	BLOCK_SCAN_WARP_SCANS>
	SegmentFixupPolicyT;
	};



	//---------------------------------------------------------------------
	// Tuning policies of current PTX compiler pass
	//---------------------------------------------------------------------

	#if (CUB_PTX_ARCH >= 600)
	typedef Policy600 PtxPolicy;

	#elif (CUB_PTX_ARCH >= 500)
	typedef Policy500 PtxPolicy;

	#elif (CUB_PTX_ARCH >= 370)
	typedef Policy370 PtxPolicy;

	#else
	typedef Policy350 PtxPolicy;

	#endif

	// "Opaque" policies (whose parameterizations aren't reflected in the type signature)
	struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
	struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};


	//---------------------------------------------------------------------
	// Utilities
	//---------------------------------------------------------------------

	/**
	* Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
	*/
	template <typename KernelConfig>
	CUB_RUNTIME_FUNCTION __forceinline__
	static void InitConfigs(
	int ptx_version,
	KernelConfig &spmv_config,
	KernelConfig &segment_fixup_config)
	{
	NV_IF_TARGET(
	NV_IS_DEVICE,
	( // We're on the device, so initialize the kernel dispatch
	// configurations with the current PTX policy
	spmv_config.template Init<PtxSpmvPolicyT>();
	segment_fixup_config.template Init<PtxSegmentFixupPolicy>();),
	(
	// We're on the host, so lookup and initialize the kernel dispatch
	// configurations with the policies that match the device's PTX
	// version
	if (ptx_version >= 600) {
	spmv_config.template Init<typename Policy600::SpmvPolicyT>();
	segment_fixup_config
	.template Init<typename Policy600::SegmentFixupPolicyT>();
	} else if (ptx_version >= 500) {
	spmv_config.template Init<typename Policy500::SpmvPolicyT>();
	segment_fixup_config
	.template Init<typename Policy500::SegmentFixupPolicyT>();
	} else if (ptx_version >= 370) {
	spmv_config.template Init<typename Policy370::SpmvPolicyT>();
	segment_fixup_config
	.template Init<typename Policy370::SegmentFixupPolicyT>();
	} else {
	spmv_config.template Init<typename Policy350::SpmvPolicyT>();
	segment_fixup_config
	.template Init<typename Policy350::SegmentFixupPolicyT>();
	}));
	}


	/**
	* Kernel kernel dispatch configuration.
	*/
	struct KernelConfig
	{
	int block_threads;
	int items_per_thread;
	int tile_items;

	template <typename PolicyT>
	CUB_RUNTIME_FUNCTION __forceinline__
	void Init()
	{
	block_threads = PolicyT::BLOCK_THREADS;
	items_per_thread = PolicyT::ITEMS_PER_THREAD;
	tile_items = block_threads * items_per_thread;
	}
	};


	//---------------------------------------------------------------------
	// Dispatch entrypoints
	//---------------------------------------------------------------------

	/**
	* Internal dispatch routine for computing a device-wide reduction using the
	* specified kernel functions.
	*
	* If the input is larger than a single tile, this method uses two-passes of
	* kernel invocations.
	*/
	template <
	typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel
	typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel
	typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel
	typename SegmentFixupKernelT, ///< Function type of cub::DeviceSegmentFixupKernelT
	typename SpmvEmptyMatrixKernelT> ///< Function type of cub::DeviceSpmvEmptyMatrixKernel
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t Dispatch(
	void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
	SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
	cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
	SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
	SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
	SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
	SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel
	KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
	KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
	{
	cudaError error = cudaSuccess;
	do
	{
	if (spmv_params.num_rows < 0 \|\| spmv_params.num_cols < 0)
	{
	return cudaErrorInvalidValue;
	}

	if (spmv_params.num_rows == 0 \|\| spmv_params.num_cols == 0)
	{ // Empty problem, no-op.
	if (d_temp_storage == NULL)
	{
	temp_storage_bytes = 1;
	}

	break;
	}

	if (spmv_params.num_nonzeros == 0)
	{
	if (d_temp_storage == NULL)
	{
	// Return if the caller is simply requesting the size of the storage allocation
	temp_storage_bytes = 1;
	break;
	}

	const int threads_in_block = EMPTY_MATRIX_KERNEL_THREADS;
	const int blocks_in_grid = cub::DivideAndRoundUp(spmv_params.num_rows,
	threads_in_block);

	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking spmv_empty_matrix_kernel<<<%d, %d, 0, %lld>>>()\n",
	blocks_in_grid,
	threads_in_block,
	(long long)stream);
	#endif
	error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid,
	threads_in_block,
	0,
	stream)
	.doit(spmv_empty_matrix_kernel, spmv_params);

	if (CubDebug(error))
	{
	break;
	}

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}

	break;
	}

	if (spmv_params.num_cols == 1)
	{
	if (d_temp_storage == NULL)
	{
	// Return if the caller is simply requesting the size of the storage allocation
	temp_storage_bytes = 1;
	break;
	}

	// Get search/init grid dims
	int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
	int degen_col_kernel_grid_size = cub::DivideAndRoundUp(spmv_params.num_rows, degen_col_kernel_block_size);

	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
	degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
	#endif

	// Invoke spmv_search_kernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	degen_col_kernel_grid_size, degen_col_kernel_block_size, 0,
	stream
	).doit(spmv_1col_kernel,
	spmv_params);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError()))
	{
	break;
	}

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}

	break;
	}

	// Get device ordinal
	int device_ordinal;
	if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

	// Get SM count
	int sm_count;
	if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

	// Get max x-dimension of grid
	int max_dim_x;
	if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

	// Total number of spmv work items
	int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;

	// Tile sizes of kernels
	int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread;
	int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;

	// Number of tiles for kernels
	int num_merge_tiles = cub::DivideAndRoundUp(num_merge_items, merge_tile_size);
	int num_segment_fixup_tiles = cub::DivideAndRoundUp(num_merge_tiles, segment_fixup_tile_size);

	// Get SM occupancy for kernels
	int spmv_sm_occupancy;
	if (CubDebug(error = MaxSmOccupancy(
	spmv_sm_occupancy,
	spmv_kernel,
	spmv_config.block_threads))) break;

	int segment_fixup_sm_occupancy;
	if (CubDebug(error = MaxSmOccupancy(
	segment_fixup_sm_occupancy,
	segment_fixup_kernel,
	segment_fixup_config.block_threads))) break;

	// Get grid dimensions
	dim3 spmv_grid_size(
	CUB_MIN(num_merge_tiles, max_dim_x),
	cub::DivideAndRoundUp(num_merge_tiles, max_dim_x),
	1);

	dim3 segment_fixup_grid_size(
	CUB_MIN(num_segment_fixup_tiles, max_dim_x),
	cub::DivideAndRoundUp(num_segment_fixup_tiles, max_dim_x),
	1);

	// Get the temporary storage allocation requirements
	size_t allocation_sizes[3];
	if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors
	allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs
	allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates

	// Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
	void* allocations[3] = {};
	if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
	if (d_temp_storage == NULL)
	{
	// Return if the caller is simply requesting the size of the storage allocation
	break;
	}

	// Construct the tile status interface
	ScanTileStateT tile_state;
	if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;

	// Alias the other allocations
	KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs
	CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates

	// Get search/init grid dims
	int search_block_size = INIT_KERNEL_THREADS;
	int search_grid_size = cub::DivideAndRoundUp(num_merge_tiles + 1, search_block_size);

	if (search_grid_size < sm_count)
	// if (num_merge_tiles < spmv_sm_occupancy * sm_count)
	{
	// Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
	d_tile_coordinates = NULL;
	}
	else
	{
	// Use separate search kernel if we have enough spmv tiles to saturate the device

	// Log spmv_search_kernel configuration
	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
	search_grid_size, search_block_size, (long long) stream);
	#endif

	// Invoke spmv_search_kernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	search_grid_size, search_block_size, 0, stream
	).doit(spmv_search_kernel,
	num_merge_tiles,
	d_tile_coordinates,
	spmv_params);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError())) break;

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}
	}

	// Log spmv_kernel configuration
	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
	spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
	#endif

	// Invoke spmv_kernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	spmv_grid_size, spmv_config.block_threads, 0, stream
	).doit(spmv_kernel,
	spmv_params,
	d_tile_coordinates,
	d_tile_carry_pairs,
	num_merge_tiles,
	tile_state,
	num_segment_fixup_tiles);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError())) break;

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}

	// Run reduce-by-key fixup if necessary
	if (num_merge_tiles > 1)
	{
	// Log segment_fixup_kernel configuration
	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
	segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
	#endif

	// Invoke segment_fixup_kernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	segment_fixup_grid_size, segment_fixup_config.block_threads,
	0, stream
	).doit(segment_fixup_kernel,
	d_tile_carry_pairs,
	spmv_params.d_vector_y,
	num_merge_tiles,
	num_segment_fixup_tiles,
	tile_state);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError())) break;

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}
	}
	}
	while (0);

	return error;
	}

	template <typename Spmv1ColKernelT,
	typename SpmvSearchKernelT,
	typename SpmvKernelT,
	typename SegmentFixupKernelT,
	typename SpmvEmptyMatrixKernelT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Dispatch(void *d_temp_storage,
	size_t &temp_storage_bytes,
	SpmvParamsT &spmv_params,
	cudaStream_t stream,
	bool debug_synchronous,
	Spmv1ColKernelT spmv_1col_kernel,
	SpmvSearchKernelT spmv_search_kernel,
	SpmvKernelT spmv_kernel,
	SegmentFixupKernelT segment_fixup_kernel,
	SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel,
	KernelConfig spmv_config,
	KernelConfig segment_fixup_config)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Dispatch<Spmv1ColKernelT,
	SpmvSearchKernelT,
	SpmvKernelT,
	SegmentFixupKernelT,
	SpmvEmptyMatrixKernelT>(d_temp_storage,
	temp_storage_bytes,
	spmv_params,
	stream,
	spmv_1col_kernel,
	spmv_search_kernel,
	spmv_kernel,
	segment_fixup_kernel,
	spmv_empty_matrix_kernel,
	spmv_config,
	segment_fixup_config);
	}

	/**
	* Internal dispatch routine for computing a device-wide reduction
	*/
	CUB_RUNTIME_FUNCTION __forceinline__
	static cudaError_t Dispatch(
	void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
	size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
	SpmvParamsT& spmv_params, ///< SpMV input parameter bundle
	cudaStream_t stream = 0) ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	{
	cudaError error = cudaSuccess;
	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version))) break;

	// Get kernel kernel dispatch configurations
	KernelConfig spmv_config, segment_fixup_config;
	InitConfigs(ptx_version, spmv_config, segment_fixup_config);

	constexpr bool has_alpha = false;
	constexpr bool has_beta = false;

	if (CubDebug(error = Dispatch(
	d_temp_storage, temp_storage_bytes, spmv_params, stream,
	DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
	DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
	DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, has_alpha, has_beta>,
	DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT, ValueT, OffsetT, ScanTileStateT>,
	DeviceSpmvEmptyMatrixKernel<ValueT, OffsetT, has_beta>,
	spmv_config, segment_fixup_config))) break;

	}
	while (0);

	return error;
	}

	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Dispatch(void *d_temp_storage,
	size_t &temp_storage_bytes,
	SpmvParamsT &spmv_params,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Dispatch(d_temp_storage, temp_storage_bytes, spmv_params, stream);
	}
	};


	CUB_NAMESPACE_END