// NOTE: removed extraction artifacts ("Spaces:" / "Runtime error" lines) that were not part of the source
| /****************************************************************************** | |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| //--------------------------------------------------------------------- | |
| // SpMV comparison tool | |
| //--------------------------------------------------------------------- | |
| #include <stdio.h> | |
| #include <map> | |
| #include <vector> | |
| #include <algorithm> | |
| #include <cstdio> | |
| #include <fstream> | |
| #include <cusparse.h> | |
| #include "sparse_matrix.h" | |
| // Ensure printing of CUDA runtime errors to console | |
| #define CUB_STDERR | |
| #include <cub/device/device_spmv.cuh> | |
| #include <cub/util_allocator.cuh> | |
| #include <cub/iterator/tex_ref_input_iterator.cuh> | |
| #include <test/test_util.h> | |
| using namespace cub; | |
| //--------------------------------------------------------------------- | |
| // Globals, constants, and type declarations | |
| //--------------------------------------------------------------------- | |
bool g_quiet = false;           // Whether to suppress status text and emit stats as CSV fields
bool g_verbose = false;         // Whether to display output to console
bool g_verbose2 = false;        // Whether to display input to console
CachingDeviceAllocator g_allocator(true);   // Caching allocator for device memory (reused across tests)
| //--------------------------------------------------------------------- | |
| // SpMV verification | |
| //--------------------------------------------------------------------- | |
/**
 * Compute the reference (host-side) SpMV result:
 *   vector_y_out = alpha * A * vector_x + beta * vector_y_in
 * for a CSR matrix, processed one row at a time.
 */
template <
    typename ValueT,
    typename OffsetT>
void SpmvGold(
    CsrMatrix<ValueT, OffsetT>& a,
    ValueT* vector_x,
    ValueT* vector_y_in,
    ValueT* vector_y_out,
    ValueT alpha,
    ValueT beta)
{
    for (OffsetT row = 0; row < a.num_rows; ++row)
    {
        // Seed the accumulator with the scaled prior value of y for this row
        ValueT accum = beta * vector_y_in[row];

        // Accumulate alpha * A[row,col] * x[col] over this row's nonzeros
        OffsetT row_begin = a.row_offsets[row];
        OffsetT row_end   = a.row_offsets[row + 1];
        for (OffsetT nz = row_begin; nz < row_end; ++nz)
        {
            accum += alpha * a.values[nz] * vector_x[a.column_indices[nz]];
        }

        vector_y_out[row] = accum;
    }
}
| //--------------------------------------------------------------------- | |
| // GPU I/O proxy | |
| //--------------------------------------------------------------------- | |
/**
 * Read every matrix nonzero value, read every corresponding vector value
 *
 * I/O proxy kernel that models the raw memory traffic of CSR SpMV: each tile
 * of BLOCK_THREADS * ITEMS_PER_THREAD nonzeros is streamed in (values plus
 * column indices), the matching x-vector entries are gathered, and one value
 * per covered row is conditionally written to y.  The numeric result is not
 * a real SpMV -- the writes exist only to keep the loads live.
 *
 * Launch: 1-D grid, BLOCK_THREADS threads per block, one tile per block.
 * NOTE(review): the d_vector_x iterator parameter appears unused here;
 * vector reads go through params.d_vector_x via LDG loads instead.
 */
template <
    int BLOCK_THREADS,
    int ITEMS_PER_THREAD,
    typename ValueT,
    typename OffsetT,
    typename VectorItr>
__launch_bounds__ (int(BLOCK_THREADS))
__global__ void NonZeroIoKernel(
    SpmvParams<ValueT, OffsetT> params,
    VectorItr d_vector_x)
{
    enum
    {
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Per-thread accumulator (its use below keeps the loads from being elided)
    ValueT nonzero = 0.0;

    int tile_idx = blockIdx.x;

    OffsetT block_offset = tile_idx * TILE_ITEMS;

    // Per-thread staging of column indices and values, strided across the block
    OffsetT column_indices[ITEMS_PER_THREAD];
    ValueT values[ITEMS_PER_THREAD];

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        OffsetT nonzero_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;

        OffsetT* ci = params.d_column_indices + nonzero_idx;
        ValueT* a = params.d_values + nonzero_idx;

        // Guard the tail tile: out-of-range lanes substitute index 0 / value 0.0
        column_indices[ITEM] = (nonzero_idx < params.num_nonzeros) ? *ci : 0;
        values[ITEM] = (nonzero_idx < params.num_nonzeros) ? *a : 0.0;
    }

    __syncthreads();

    // Read vector (gather x entries through the read-only cache via LDG)
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        ValueT vector_value = ThreadLoad<LOAD_LDG>(params.d_vector_x + column_indices[ITEM]);
        nonzero += vector_value * values[ITEM];
    }

    __syncthreads();

    // Touch the row-offsets array and write one value per covered row
    if (block_offset < params.num_rows)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            OffsetT row_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
            if (row_idx < params.num_rows)
            {
                OffsetT row_end_offset = ThreadLoad<LOAD_DEFAULT>(params.d_row_end_offsets + row_idx);

                // (nonzero == nonzero) is false only for NaN -- this data-dependent
                // predicate prevents the compiler from optimizing the loads away
                if ((row_end_offset >= 0) && (nonzero == nonzero))
                    params.d_vector_y[row_idx] = nonzero;
            }
        }
    }
}
/**
 * Run GPU I/O proxy
 *
 * Launches NonZeroIoKernel over \p timing_iterations iterations and returns
 * the average elapsed milliseconds per launch.  Binds the x vector to a
 * texture-reference iterator before launching (the kernel itself appears to
 * load x through params.d_vector_x -- see NonZeroIoKernel).
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuCsrIoProxy(
    SpmvParams<ValueT, OffsetT>& params,
    int timing_iterations)
{
    // Tile shape: 128 threads x 7 items = 896 nonzeros per block
    enum {
        BLOCK_THREADS = 128,
        ITEMS_PER_THREAD = 7,
        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // size_t smem = 1024 * 16;
    size_t smem = 1024 * 0;     // no dynamic shared memory

    // Enough blocks to cover all nonzeros AND all rows
    unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
    unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
    unsigned int blocks = std::max(nonzero_blocks, row_blocks);

    // Bind x to a texture-reference iterator (unique id 1234)
    typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
    TexItr x_itr;
    CubDebugExit(x_itr.BindTexture(params.d_vector_x));

    // Get device ordinal
    int device_ordinal;
    CubDebugExit(cudaGetDevice(&device_ordinal));

    // Get device SM version
    int sm_version;
    CubDebugExit(SmVersion(sm_version, device_ordinal));

    // Query theoretical SM occupancy for this launch configuration
    void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;

    int spmv_sm_occupancy;
    CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));

    if (!g_quiet)
        printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);

    // Warmup
    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);

    // Check for failures (launch error, then sync to surface execution errors)
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(SyncStream(0));

    // Timing: a single Start/Stop brackets the whole loop so per-launch
    // overhead is amortized across timing_iterations launches
    GpuTimer timer;
    float elapsed_millis = 0.0;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    CubDebugExit(x_itr.UnbindTexture());

    return elapsed_millis / timing_iterations;
}
| //--------------------------------------------------------------------- | |
| // cuSparse HybMV | |
| //--------------------------------------------------------------------- | |
/**
 * Run cuSparse HYB SpMV (specialized for fp32)
 *
 * Converts the CSR matrix to cuSparse HYB format (conversion time is printed
 * as "HYB setup ms"), validates one SpMV against the host reference, then
 * returns the average time in milliseconds per SpMV over timing_iterations
 * invocations.
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    float* vector_y_in,
    float* reference_vector_y_out,
    SpmvParams<float, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    // Time the CSR -> HYB conversion on the CPU clock
    CpuTimer setup_timer;
    setup_timer.Start();

    cusparseMatDescr_t matrix_desc;
    cusparseHybMat_t hyb_matrix;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&matrix_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_matrix));

    cusparseStatus_t conversion_status = cusparseScsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        matrix_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_matrix,
        0,
        CUSPARSE_HYB_PARTITION_AUTO);
    AssertEquals(CUSPARSE_STATUS_SUCCESS, conversion_status);
    cudaDeviceSynchronize();

    setup_timer.Stop();
    float elapsed_millis = setup_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);

    // Restore the initial y vector before the verification run
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup run (its output is checked for correctness below)
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, matrix_desc,
        hyb_matrix,
        params.d_vector_x, &params.beta, params.d_vector_y));

    // Verify against the host reference unless running quietly
    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timed runs
    elapsed_millis = 0.0;
    GpuTimer gpu_timer;
    gpu_timer.Start();
    for (int iteration = 0; iteration < timing_iterations; ++iteration)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, matrix_desc,
            hyb_matrix,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    gpu_timer.Stop();
    elapsed_millis += gpu_timer.ElapsedMillis();

    // Release cuSparse resources
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_matrix));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(matrix_desc));

    return elapsed_millis / timing_iterations;
}
/**
 * Run cuSparse HYB SpMV (specialized for fp64)
 *
 * Converts the CSR matrix to cuSparse HYB format (conversion time is printed
 * as "HYB setup ms"), validates one SpMV against the host reference, then
 * returns the average time in milliseconds per SpMV over timing_iterations
 * invocations.
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    double* vector_y_in,
    double* reference_vector_y_out,
    SpmvParams<double, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        mat_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_desc,
        0,
        CUSPARSE_HYB_PARTITION_AUTO));
    cudaDeviceSynchronize();

    cpu_timer.Stop();
    float elapsed_millis = cpu_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);

    // Reset input/output vector y
    // BUG FIX: was sizeof(float); that copied only half of the fp64 vector,
    // leaving the upper rows of d_vector_y stale before the verification run
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(double) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup (its output is checked for correctness below)
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, mat_desc,
        hyb_desc,
        params.d_vector_x, &params.beta, params.d_vector_y));

    // Verify against the host reference unless running quietly
    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}
| //--------------------------------------------------------------------- | |
| // cuSparse CsrMV | |
| //--------------------------------------------------------------------- | |
/**
 * Run cuSparse SpMV (specialized for fp32)
 *
 * Validates one CSR SpMV against the host reference, then returns the
 * average time in milliseconds per SpMV over timing_iterations invocations.
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    float* vector_y_in,
    float* reference_vector_y_out,
    SpmvParams<float, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    cusparseMatDescr_t matrix_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&matrix_desc));

    // Restore the initial y vector before the verification run
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup run (its output is checked for correctness below)
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, matrix_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, &params.beta, params.d_vector_y));

    // Verify against the host reference unless running quietly
    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timed runs
    float elapsed_millis = 0.0;
    GpuTimer gpu_timer;
    gpu_timer.Start();
    for (int iteration = 0; iteration < timing_iterations; ++iteration)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, matrix_desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    gpu_timer.Stop();
    elapsed_millis += gpu_timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(matrix_desc));

    return elapsed_millis / timing_iterations;
}
/**
 * Run cuSparse SpMV (specialized for fp64)
 *
 * Validates one CSR SpMV against the host reference, then returns the
 * average time in milliseconds per SpMV over timing_iterations invocations.
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    double* vector_y_in,
    double* reference_vector_y_out,
    SpmvParams<double, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    cusparseMatDescr_t desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));

    // Reset input/output vector y
    // BUG FIX: was sizeof(float); that copied only half of the fp64 vector,
    // leaving the upper rows of d_vector_y stale before the verification run
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(double) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup (its output is checked for correctness below)
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, &params.beta, params.d_vector_y));

    // Verify against the host reference unless running quietly
    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));

    return elapsed_millis / timing_iterations;
}
| //--------------------------------------------------------------------- | |
| // GPU Merge-based SpMV | |
| //--------------------------------------------------------------------- | |
/**
 * Run CUB SpMV
 *
 * Sizes and allocates temporary storage for cub::DeviceSpmv::CsrMV, validates
 * one SpMV against the host reference, times timing_iterations invocations,
 * and returns the average time in milliseconds per SpMV.
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuMergeCsrmv(
    ValueT* vector_y_in,
    ValueT* reference_vector_y_out,
    SpmvParams<ValueT, OffsetT>& params,
    int timing_iterations)
{
    // Allocate temporary storage
    size_t temp_storage_bytes = 0;
    void *d_temp_storage = NULL;

    // Get amount of temporary storage needed (NULL storage => size query)
    CubDebugExit(DeviceSpmv::CsrMV(
        d_temp_storage, temp_storage_bytes,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, params.d_vector_y,
        params.num_rows, params.num_cols, params.num_nonzeros,
        // params.alpha, params.beta,
        (cudaStream_t) 0, false));

    // Allocate
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(ValueT) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup (its output is checked for correctness below)
    CubDebugExit(DeviceSpmv::CsrMV(
        d_temp_storage, temp_storage_bytes,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, params.d_vector_y,
        params.num_rows, params.num_cols, params.num_nonzeros,
        // params.alpha, params.beta,
        (cudaStream_t) 0, !g_quiet));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;
    timer.Start();
    for(int it = 0; it < timing_iterations; ++it)
    {
        CubDebugExit(DeviceSpmv::CsrMV(
            d_temp_storage, temp_storage_bytes,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, params.d_vector_y,
            params.num_rows, params.num_cols, params.num_nonzeros,
            // params.alpha, params.beta,
            (cudaStream_t) 0, false));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // BUG FIX: release the temporary storage (it was previously leaked on
    // every call, accumulating in the caching allocator's live set)
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    return elapsed_millis / timing_iterations;
}
| //--------------------------------------------------------------------- | |
| // Test generation | |
| //--------------------------------------------------------------------- | |
/**
 * Display perf
 *
 * Prints average runtime, effective GFLOPS (2 flops per nonzero), effective
 * bandwidth, and percentage of peak bandwidth for one SpMV variant.
 * Non-quiet mode prints a labeled line; quiet mode appends CSV fields.
 */
template <typename ValueT, typename OffsetT>
void DisplayPerf(
    float device_giga_bandwidth,
    double avg_millis,
    CsrMatrix<ValueT, OffsetT>& csr_matrix)
{
    double nz_throughput, effective_bandwidth;

    // SpMV traffic model: each nonzero moves a value, a column index, and an
    // x-vector value; each row moves a row offset and a y-vector value
    size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
        (csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));

    nz_throughput = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
    effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;

    if (!g_quiet)
        printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s (%.2f%% peak)\n",
            int(sizeof(ValueT) * 8),    // FIX: %d requires int; sizeof() yields size_t (UB on LP64 without the cast)
            avg_millis,
            2 * nz_throughput,
            effective_bandwidth,
            effective_bandwidth / device_giga_bandwidth * 100);
    else
        printf("%.5f, %.6f, %.3lf, %.2f%%, ",
            avg_millis,
            2 * nz_throughput,
            effective_bandwidth,
            effective_bandwidth / device_giga_bandwidth * 100);

    fflush(stdout);
}
/**
 * Run tests
 *
 * Runs the full benchmark suite on one COO matrix: converts to CSR,
 * optionally RCM-relabels, computes a host reference result, stages the
 * problem on the device, then times each requested SpMV implementation
 * (I/O proxy, CUB merge-based, cuSparse CSR, cuSparse HYB).
 */
template <
    typename ValueT,
    typename OffsetT>
void RunTest(
    bool rcm_relabel,
    ValueT alpha,
    ValueT beta,
    CooMatrix<ValueT, OffsetT>& coo_matrix,
    int timing_iterations,
    CommandLineArgs& args)
{
    // Adaptive timing iterations: run 16 billion nonzeros through
    // (clamped to [100, 50000] iterations)
    if (timing_iterations == -1)
        timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));

    if (!g_quiet)
        printf("\t%d timing iterations\n", timing_iterations);

    // Convert to CSR
    CsrMatrix<ValueT, OffsetT> csr_matrix;
    csr_matrix.FromCoo(coo_matrix);

    // Free the COO form unless --csrmv was requested
    // (presumably kept for later use in that mode -- confirm)
    if (!args.CheckCmdLineFlag("csrmv"))
        coo_matrix.Clear();

    // Relabel (displays pre-relabel stats first when not quiet)
    if (rcm_relabel)
    {
        if (!g_quiet)
        {
            csr_matrix.Stats().Display();
            printf("\n");
            csr_matrix.DisplayHistogram();
            printf("\n");
            if (g_verbose2)
                csr_matrix.Display();
            printf("\n");
        }

        RcmRelabel(csr_matrix, !g_quiet);

        if (!g_quiet) printf("\n");
    }

    // Display matrix info
    csr_matrix.Stats().Display(!g_quiet);
    if (!g_quiet)
    {
        printf("\n");
        csr_matrix.DisplayHistogram();
        printf("\n");
        if (g_verbose2)
            csr_matrix.Display();
        printf("\n");
    }
    fflush(stdout);

    // Allocate input and output vectors (x and y_in initialized to all ones)
    ValueT* vector_x = new ValueT[csr_matrix.num_cols];
    ValueT* vector_y_in = new ValueT[csr_matrix.num_rows];
    ValueT* vector_y_out = new ValueT[csr_matrix.num_rows];

    for (int col = 0; col < csr_matrix.num_cols; ++col)
        vector_x[col] = 1.0;

    for (int row = 0; row < csr_matrix.num_rows; ++row)
        vector_y_in[row] = 1.0;

    // Compute reference answer
    SpmvGold(csr_matrix, vector_x, vector_y_in, vector_y_out, alpha, beta);

    float avg_millis;

    // CSV mode: lead each row with device name and precision
    if (g_quiet) {
        printf("%s, %s, ", args.deviceProp.name, (sizeof(ValueT) > 4) ? "fp64" : "fp32"); fflush(stdout);
    }

    // Get GPU device bandwidth (GB/s)
    float device_giga_bandwidth = args.device_giga_bandwidth;

    // Allocate and initialize GPU problem
    SpmvParams<ValueT, OffsetT> params;

    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_values, sizeof(ValueT) * csr_matrix.num_nonzeros));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_row_end_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1)));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_column_indices, sizeof(OffsetT) * csr_matrix.num_nonzeros));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_x, sizeof(ValueT) * csr_matrix.num_cols));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_y, sizeof(ValueT) * csr_matrix.num_rows));

    params.num_rows = csr_matrix.num_rows;
    params.num_cols = csr_matrix.num_cols;
    params.num_nonzeros = csr_matrix.num_nonzeros;
    params.alpha = alpha;
    params.beta = beta;

    // Stage matrix and x vector on the device (y is reset per-test instead)
    CubDebugExit(cudaMemcpy(params.d_values, csr_matrix.values, sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_row_end_offsets, csr_matrix.row_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_column_indices, csr_matrix.column_indices, sizeof(OffsetT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_vector_x, vector_x, sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice));

    // I/O proxy baseline (always runs)
    // NOTE(review): "Prox" looks like a typo for "Proxy" in this CSV label --
    // confirm downstream result parsers before changing it
    if (!g_quiet) printf("\n\n");
    printf("GPU CSR I/O Prox, "); fflush(stdout);
    avg_millis = TestGpuCsrIoProxy(params, timing_iterations);
    DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);

    // CUB merge-based SpMV
    if (args.CheckCmdLineFlag("csrmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("CUB, "); fflush(stdout);
        avg_millis = TestGpuMergeCsrmv(vector_y_in, vector_y_out, params, timing_iterations);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    // Initialize cuSparse
    cusparseHandle_t cusparse;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreate(&cusparse));

    // cuSparse CSR SpMV
    if (args.CheckCmdLineFlag("csrmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("Cusparse CsrMV, "); fflush(stdout);
        avg_millis = TestCusparseCsrmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    // cuSparse HYB SpMV
    if (args.CheckCmdLineFlag("hybmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("Cusparse HybMV, "); fflush(stdout);
        avg_millis = TestCusparseHybmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    // Cleanup
    if (params.d_values) CubDebugExit(g_allocator.DeviceFree(params.d_values));
    if (params.d_row_end_offsets) CubDebugExit(g_allocator.DeviceFree(params.d_row_end_offsets));
    if (params.d_column_indices) CubDebugExit(g_allocator.DeviceFree(params.d_column_indices));
    if (params.d_vector_x) CubDebugExit(g_allocator.DeviceFree(params.d_vector_x));
    if (params.d_vector_y) CubDebugExit(g_allocator.DeviceFree(params.d_vector_y));

    if (vector_x) delete[] vector_x;
    if (vector_y_in) delete[] vector_y_in;
    if (vector_y_out) delete[] vector_y_out;
}
/**
 * Run tests
 *
 * Builds the requested matrix -- either parsed from a matrix-market file or
 * synthetically generated (2D/3D lattice, wheel graph, or dense) -- and runs
 * the benchmark suite on it.  Exits with an error if no input was specified.
 */
template <
    typename ValueT,
    typename OffsetT>
void RunTests(
    bool rcm_relabel,
    ValueT alpha,
    ValueT beta,
    const std::string& mtx_filename,
    int grid2d,
    int grid3d,
    int wheel,
    int dense,
    int timing_iterations,
    CommandLineArgs& args)
{
    // Initialize matrix in COO form
    CooMatrix<ValueT, OffsetT> coo_matrix;

    if (!mtx_filename.empty())
    {
        // Parse matrix market file
        printf("%s, ", mtx_filename.c_str()); fflush(stdout);
        coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);

        // Degenerate inputs are not interesting to benchmark
        if ((coo_matrix.num_rows == 1) || (coo_matrix.num_cols == 1) || (coo_matrix.num_nonzeros == 1))
        {
            if (!g_quiet) printf("Trivial dataset\n");
            exit(0);
        }
    }
    else if (grid2d > 0)
    {
        // Generate 2D lattice
        printf("grid2d_%d, ", grid2d); fflush(stdout);
        coo_matrix.InitGrid2d(grid2d, false);
    }
    else if (grid3d > 0)
    {
        // Generate 3D lattice
        printf("grid3d_%d, ", grid3d); fflush(stdout);
        coo_matrix.InitGrid3d(grid3d, false);
    }
    else if (wheel > 0)
    {
        // Generate wheel graph
        // BUG FIX: the label previously printed grid2d (typically -1 here)
        // instead of the actual wheel parameter
        printf("wheel_%d, ", wheel); fflush(stdout);
        coo_matrix.InitWheel(wheel);
    }
    else if (dense > 0)
    {
        // Generate dense graph
        OffsetT size = 1 << 24; // 16M nnz
        args.GetCmdLineArgument("size", size);

        OffsetT rows = size / dense;
        printf("dense_%d_x_%d, ", rows, dense); fflush(stdout);
        coo_matrix.InitDense(rows, dense);
    }
    else
    {
        fprintf(stderr, "No graph type specified.\n");
        exit(1);
    }

    RunTest(
        rcm_relabel,
        alpha,
        beta,
        coo_matrix,
        timing_iterations,
        args);
}
/**
 * Main
 *
 * Parses command-line options, initializes the device, and dispatches to the
 * fp32 or fp64 test driver.
 */
int main(int argc, char **argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);

    // Print usage and bail out early if requested
    if (args.CheckCmdLineFlag("help"))
    {
        printf(
            "%s "
            "[--csrmv | --hybmv | --bsrmv ] "
            "[--device=<device-id>] "
            "[--quiet] "
            "[--v] "
            "[--i=<timing iterations>] "
            "[--fp64] "
            "[--rcm] "
            "[--alpha=<alpha scalar (default: 1.0)>] "
            "[--beta=<beta scalar (default: 0.0)>] "
            "\n\t"
            "--mtx=<matrix market file> "
            "\n\t"
            "--dense=<cols>"
            "\n\t"
            "--grid2d=<width>"
            "\n\t"
            "--grid3d=<width>"
            "\n\t"
            "--wheel=<spokes>"
            "\n", argv[0]);
        exit(0);
    }

    // Output verbosity flags
    g_verbose  = args.CheckCmdLineFlag("v");
    g_verbose2 = args.CheckCmdLineFlag("v2");
    g_quiet    = args.CheckCmdLineFlag("quiet");

    // Precision and relabeling options
    bool use_fp64    = args.CheckCmdLineFlag("fp64");
    bool rcm_relabel = args.CheckCmdLineFlag("rcm");

    // Problem specification (negative sentinel == "not requested")
    std::string mtx_filename;
    int grid2d            = -1;
    int grid3d            = -1;
    int wheel             = -1;
    int dense             = -1;
    int timing_iterations = -1;
    float alpha           = 1.0;
    float beta            = 0.0;

    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("mtx", mtx_filename);
    args.GetCmdLineArgument("grid2d", grid2d);
    args.GetCmdLineArgument("grid3d", grid3d);
    args.GetCmdLineArgument("wheel", wheel);
    args.GetCmdLineArgument("dense", dense);
    args.GetCmdLineArgument("alpha", alpha);
    args.GetCmdLineArgument("beta", beta);

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Run test(s) at the requested precision
    if (use_fp64)
        RunTests<double, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
    else
        RunTests<float, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);

    CubDebugExit(cudaDeviceSynchronize());
    printf("\n");

    return 0;
}