| /****************************************************************************** |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of the NVIDIA CORPORATION nor the |
| * names of its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| ******************************************************************************/ |
| |
| /****************************************************************************** |
| * Test of BlockLoad and BlockStore utilities |
| ******************************************************************************/ |
| |
| // Ensure printing of CUDA runtime errors to console |
| #define CUB_STDERR |
| |
| #include <iterator> |
| #include <stdio.h> |
| |
| #include <cub/block/block_load.cuh> |
| #include <cub/block/block_store.cuh> |
| #include <cub/iterator/cache_modified_input_iterator.cuh> |
| #include <cub/iterator/cache_modified_output_iterator.cuh> |
| #include <cub/iterator/discard_output_iterator.cuh> |
| #include <cub/util_allocator.cuh> |
| |
| #include "test_util.h" |
| |
| using namespace cub; |
| |
| //--------------------------------------------------------------------- |
| // Globals, constants and typedefs |
| //--------------------------------------------------------------------- |
| |
// Whether to print per-element comparison output (enabled via the --v flag)
bool g_verbose = false;
// Caching allocator for device memory, shared by all tests
// (ctor arg presumably toggles cleanup/verbose behavior — see util_allocator.cuh)
CachingDeviceAllocator g_allocator(true);
| |
| |
| //--------------------------------------------------------------------- |
| // Test kernels |
| //--------------------------------------------------------------------- |
| |
| |
| /** |
| * Test load/store kernel. |
| */ |
| template < |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM, |
| typename InputIteratorT, |
| typename OutputIteratorT> |
| __launch_bounds__ (BLOCK_THREADS, 1) |
| __global__ void Kernel( |
| InputIteratorT d_in, |
| OutputIteratorT d_out_unguarded, |
| OutputIteratorT d_out_guarded, |
| int num_items) |
| { |
| enum |
| { |
| TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD |
| }; |
| |
| // The input value type |
| typedef typename std::iterator_traits<InputIteratorT>::value_type InputT; |
| |
| // The output value type |
| typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? |
| typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type, |
| typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type |
| |
| // Threadblock load/store abstraction types |
| typedef BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad; |
| typedef BlockStore<OutputT, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore; |
| |
| // Shared memory type for this thread block |
| union TempStorage |
| { |
| typename BlockLoad::TempStorage load; |
| typename BlockStore::TempStorage store; |
| }; |
| |
| // Allocate temp storage in shared memory |
| __shared__ TempStorage temp_storage; |
| |
| // Threadblock work bounds |
| int block_offset = blockIdx.x * TILE_SIZE; |
| int guarded_elements = num_items - block_offset; |
| |
| // Tile of items |
| OutputT data[ITEMS_PER_THREAD]; |
| |
| // Load data |
| BlockLoad(temp_storage.load).Load(d_in + block_offset, data); |
| |
| __syncthreads(); |
| |
| // Store data |
| BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data); |
| |
| __syncthreads(); |
| |
| // reset data |
| #pragma unroll |
| for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) |
| data[ITEM] = OutputT(); |
| |
| __syncthreads(); |
| |
| // Load data |
| BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements); |
| |
| __syncthreads(); |
| |
| // Store data |
| BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements); |
| } |
| |
|
|
| //--------------------------------------------------------------------- |
| // Host testing subroutines |
| //--------------------------------------------------------------------- |
|
|
|
|
| /** |
| * Test load/store variants |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM, |
| typename InputIteratorT, |
| typename OutputIteratorT> |
| void TestKernel( |
| T *h_in, |
| InputIteratorT d_in, |
| OutputIteratorT d_out_unguarded_itr, |
| OutputIteratorT d_out_guarded_itr, |
| T *d_out_unguarded_ptr, |
| T *d_out_guarded_ptr, |
| int grid_size, |
| int guarded_elements) |
| { |
| int compare; |
| |
| int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD; |
| |
| // Test with discard output iterator |
| typedef typename std::iterator_traits<InputIteratorT>::difference_type OffsetT; |
| DiscardOutputIterator<OffsetT> discard_itr; |
| |
| Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM> |
| <<<grid_size, BLOCK_THREADS>>>( |
| d_in, |
| discard_itr, |
| discard_itr, |
| guarded_elements); |
| |
| // Test with regular output iterator |
| Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM> |
| <<<grid_size, BLOCK_THREADS>>>( |
| d_in, |
| d_out_unguarded_itr, |
| d_out_guarded_itr, |
| guarded_elements); |
| |
| CubDebugExit(cudaPeekAtLastError()); |
| CubDebugExit(cudaDeviceSynchronize()); |
| |
| // Check results |
| compare = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose); |
| printf("\tGuarded: %s\n", (compare) ? "FAIL" : "PASS"); |
| AssertEquals(0, compare); |
| |
| // Check results |
| compare = CompareDeviceResults(h_in, d_out_unguarded_ptr, unguarded_elements, g_verbose, g_verbose); |
| printf("\tUnguarded: %s\n", (compare) ? "FAIL" : "PASS"); |
| AssertEquals(0, compare); |
| } |
| |
|
|
| /** |
| * Test native pointer. Specialized for sufficient resources |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM> |
| void TestNative( |
| int grid_size, |
| float fraction_valid, |
| Int2Type<true> /*sufficient_resources*/) |
| { |
| int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD; |
| int guarded_elements = int(fraction_valid * float(unguarded_elements)); |
| |
| // Allocate host arrays |
| T *h_in = (T*) malloc(unguarded_elements * sizeof(T)); |
| |
| // Allocate device arrays |
| T *d_in = NULL; |
| T *d_out_unguarded = NULL; |
| T *d_out_guarded = NULL; |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements)); |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements)); |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements)); |
| CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements)); |
| CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements)); |
| |
| // Initialize problem on host and device |
| for (int i = 0; i < unguarded_elements; ++i) |
| { |
| InitValue(INTEGER_SEED, h_in[i], i); |
| } |
| CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice)); |
| |
| printf("TestNative " |
| "grid_size(%d) " |
| "guarded_elements(%d) " |
| "unguarded_elements(%d) " |
| "BLOCK_THREADS(%d) " |
| "ITEMS_PER_THREAD(%d) " |
| "LOAD_ALGORITHM(%d) " |
| "STORE_ALGORITHM(%d) " |
| "sizeof(T)(%d)\n", |
| grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, (int) sizeof(T)); |
| |
| TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>( |
| h_in, |
| (T const *) d_in, // Test const |
| d_out_unguarded, |
| d_out_guarded, |
| d_out_unguarded, |
| d_out_guarded, |
| grid_size, |
| guarded_elements); |
| |
| // Cleanup |
| if (h_in) free(h_in); |
| if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); |
| if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded)); |
| if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded)); |
| } |
| |
|
|
| /** |
| * Test native pointer. Specialized for insufficient resources |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM> |
| void TestNative( |
| int /*grid_size*/, |
| float /*fraction_valid*/, |
| Int2Type<false> /*sufficient_resources*/) |
| {} |
| |
| |
| /** |
| * Test iterator. Specialized for sufficient resources. |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM, |
| CacheLoadModifier LOAD_MODIFIER, |
| CacheStoreModifier STORE_MODIFIER> |
| void TestIterator( |
| int grid_size, |
| float fraction_valid, |
| Int2Type<true> /*sufficient_resources*/) |
| { |
| int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD; |
| int guarded_elements = int(fraction_valid * float(unguarded_elements)); |
| |
| // Allocate host arrays |
| T *h_in = (T*) malloc(unguarded_elements * sizeof(T)); |
| |
| // Allocate device arrays |
| T *d_in = NULL; |
| T *d_out_unguarded = NULL; |
| T *d_out_guarded = NULL; |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements)); |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements)); |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements)); |
| CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements)); |
| CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements)); |
| |
| // Initialize problem on host and device |
| for (int i = 0; i < unguarded_elements; ++i) |
| { |
| InitValue(INTEGER_SEED, h_in[i], i); |
| } |
| CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice)); |
| |
| printf("TestIterator " |
| "grid_size(%d) " |
| "guarded_elements(%d) " |
| "unguarded_elements(%d) " |
| "BLOCK_THREADS(%d) " |
| "ITEMS_PER_THREAD(%d) " |
| "LOAD_ALGORITHM(%d) " |
| "STORE_ALGORITHM(%d) " |
| "LOAD_MODIFIER(%d) " |
| "STORE_MODIFIER(%d) " |
| "sizeof(T)(%d)\n", |
| grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_MODIFIER, STORE_MODIFIER, (int) sizeof(T)); |
| |
| TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>( |
| h_in, |
| CacheModifiedInputIterator<LOAD_MODIFIER, T>(d_in), |
| CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_unguarded), |
| CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_guarded), |
| d_out_unguarded, |
| d_out_guarded, |
| grid_size, |
| guarded_elements); |
| |
| // Cleanup |
| if (h_in) free(h_in); |
| if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); |
| if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded)); |
| if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded)); |
| } |
| |
| /** |
| * Test iterator. Specialized for insufficient resources. |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM, |
| CacheLoadModifier LOAD_MODIFIER, |
| CacheStoreModifier STORE_MODIFIER> |
| void TestIterator( |
| int /*grid_size*/, |
| float /*fraction_valid*/, |
| Int2Type<false> /*sufficient_resources*/) |
| {} |
| |
| |
| /** |
| * Evaluate different pointer access types |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM> |
| void TestPointerType( |
| int grid_size, |
| float fraction_valid) |
| { |
| // Threadblock load/store abstraction types |
| typedef BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad; |
| typedef BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore; |
| |
| #if defined(SM100) || defined(SM110) || defined(SM130) |
| static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 16; |
| static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 16; |
| static const bool sufficient_threads = BLOCK_THREADS <= 512; |
| #else |
| static const bool sufficient_load_smem = sizeof(typename BlockLoad::TempStorage) <= 1024 * 48; |
| static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage) <= 1024 * 48; |
| static const bool sufficient_threads = BLOCK_THREADS <= 1024; |
| #endif |
| |
| static const bool sufficient_resources = sufficient_load_smem && sufficient_store_smem && sufficient_threads; |
| |
| TestNative<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid, Int2Type<sufficient_resources>()); |
| TestIterator<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_DEFAULT, STORE_DEFAULT>(grid_size, fraction_valid, Int2Type<sufficient_resources>()); |
| } |
| |
|
|
| /** |
| * Evaluate different time-slicing strategies |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockLoadAlgorithm LOAD_ALGORITHM, |
| BlockStoreAlgorithm STORE_ALGORITHM> |
| void TestSlicedStrategy( |
| int grid_size, |
| float fraction_valid) |
| { |
| TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, true>(grid_size, fraction_valid); |
| TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, false>(grid_size, fraction_valid); |
| } |
| |
| |
| |
| /** |
| * Evaluate different load/store strategies (specialized for block sizes that are not a multiple of 32) |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD> |
| void TestStrategy( |
| int grid_size, |
| float fraction_valid, |
| Int2Type<false> /*is_warp_multiple*/) |
| { |
| TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid); |
| TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid); |
| TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid); |
| } |
| |
| |
| /** |
| * Evaluate different load/store strategies (specialized for block sizes that are a multiple of 32) |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD> |
| void TestStrategy( |
| int grid_size, |
| float fraction_valid, |
| Int2Type<true> /*is_warp_multiple*/) |
| { |
| TestStrategy<T, BLOCK_THREADS, ITEMS_PER_THREAD>(grid_size, fraction_valid, Int2Type<false>()); |
| TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(grid_size, fraction_valid); |
| TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>(grid_size, fraction_valid); |
| } |
| |
| |
| /** |
| * Evaluate different register blocking |
| */ |
| template < |
| typename T, |
| int BLOCK_THREADS> |
| void TestItemsPerThread( |
| int grid_size, |
| float fraction_valid) |
| { |
| Int2Type<BLOCK_THREADS % 32 == 0> is_warp_multiple; |
| |
| TestStrategy<T, BLOCK_THREADS, 1>(grid_size, fraction_valid, is_warp_multiple); |
| TestStrategy<T, BLOCK_THREADS, 3>(grid_size, fraction_valid, is_warp_multiple); |
| TestStrategy<T, BLOCK_THREADS, 4>(grid_size, fraction_valid, is_warp_multiple); |
| TestStrategy<T, BLOCK_THREADS, 11>(grid_size, fraction_valid, is_warp_multiple); |
| } |
| |
| |
| /** |
| * Evaluate different thread block sizes |
| */ |
| template <typename T> |
| void TestThreads( |
| int grid_size, |
| float fraction_valid) |
| { |
| TestItemsPerThread<T, 15>(grid_size, fraction_valid); |
| TestItemsPerThread<T, 32>(grid_size, fraction_valid); |
| TestItemsPerThread<T, 72>(grid_size, fraction_valid); |
| TestItemsPerThread<T, 96>(grid_size, fraction_valid); |
| TestItemsPerThread<T, 128>(grid_size, fraction_valid); |
| } |
| |
| |
| /** |
| * Main |
| */ |
| int main(int argc, char** argv) |
| { |
| // Initialize command line |
| CommandLineArgs args(argc, argv); |
| g_verbose = args.CheckCmdLineFlag("v"); |
| |
| // Print usage |
| if (args.CheckCmdLineFlag("help")) |
| { |
| printf("%s " |
| "[--device=<device-id>] " |
| "[--v] " |
| "\n", argv[0]); |
| exit(0); |
| } |
| |
| // Initialize device |
| CubDebugExit(args.DeviceInit()); |
| |
| // Get ptx version |
| int ptx_version = 0; |
| CubDebugExit(PtxVersion(ptx_version)); |
| |
| #ifdef QUICK_TEST |
| |
| // Compile/run quick tests |
| TestNative< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(1, 0.8f, Int2Type<true>()); |
| TestIterator< int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(1, 0.8f, Int2Type<true>()); |
| |
| #else |
| |
| // Compile/run thorough tests |
| TestThreads<char>(2, 0.8f); |
| TestThreads<int>(2, 0.8f); |
| TestThreads<long>(2, 0.8f); |
| TestThreads<long2>(2, 0.8f); |
| |
| if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted |
| TestThreads<double2>(2, 0.8f); |
| TestThreads<TestFoo>(2, 0.8f); |
| TestThreads<TestBar>(2, 0.8f); |
| |
| #endif |
|
|
| return 0; |
| } |
| |
|
|
|
|
|
|