Spaces:
Runtime error
Runtime error
| /****************************************************************************** | |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. | |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| /****************************************************************************** | |
| * Test evaluation for caching allocator of device memory | |
| ******************************************************************************/ | |
| // Ensure printing of CUDA runtime errors to console | |
| #define CUB_STDERR | |
| #include <stdio.h> | |
| #include <cub/util_allocator.cuh> | |
| #include "test_util.h" | |
| using namespace cub; | |
| //--------------------------------------------------------------------- | |
| // Main | |
| //--------------------------------------------------------------------- | |
| /** | |
| * Main | |
| */ | |
| int main(int argc, char** argv) | |
| { | |
| // Initialize command line | |
| CommandLineArgs args(argc, argv); | |
| // Print usage | |
| if (args.CheckCmdLineFlag("help")) | |
| { | |
| printf("%s " | |
| "[--device=<device-id>]" | |
| "[--bytes=<timing bytes>]" | |
| "[--i=<timing iterations>]" | |
| "\n", argv[0]); | |
| exit(0); | |
| } | |
| #if (CUB_PTX_ARCH == 0) | |
| // Initialize device | |
| CubDebugExit(args.DeviceInit()); | |
| // Get number of GPUs and current GPU | |
| int num_gpus; | |
| int initial_gpu; | |
| int timing_iterations = 10000; | |
| int timing_bytes = 1024 * 1024; | |
| if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1); | |
| if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1); | |
| args.GetCmdLineArgument("i", timing_iterations); | |
| args.GetCmdLineArgument("bytes", timing_bytes); | |
| // Create default allocator (caches up to 6MB in device allocations per GPU) | |
| CachingDeviceAllocator allocator; | |
| allocator.debug = true; | |
| printf("Running single-gpu tests...\n"); fflush(stdout); | |
| // | |
| // Test0 | |
| // | |
| // Create a new stream | |
| cudaStream_t other_stream; | |
| CubDebugExit(cudaStreamCreate(&other_stream)); | |
| // Allocate 999 bytes on the current gpu in stream0 | |
| char *d_999B_stream0_a; | |
| char *d_999B_stream0_b; | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0)); | |
| // Run some big kernel in stream 0 | |
| EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>(); | |
| // Free d_999B_stream0_a | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); | |
| // Allocate another 999 bytes in stream 0 | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0)); | |
| // Check that that we have 1 live block on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 1); | |
| // Check that that we have no cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 0); | |
| // Run some big kernel in stream 0 | |
| EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>(); | |
| // Free d_999B_stream0_b | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); | |
| // Allocate 999 bytes on the current gpu in other_stream | |
| char *d_999B_stream_other_a; | |
| char *d_999B_stream_other_b; | |
| allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream); | |
| // Check that that we have 1 live blocks on the initial GPU (that we allocated a new one because d_999B_stream0_b is only available for stream 0 until it becomes idle) | |
| AssertEquals(allocator.live_blocks.size(), 1); | |
| // Check that that we have one cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 1); | |
| // Run some big kernel in other_stream | |
| EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>(); | |
| // Free d_999B_stream_other | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); | |
| // Check that we can now use both allocations in stream 0 after synchronizing the device | |
| CubDebugExit(cudaDeviceSynchronize()); | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0)); | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0)); | |
| // Check that that we have 2 live blocks on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 2); | |
| // Check that that we have no cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 0); | |
| // Free d_999B_stream0_a and d_999B_stream0_b | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); | |
| // Check that we can now use both allocations in other_stream | |
| CubDebugExit(cudaDeviceSynchronize()); | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream)); | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream)); | |
| // Check that that we have 2 live blocks on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 2); | |
| // Check that that we have no cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 0); | |
| // Run some big kernel in other_stream | |
| EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>(); | |
| // Free d_999B_stream_other_a and d_999B_stream_other_b | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a)); | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b)); | |
| // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream | |
| CubDebugExit(cudaDeviceSynchronize()); | |
| CubDebugExit(cudaStreamDestroy(other_stream)); | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0)); | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0)); | |
| // Check that that we have 2 live blocks on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 2); | |
| // Check that that we have no cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 0); | |
| // Free d_999B_stream0_a and d_999B_stream0_b | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream0_a)); | |
| CubDebugExit(allocator.DeviceFree(d_999B_stream0_b)); | |
| // Free all cached | |
| CubDebugExit(allocator.FreeAllCached()); | |
| // | |
| // Test1 | |
| // | |
| // Allocate 5 bytes on the current gpu | |
| char *d_5B; | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5)); | |
| // Check that that we have zero free bytes cached on the initial GPU | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, 0); | |
| // Check that that we have 1 live block on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 1); | |
| // | |
| // Test2 | |
| // | |
| // Allocate 4096 bytes on the current gpu | |
| char *d_4096B; | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096)); | |
| // Check that that we have 2 live blocks on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 2); | |
| // | |
| // Test3 | |
| // | |
| // DeviceFree d_5B | |
| CubDebugExit(allocator.DeviceFree(d_5B)); | |
| // Check that that we have min_bin_bytes free bytes cached on the initial gpu | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes); | |
| // Check that that we have 1 live block on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 1); | |
| // Check that that we have 1 cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 1); | |
| // | |
| // Test4 | |
| // | |
| // DeviceFree d_4096B | |
| CubDebugExit(allocator.DeviceFree(d_4096B)); | |
| // Check that that we have the 4096 + min_bin free bytes cached on the initial gpu | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096); | |
| // Check that that we have 0 live block on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 0); | |
| // Check that that we have 2 cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 2); | |
| // | |
| // Test5 | |
| // | |
| // Allocate 768 bytes on the current gpu | |
| char *d_768B; | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768)); | |
| // Check that that we have the min_bin free bytes cached on the initial gpu (4096 was reused) | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes); | |
| // Check that that we have 1 live block on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 1); | |
| // Check that that we have 1 cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 1); | |
| // | |
| // Test6 | |
| // | |
| // Allocate max_cached_bytes on the current gpu | |
| char *d_max_cached; | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes)); | |
| // DeviceFree d_max_cached | |
| CubDebugExit(allocator.DeviceFree(d_max_cached)); | |
| // Check that that we have the min_bin free bytes cached on the initial gpu (max cached was not returned because we went over) | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes); | |
| // Check that that we have 1 live block on the initial GPU | |
| AssertEquals(allocator.live_blocks.size(), 1); | |
| // Check that that we still have 1 cached block on the initial GPU | |
| AssertEquals(allocator.cached_blocks.size(), 1); | |
| // | |
| // Test7 | |
| // | |
| // Free all cached blocks on all GPUs | |
| CubDebugExit(allocator.FreeAllCached()); | |
| // Check that that we have 0 bytes cached on the initial GPU | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, 0); | |
| // Check that that we have 0 cached blocks across all GPUs | |
| AssertEquals(allocator.cached_blocks.size(), 0); | |
| // Check that that still we have 1 live block across all GPUs | |
| AssertEquals(allocator.live_blocks.size(), 1); | |
| // | |
| // Test8 | |
| // | |
| // Allocate max cached bytes + 1 on the current gpu | |
| char *d_max_cached_plus; | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1)); | |
| // DeviceFree max cached bytes | |
| CubDebugExit(allocator.DeviceFree(d_max_cached_plus)); | |
| // DeviceFree d_768B | |
| CubDebugExit(allocator.DeviceFree(d_768B)); | |
| unsigned int power; | |
| size_t rounded_bytes; | |
| allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768); | |
| // Check that that we have 4096 free bytes cached on the initial gpu | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes); | |
| // Check that that we have 1 cached blocks across all GPUs | |
| AssertEquals(allocator.cached_blocks.size(), 1); | |
| // Check that that still we have 0 live block across all GPUs | |
| AssertEquals(allocator.live_blocks.size(), 0); | |
| #ifndef CUB_CDP | |
| // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not | |
| if (num_gpus > 1) | |
| { | |
| printf("\nRunning multi-gpu tests...\n"); fflush(stdout); | |
| // | |
| // Test9 | |
| // | |
| // Allocate 768 bytes on the next gpu | |
| int next_gpu = (initial_gpu + 1) % num_gpus; | |
| char *d_768B_2; | |
| CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768)); | |
| // DeviceFree d_768B on the next gpu | |
| CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2)); | |
| // Re-allocate 768 bytes on the next gpu | |
| CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768)); | |
| // Re-free d_768B on the next gpu | |
| CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2)); | |
| // Check that that we have 4096 free bytes cached on the initial gpu | |
| AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes); | |
| // Check that that we have 4096 free bytes cached on the second gpu | |
| AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes); | |
| // Check that that we have 2 cached blocks across all GPUs | |
| AssertEquals(allocator.cached_blocks.size(), 2); | |
| // Check that that still we have 0 live block across all GPUs | |
| AssertEquals(allocator.live_blocks.size(), 0); | |
| } | |
| #endif // CUB_CDP | |
| // | |
| // Performance | |
| // | |
| printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes); | |
| fflush(stdout); fflush(stderr); | |
| // CPU performance comparisons vs cached. Allocate and free a 1MB block 2000 times | |
| CpuTimer cpu_timer; | |
| char *d_1024MB = NULL; | |
| allocator.debug = false; | |
| // Prime the caching allocator and the kernel | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes)); | |
| CubDebugExit(allocator.DeviceFree(d_1024MB)); | |
| cub::EmptyKernel<void><<<1, 32>>>(); | |
| // CUDA | |
| cpu_timer.Start(); | |
| for (int i = 0; i < timing_iterations; ++i) | |
| { | |
| CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes)); | |
| CubDebugExit(cudaFree(d_1024MB)); | |
| } | |
| cpu_timer.Stop(); | |
| float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis(); | |
| // CUB | |
| cpu_timer.Start(); | |
| for (int i = 0; i < timing_iterations; ++i) | |
| { | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes)); | |
| CubDebugExit(allocator.DeviceFree(d_1024MB)); | |
| } | |
| cpu_timer.Stop(); | |
| float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis(); | |
| printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n", | |
| cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis, | |
| cuda_malloc_elapsed_millis / timing_iterations, | |
| cub_calloc_elapsed_millis / timing_iterations); | |
| // GPU performance comparisons. Allocate and free a 1MB block 2000 times | |
| GpuTimer gpu_timer; | |
| printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes); | |
| fflush(stdout); fflush(stderr); | |
| // Kernel-only | |
| gpu_timer.Start(); | |
| for (int i = 0; i < timing_iterations; ++i) | |
| { | |
| cub::EmptyKernel<void><<<1, 32>>>(); | |
| } | |
| gpu_timer.Stop(); | |
| float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis(); | |
| // CUDA | |
| gpu_timer.Start(); | |
| for (int i = 0; i < timing_iterations; ++i) | |
| { | |
| CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes)); | |
| cub::EmptyKernel<void><<<1, 32>>>(); | |
| CubDebugExit(cudaFree(d_1024MB)); | |
| } | |
| gpu_timer.Stop(); | |
| cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis; | |
| // CUB | |
| gpu_timer.Start(); | |
| for (int i = 0; i < timing_iterations; ++i) | |
| { | |
| CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes)); | |
| cub::EmptyKernel<void><<<1, 32>>>(); | |
| CubDebugExit(allocator.DeviceFree(d_1024MB)); | |
| } | |
| gpu_timer.Stop(); | |
| cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis; | |
| printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n", | |
| cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis, | |
| cuda_malloc_elapsed_millis / timing_iterations, | |
| cub_calloc_elapsed_millis / timing_iterations); | |
| #endif | |
| printf("Success\n"); | |
| return 0; | |
| } | |