# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import math
import unittest

import numpy as np

import warp as wp
from warp.tests.unittest_utils import *

wp.init()


# Kernel for 3-D launches: every thread whose first tid() component is 0
# atomically adds 1 to result[0]; all other threads do nothing.
@wp.kernel
def conditional_sum(result: wp.array(dtype=wp.uint64)):
    i, j, k = wp.tid()

    if i == 0:
        wp.atomic_add(result, 0, wp.uint64(1))


def test_large_launch_large_kernel(test, device):
    """Test tid() on kernel launch of 2**33 threads.

    The kernel ``conditional_sum`` adds 1 to the result for every thread that
    has an i index of 0, so with a launch dimension of (2, 2**16, 2**16) the
    expected count is 2**16 * 2**16.

    Due to the size of the grid, this test is not run on CPUs.
    """
    test_result = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)

    large_dim_length = 2**16
    # Only the i == 0 half of the 2 x N x N grid contributes to the sum.
    half_result = large_dim_length * large_dim_length

    wp.launch(kernel=conditional_sum, dim=[2, large_dim_length, large_dim_length], inputs=[test_result], device=device)
    test.assertEqual(test_result.numpy()[0], half_result)


# Kernel: every launched thread atomically adds 1 to result[0], so after the
# launch result[0] holds the number of threads that ran.
@wp.kernel
def count_elements(result: wp.array(dtype=wp.uint64)):
    wp.atomic_add(result, 0, wp.uint64(1))


def test_large_launch_max_blocks(test, device):
    """Check that launches restricted with ``max_blocks=1`` still visit every element."""
    # Loop over 1000x1x1 elements using a grid of 256 threads
    test_result = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)
    wp.launch(count_elements, (1000,), inputs=[test_result], max_blocks=1, device=device)
    test.assertEqual(test_result.numpy()[0], 1000)

    # Loop over 2x50x10 elements using a grid of 256 threads, using the tid() index to count half the elements
    test_result.zero_()
    wp.launch(
        conditional_sum,
        (
            2,
            50,
            10,
        ),
        inputs=[test_result],
        max_blocks=1,
        device=device,
    )
    # Only threads with i == 0 are counted: 50 * 10 = 500.
    test.assertEqual(test_result.numpy()[0], 500)


def test_large_launch_very_large_kernel(test, device):
    """Launch ``count_elements`` over a very large 1-D grid and check the thread count.

    Due to the size of the grid, this test is not run on CPUs.
    """
    # Dim is chosen to be larger than the maximum CUDA one-dimensional grid size (total threads)
    dim = (2**31 - 1) * 256 + 1
    test_result = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)
    wp.launch(count_elements, (dim,), inputs=[test_result], device=device)
    test.assertEqual(test_result.numpy()[0], dim)


def _assert_zero_fill_zero(arr):
    """Check that ``arr`` reads back as zeros, as 127 after ``fill_(127)``, and as zeros after ``zero_()``.

    ``arr`` is expected to be freshly zero-initialized by the caller. ``numpy()``
    is called once per check and the result reused to build the expected
    array, because the arrays involved here hold more than 2**31 elements.
    """
    host = arr.numpy()
    assert_np_equal(host, np.zeros_like(host))

    arr.fill_(127)
    host = arr.numpy()
    assert_np_equal(host, 127 * np.ones_like(host))

    arr.zero_()
    host = arr.numpy()
    assert_np_equal(host, np.zeros_like(host))


def test_large_arrays_slow(test, device):
    """Exercise zero_/fill_ on 1-D to 4-D arrays of every scalar type with more than 2**31-1 elements."""
    # The goal of this test is to use arrays just large enough to know
    # if there's a flaw in handling arrays with more than 2**31-1 elements
    # Unfortunately, it takes a long time to run so it won't be run automatically
    # without changes to support how frequently a test may be run
    total_elements = 2**31 + 8

    # 1-D to 4-D arrays: test zero_, fill_, then zero_ for scalar data types:
    for total_dims in range(1, 5):
        # Smallest equal side length whose total_dims-th power reaches total_elements.
        dim_x = math.ceil(total_elements ** (1 / total_dims))
        shape_tuple = (dim_x,) * total_dims

        for wptype in wp.types.np_dtype_to_warp_type.values():
            a1 = wp.zeros(shape_tuple, dtype=wptype, device=device)
            _assert_zero_fill_zero(a1)


def test_large_arrays_fast(test, device):
    """Exercise zero_/fill_ on a single 1-D int8 array with more than 2**31-1 elements."""
    # A truncated version of test_large_arrays_slow meant to catch basic errors
    total_elements = 2**31 + 8

    nptype = np.dtype(np.int8)
    wptype = wp.types.np_dtype_to_warp_type[nptype]

    a1 = wp.zeros((total_elements,), dtype=wptype, device=device)
    _assert_zero_fill_zero(a1)


devices = get_test_devices()


class TestLarge(unittest.TestCase):
    pass


# The two launches with very large grids are registered for CUDA devices only.
# test_large_arrays_slow is intentionally not registered (see the comment in its body).
add_function_test(
    TestLarge, "test_large_launch_large_kernel", test_large_launch_large_kernel, devices=get_unique_cuda_test_devices()
)

add_function_test(TestLarge, "test_large_launch_max_blocks", test_large_launch_max_blocks, devices=devices)
add_function_test(
    TestLarge,
    "test_large_launch_very_large_kernel",
    test_large_launch_very_large_kernel,
    devices=get_unique_cuda_test_devices(),
)

add_function_test(TestLarge, "test_large_arrays_fast", test_large_arrays_fast, devices=devices)


if __name__ == "__main__":
    wp.build.clear_kernel_cache()
    unittest.main(verbosity=2)