qbhf2's picture
added NvidiaWarp and GarmentCode repos
66c9c8a
# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
import math
import unittest
import warp as wp
from warp.tests.unittest_utils import *
wp.init()
@wp.kernel
def conditional_sum(result: wp.array(dtype=wp.uint64)):
    # Unpack the thread index into three components; only the first is used.
    first, second, third = wp.tid()

    # Threads whose first index is zero each atomically add one to result[0].
    if first == 0:
        wp.atomic_add(result, 0, wp.uint64(1))
def test_large_launch_large_kernel(test, device):
    """Check ``wp.tid()`` on a kernel launch of 2**33 threads.

    ``conditional_sum`` adds 1 to the result for every thread whose ``i`` index
    is 0.  The launch shape is ``[2, 2**16, 2**16]``, so half of the threads are
    expected to contribute.

    Due to the size of the grid, this test is not run on CPUs.

    Args:
        test: Test case object; its ``assertEqual`` is used for the check.
        device: Device on which the counter is allocated and the kernel runs.
    """
    side = 2**16
    launch_shape = [2, side, side]
    # Only the slice with first index 0 increments the counter.
    expected_count = side * side

    counter = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)
    wp.launch(kernel=conditional_sum, dim=launch_shape, inputs=[counter], device=device)

    test.assertEqual(counter.numpy()[0], expected_count)
@wp.kernel
def count_elements(result: wp.array(dtype=wp.uint64)):
    # Every thread running this kernel atomically adds one to result[0].
    increment = wp.uint64(1)
    wp.atomic_add(result, 0, increment)
def test_large_launch_max_blocks(test, device):
    """Launch kernels with ``max_blocks=1`` and check the resulting counts.

    Args:
        test: Test case object; its ``assertEqual`` is used for the checks.
        device: Device on which the counter is allocated and the kernels run.
    """
    counter = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)

    # Loop over 1000x1x1 elements using a grid of 256 threads
    wp.launch(count_elements, (1000,), inputs=[counter], max_blocks=1, device=device)
    test.assertEqual(counter.numpy()[0], 1000)

    # Loop over 2x50x10 elements using a grid of 256 threads, using the tid()
    # index to count half the elements (2 * 50 * 10 / 2 == 500)
    counter.zero_()
    launch_shape = (2, 50, 10)
    wp.launch(conditional_sum, launch_shape, inputs=[counter], max_blocks=1, device=device)
    test.assertEqual(counter.numpy()[0], 500)
def test_large_launch_very_large_kernel(test, device):
    """Launch ``count_elements`` over a very large 1-D grid and check the count.

    Due to the size of the grid, this test is not run on CPUs.

    Args:
        test: Test case object; its ``assertEqual`` is used for the check.
        device: Device on which the counter is allocated and the kernel runs.
    """
    # Chosen to be larger than the maximum CUDA one-dimensional grid size
    # (total threads)
    thread_total = (2**31 - 1) * 256 + 1

    counter = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)
    wp.launch(count_elements, (thread_total,), inputs=[counter], device=device)

    # Every launched thread adds one, so the counter must equal the launch size.
    test.assertEqual(counter.numpy()[0], thread_total)
def test_large_arrays_slow(test, device):
    """Exercise ``zero_`` and ``fill_`` on 1-D to 4-D arrays of over 2**31 - 1 elements.

    The arrays are just large enough to reveal a flaw in the handling of arrays
    with more than 2**31 - 1 elements.  Unfortunately this takes a long time to
    run, so it won't be run automatically without changes to support how
    frequently a test may be run.

    Args:
        test: Test case object supplied by the test harness (not used here).
        device: Device on which the arrays are allocated.
    """
    total_elements = 2**31 + 8

    # 1-D to 4-D arrays: test zero_, fill_, then zero_ for scalar data types:
    for total_dims in range(1, 5):
        # Round the side length up so the array has at least total_elements entries.
        dim_x = math.ceil(total_elements ** (1 / total_dims))
        shape_tuple = (dim_x,) * total_dims

        # Only the Warp types are needed, so iterate the mapping's values.
        for wptype in wp.types.np_dtype_to_warp_type.values():
            a1 = wp.zeros(shape_tuple, dtype=wptype, device=device)

            # Convert to NumPy once per check instead of twice; these arrays
            # hold more than 2**31 elements, so a redundant conversion is costly.
            host = a1.numpy()
            assert_np_equal(host, np.zeros_like(host))

            a1.fill_(127)
            host = a1.numpy()
            assert_np_equal(host, 127 * np.ones_like(host))

            a1.zero_()
            host = a1.numpy()
            assert_np_equal(host, np.zeros_like(host))
def test_large_arrays_fast(test, device):
    """Run a truncated version of ``test_large_arrays_slow`` to catch basic errors.

    A single 1-D ``int8`` array of 2**31 + 8 elements is zeroed, filled with
    127 and zeroed again, checking the contents after each step.

    Args:
        test: Test case object supplied by the test harness (not used here).
        device: Device on which the array is allocated.
    """
    total_elements = 2**31 + 8

    nptype = np.dtype(np.int8)
    wptype = wp.types.np_dtype_to_warp_type[nptype]

    a1 = wp.zeros((total_elements,), dtype=wptype, device=device)

    # Convert to NumPy once per check instead of twice; the array holds more
    # than 2**31 elements, so a redundant conversion is costly.
    host = a1.numpy()
    assert_np_equal(host, np.zeros_like(host))

    a1.fill_(127)
    host = a1.numpy()
    assert_np_equal(host, 127 * np.ones_like(host))

    a1.zero_()
    host = a1.numpy()
    assert_np_equal(host, np.zeros_like(host))
# Devices used for the tests that are not restricted to CUDA.
devices = get_test_devices()


class TestLarge(unittest.TestCase):
    """Test case class that the large-launch and large-array tests are attached to."""


# The two biggest launches are registered for CUDA devices only; their
# docstrings note that the grids are too large to run on CPUs.
# test_large_arrays_slow is not registered here.
add_function_test(
    TestLarge,
    "test_large_launch_large_kernel",
    test_large_launch_large_kernel,
    devices=get_unique_cuda_test_devices(),
)
add_function_test(
    TestLarge,
    "test_large_launch_max_blocks",
    test_large_launch_max_blocks,
    devices=devices,
)
add_function_test(
    TestLarge,
    "test_large_launch_very_large_kernel",
    test_large_launch_very_large_kernel,
    devices=get_unique_cuda_test_devices(),
)
add_function_test(
    TestLarge,
    "test_large_arrays_fast",
    test_large_arrays_fast,
    devices=devices,
)
if __name__ == "__main__":
    # Script entry point: clear the Warp kernel cache, then run the tests verbosely.
    wp.build.clear_kernel_cache()
    unittest.main(verbosity=2)