File size: 4,744 Bytes
66c9c8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Copyright (c) 2023 NVIDIA CORPORATION.  All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import math
import unittest

import warp as wp
from warp.tests.unittest_utils import *

wp.init()


@wp.kernel
def conditional_sum(result: wp.array(dtype=wp.uint64)):
    # Kernel over a 3-D launch grid: atomically add 1 to result[0] for every
    # thread whose first tid() component is 0. With dim=[2, Y, Z] this counts
    # exactly half of the launched threads.
    i, j, k = wp.tid()

    if i == 0:
        wp.atomic_add(result, 0, wp.uint64(1))


def test_large_launch_large_kernel(test, device):
    """Test tid() on a kernel launch of 2**33 threads.

    conditional_sum adds 1 to the result for every thread with an i index
    of 0, so exactly half of the launched threads contribute.
    Due to the size of the grid, this test is not run on CPUs.
    """
    counter = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)

    side = 2**16
    # Threads with i == 0 form one side x side slab of the [2, side, side] grid.
    expected = side * side

    wp.launch(kernel=conditional_sum, dim=[2, side, side], inputs=[counter], device=device)
    test.assertEqual(counter.numpy()[0], expected)


@wp.kernel
def count_elements(result: wp.array(dtype=wp.uint64)):
    # Every launched thread atomically adds 1, so result[0] ends up equal to
    # the total number of threads actually executed.
    wp.atomic_add(result, 0, wp.uint64(1))


def test_large_launch_max_blocks(test, device):
    """Verify that launches capped at max_blocks=1 still process every element."""
    counter = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)

    # 1000 elements processed by a single block of 256 threads: the kernel
    # must loop so that every element is still counted exactly once.
    wp.launch(count_elements, (1000,), inputs=[counter], max_blocks=1, device=device)
    test.assertEqual(counter.numpy()[0], 1000)

    # 2x50x10 elements with one block; conditional_sum counts only the
    # i == 0 half of the grid, i.e. 50 * 10 = 500 elements.
    counter.zero_()
    wp.launch(
        conditional_sum,
        (
            2,
            50,
            10,
        ),
        inputs=[counter],
        max_blocks=1,
        device=device,
    )
    test.assertEqual(counter.numpy()[0], 500)


def test_large_launch_very_large_kernel(test, device):
    """Due to the size of the grid, this test is not run on CPUs."""

    # One more thread than the maximum CUDA one-dimensional grid size
    # (2**31 - 1 blocks of 256 threads), forcing the launch machinery to loop.
    thread_count = (2**31 - 1) * 256 + 1
    counter = wp.zeros(shape=(1,), dtype=wp.uint64, device=device)
    wp.launch(count_elements, (thread_count,), inputs=[counter], device=device)
    test.assertEqual(counter.numpy()[0], thread_count)


def test_large_arrays_slow(test, device):
    """Exercise zero_/fill_/zero_ on arrays with more than 2**31 - 1 elements.

    The goal of this test is to use arrays just large enough to know if
    there's a flaw in handling arrays with more than 2**31 - 1 elements.
    Unfortunately, it takes a long time to run, so it won't be run
    automatically without changes to support how frequently a test may be
    run (see test_large_arrays_fast for the registered, truncated version).
    """
    total_elements = 2**31 + 8

    # 1-D to 4-D arrays: test zero_, fill_, then zero_ for scalar data types:
    for total_dims in range(1, 5):
        # Smallest per-dimension extent whose total_dims-th power reaches total_elements.
        dim_x = math.ceil(total_elements ** (1 / total_dims))
        shape_tuple = tuple([dim_x] * total_dims)

        # Only the Warp dtypes are needed; the NumPy keys were previously
        # unpacked into an unused loop variable.
        for wptype in wp.types.np_dtype_to_warp_type.values():
            a1 = wp.zeros(shape_tuple, dtype=wptype, device=device)
            assert_np_equal(a1.numpy(), np.zeros_like(a1.numpy()))

            a1.fill_(127)
            assert_np_equal(a1.numpy(), 127 * np.ones_like(a1.numpy()))

            a1.zero_()
            assert_np_equal(a1.numpy(), np.zeros_like(a1.numpy()))


def test_large_arrays_fast(test, device):
    """A truncated version of test_large_arrays_slow meant to catch basic errors."""
    # Just past the int32 element-count boundary.
    total_elements = 2**31 + 8

    # int8 keeps the memory footprint minimal for this element count.
    wptype = wp.types.np_dtype_to_warp_type[np.dtype(np.int8)]

    arr = wp.zeros((total_elements,), dtype=wptype, device=device)
    assert_np_equal(arr.numpy(), np.zeros_like(arr.numpy()))

    arr.fill_(127)
    assert_np_equal(arr.numpy(), 127 * np.ones_like(arr.numpy()))

    arr.zero_()
    assert_np_equal(arr.numpy(), np.zeros_like(arr.numpy()))


# Devices used by tests that can also run on the CPU.
devices = get_test_devices()


class TestLarge(unittest.TestCase):
    # Empty container class; test methods are attached below via add_function_test.
    pass


# Grid of 2**33 threads exceeds what CPUs can practically run: CUDA devices only.
add_function_test(
    TestLarge, "test_large_launch_large_kernel", test_large_launch_large_kernel, devices=get_unique_cuda_test_devices()
)

add_function_test(TestLarge, "test_large_launch_max_blocks", test_large_launch_max_blocks, devices=devices)
# Also CUDA-only due to grid size (see the test's docstring).
add_function_test(
    TestLarge,
    "test_large_launch_very_large_kernel",
    test_large_launch_very_large_kernel,
    devices=get_unique_cuda_test_devices(),
)

# NOTE: test_large_arrays_slow is intentionally NOT registered here — it takes
# too long to run automatically (see its comments); only the fast variant runs.
add_function_test(TestLarge, "test_large_arrays_fast", test_large_arrays_fast, devices=devices)


if __name__ == "__main__":
    # Start from a clean kernel cache so codegen issues are reproduced, not masked.
    wp.build.clear_kernel_cache()
    unittest.main(verbosity=2)