Kernels
danieldk HF Staff commited on
Commit
06534f8
·
verified ·
1 Parent(s): d7f7ab1

Build uploaded using `kernels` (batch 5/10).

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_layout_sm80_90.py +173 -0
  3. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_load_sm80_90.py +142 -0
  4. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_mixed_sm80_90.py +319 -0
  5. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_store_sm80_90.py +180 -0
  6. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/run_all_tests.py +44 -0
  7. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/utils/evt_testbed.py +235 -0
  8. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_batched.py +134 -0
  9. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f16_sm80.py +128 -0
  10. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f16_sm90.py +146 -0
  11. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f32_sm80.py +104 -0
  12. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f64_sm80.py +103 -0
  13. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f64_sm90.py +71 -0
  14. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f8_sm90.py +112 -0
  15. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_mixed_sm80.py +75 -0
  16. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_s8_sm80.py +103 -0
  17. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_s8_sm90.py +98 -0
  18. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_testbed.py +423 -0
  19. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/run_all_tests.py +44 -0
  20. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/utils.py +260 -0
  21. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/installation.py +57 -0
  22. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/conv2d_interface.py +284 -0
  23. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/evt_interface.py +254 -0
  24. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/gemm_interface.py +354 -0
  25. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/utils.py +69 -0
  26. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/run_all_tests.py +75 -0
  27. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_coalesce.py +95 -0
  28. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_complement.py +92 -0
  29. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_composition.py +213 -0
  30. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_int_tuple.py +80 -0
  31. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_left_inverse.py +87 -0
  32. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_right_inverse.py +96 -0
  33. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_typing.py +59 -0
  34. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/common/cutlass_unit_test.h +102 -0
  35. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/cache_testbed_output.h +907 -0
  36. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_problems.h +927 -0
  37. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_testbed.h +818 -0
  38. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_testbed_interleaved.h +666 -0
  39. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_with_absmax_testbed.h +622 -0
  40. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_with_broadcast_testbed.h +734 -0
  41. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_with_reduction_testbed.h +643 -0
  42. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv3d_problems.h +293 -0
  43. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv3d_testbed.h +716 -0
  44. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv3d_with_broadcast_testbed.h +732 -0
  45. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/depthwise_conv2d_direct_conv_testbed.h +473 -0
  46. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device_3x/conv_problem_sizes.hpp +1385 -0
  47. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device_3x/testbed_conv.hpp +768 -0
  48. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/cute/ampere/tiled_cp_async_testbed.hpp +158 -0
  49. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/cute/cooperative_gemm_common.hpp +775 -0
  50. build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/cute/hopper/tma_load_testbed.hpp +217 -0
.gitattributes CHANGED
@@ -14,3 +14,4 @@ build/torch29-cxx11-cu130-x86_64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lf
14
  build/torch210-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
15
  build/torch210-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
16
  build/torch210-cxx11-cu130-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
 
 
14
  build/torch210-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
15
  build/torch210-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
16
  build/torch210-cxx11-cu130-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
17
+ build/torch29-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_layout_sm80_90.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ ################################################################################
32
+
33
+ """
34
+ Unit test for store nodes in SM90
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ from cutlass_cppgen.backend import *
42
+ from cutlass_cppgen.epilogue import *
43
+
44
+ from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
45
+
46
+ cutlass_cppgen.set_log_level(logging.WARNING)
47
+
48
+
49
+ @unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
50
+ class TestEVTLayout(EVTTestCaseBase):
51
+
52
+ def test_permute_1(self):
53
+ """
54
+ Returning a tensor with shape [m, n]
55
+ """
56
+ def evt_permute(accum, alpha, C):
57
+ F = alpha * accum
58
+ F_permute = permute(F, indices=(0, 2, 1))
59
+ D_permute = F_permute + permute(C, indices=(0, 2, 1))
60
+ D = permute(D_permute, indices=(0, 2, 1))
61
+ return D, F
62
+
63
+ for m, n, k, l in self.get_problem_sizes(8):
64
+ example_inputs = {
65
+ "accum": self.fake_tensor(self.element, (l, m, n)),
66
+ "alpha": 0.5,
67
+ "C": self.fake_tensor(self.element, (l, m, n)),
68
+ "F": self.fake_tensor(self.element, (l, m, n)),
69
+ "D": self.fake_tensor(self.element, (l, m, n)),
70
+ }
71
+
72
+ launcher = EVTTestBed(self.element, evt_permute, example_inputs)
73
+ input_keys = ["C", "alpha"]
74
+ result_keys = ["D", "F"]
75
+ launcher.verify((m, n, k), input_keys, result_keys, l)
76
+
77
+ @unittest.skipIf(device_cc() != 90, "This unittest is for cc = Sm90 only")
78
+ def test_permute_2(self):
79
+ """
80
+ Returning a tensor with shape [m, n]
81
+ """
82
+ def evt_permute(accum, alpha, C):
83
+ F = alpha * accum
84
+ F_permute = permute(F, indices=(0, 2, 1))
85
+ D = F_permute + C
86
+ return D, F
87
+
88
+ for m, n, k, l in self.get_problem_sizes(8):
89
+ example_inputs = {
90
+ "accum": self.fake_tensor(self.element, (l, m, n)),
91
+ "alpha": 0.5,
92
+ "C": self.fake_tensor(self.element, (l, n, m)),
93
+ "F": self.fake_tensor(self.element, (l, m, n)),
94
+ "D": self.fake_tensor(self.element, (l, n, m)),
95
+ }
96
+
97
+ launcher = EVTTestBed(self.element, evt_permute, example_inputs)
98
+ input_keys = ["C", "alpha"]
99
+ result_keys = ["D", "F"]
100
+ launcher.verify((m, n, k), input_keys, result_keys, l)
101
+
102
+ @unittest.skipIf(device_cc() != 90, "This unittest is for cc = Sm90 only")
103
+ def test_permute_3(self):
104
+ """
105
+ Returning a tensor with shape [m, n]
106
+ """
107
+ def evt_permute(accum, alpha, C):
108
+ F = alpha * accum
109
+ F_permute = permute(F, indices=(1, 0, 2))
110
+ D = F_permute + C
111
+ return D, F
112
+
113
+ for m, n, k, l in self.get_problem_sizes(8):
114
+ example_inputs = {
115
+ "accum": self.fake_tensor(self.element, (l, m, n)),
116
+ "alpha": 0.5,
117
+ "C": self.fake_tensor(self.element, (m, l, n)),
118
+ "F": self.fake_tensor(self.element, (l, m, n)),
119
+ "D": self.fake_tensor(self.element, (m, l, n)),
120
+ }
121
+
122
+ launcher = EVTTestBed(self.element, evt_permute, example_inputs)
123
+ input_keys = ["C", "alpha"]
124
+ result_keys = ["D", "F"]
125
+ launcher.verify((m, n, k), input_keys, result_keys, l)
126
+
127
+ def test_reshape(self):
128
+ """
129
+ Test reshape
130
+ """
131
+ def evt_reshape(accum, alpha, TensorE):
132
+ F = alpha * accum
133
+ E_reshape = reshape(TensorE, new_shape=(512, 1))
134
+ D = F + E_reshape
135
+ return D
136
+
137
+ example_inputs = {
138
+ "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
139
+ "alpha": 0.5,
140
+ "TensorE": self.fake_tensor(self.element, (16, 32)),
141
+ "D": self.fake_tensor(self.element, (self.l, self.m, self.n)),
142
+ }
143
+
144
+ launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
145
+ input_keys = ["alpha", "TensorE"]
146
+ result_keys = ["D"]
147
+ launcher.verify(self.problem_size, input_keys, result_keys, self.l)
148
+
149
+ def test_reshape2(self):
150
+ """
151
+ Test reshape
152
+ """
153
+ def evt_reshape(accum, alpha, TensorE):
154
+ F = alpha * accum
155
+ F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
156
+ D = F_reshape + TensorE
157
+ return D
158
+
159
+ example_inputs = {
160
+ "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
161
+ "alpha": 0.5,
162
+ "TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
163
+ "D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
164
+ }
165
+
166
+ launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
167
+ input_keys = ["alpha", "TensorE"]
168
+ result_keys = ["D"]
169
+ launcher.verify(self.problem_size, input_keys, result_keys, self.l)
170
+
171
+
172
+ if __name__ == '__main__':
173
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_load_sm80_90.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ ################################################################################
32
+
33
+ """
34
+ Unit test for load nodes in SM90
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ from cutlass_cppgen.backend import *
42
+ from cutlass_cppgen.epilogue import *
43
+
44
+ from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
45
+
46
+ cutlass_cppgen.set_log_level(logging.WARNING)
47
+
48
+
49
+ @unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
50
+ class TestEVTLoad(EVTTestCaseBase):
51
+
52
+ def test_tensor_load(self):
53
+ """
54
+ Load extra tensor with shape [m, n]
55
+ """
56
+ def evt_tensor_load(accum, C, aux, aux_batch):
57
+ D = accum + C + aux + aux_batch
58
+ return D
59
+
60
+ for m, n, k, l in self.get_problem_sizes(8):
61
+ example_inputs = {
62
+ "accum": self.fake_tensor(self.element, (l, m, n)),
63
+ "C": self.fake_tensor(self.element, (l, m, n)),
64
+ "aux": self.fake_tensor(self.element, (m, n)),
65
+ "aux_batch": self.fake_tensor(np.float32, (l, m, n)),
66
+ "D": self.fake_tensor(self.element, (l, m, n)),
67
+ }
68
+
69
+ launcher = EVTTestBed(self.element, evt_tensor_load, example_inputs)
70
+ input_keys = ["C", "aux", "aux_batch"]
71
+ result_keys = ["D"]
72
+ launcher.verify((m, n, k), input_keys, result_keys, l)
73
+
74
+ def test_row_broadcast(self):
75
+ """
76
+ Load extra tensor with shape [1, n]
77
+ """
78
+ def evt_row_broadcast(accum, C, bias, bias_batch):
79
+ D = accum + C + bias + bias_batch
80
+ return D
81
+
82
+ for m, n, k, l in self.get_problem_sizes(8):
83
+ example_inputs = {
84
+ "accum": self.fake_tensor(self.element, (l, m, n)),
85
+ "C": self.fake_tensor(self.element, (l, m, n)),
86
+ "bias": self.fake_tensor(self.element, (n,)),
87
+ "bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
88
+ "D": self.fake_tensor(self.element, (l, m, n)),
89
+ }
90
+
91
+ launcher = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
92
+ input_keys = ["C", "bias", "bias_batch"]
93
+ result_keys = ["D"]
94
+ launcher.verify((m, n, k), input_keys, result_keys, l)
95
+
96
+ def test_column_broadcast(self):
97
+ """
98
+ Load extra tensor with shape [m, 1]
99
+ """
100
+ def evt_column_broadcast(accum, C, bias, bias_batch):
101
+ D = accum + C + bias + bias_batch
102
+ return D
103
+
104
+ for m, n, k, l in self.get_problem_sizes(8):
105
+ example_inputs = {
106
+ "accum": self.fake_tensor(self.element, (l, m, n)),
107
+ "C": self.fake_tensor(self.element, (l, m, n)),
108
+ "bias": self.fake_tensor(self.element, (m, 1)),
109
+ "bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
110
+ "D": self.fake_tensor(self.element, (l, m, n)),
111
+ }
112
+
113
+ launcher = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
114
+ input_keys = ["C", "bias", "bias_batch"]
115
+ result_keys = ["D"]
116
+ launcher.verify((m, n, k), input_keys, result_keys, l)
117
+
118
+ def test_scalar_broadcast(self):
119
+ """
120
+ Load extra tensor with shape [1, 1]
121
+ """
122
+ def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
123
+ D = accum + C + alpha + alpha_batch
124
+ return D
125
+
126
+ for m, n, k, l in self.get_problem_sizes(8):
127
+ example_inputs = {
128
+ "accum": self.fake_tensor(self.element, (l, m, n)),
129
+ "C": self.fake_tensor(self.element, (l, m, n)),
130
+ "alpha": 0.5,
131
+ "alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
132
+ "D": self.fake_tensor(self.element, (l, m, n)),
133
+ }
134
+
135
+ launcher = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
136
+ input_keys = ["C", "alpha", "alpha_batch"]
137
+ result_keys = ["D"]
138
+ launcher.verify((m, n, k), input_keys, result_keys, l)
139
+
140
+
141
+ if __name__ == '__main__':
142
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_mixed_sm80_90.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ ################################################################################
32
+
33
+ """
34
+ Unittest for mixed types of nodes in SM90
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ from cutlass_cppgen.backend import *
42
+ from cutlass_cppgen.epilogue import *
43
+ from cutlass_cppgen.swizzle import ThreadblockSwizzleStreamK
44
+
45
+ from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+
49
+
50
+ @unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
51
+ class TestEVTMixed(EVTTestCaseBase):
52
+
53
+ def test_same_variable_used_multiple_times(self):
54
+ """
55
+ The same variable z0 is used multiple times
56
+ """
57
+ def evt_aux_store(accum):
58
+ z0 = relu(accum)
59
+ D = z0 + z0
60
+ return z0, D
61
+
62
+ for m, n, k, l in self.get_problem_sizes(8):
63
+ example_inputs = {
64
+ "accum": self.fake_tensor(self.element, (l, m, n)),
65
+ "D": self.fake_tensor(self.element, (l, m, n)),
66
+ "z0": self.fake_tensor(self.element, (l, m, n)),
67
+ }
68
+
69
+ launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
70
+ input_keys = ["accum"]
71
+ result_keys = ["z0", "D"]
72
+ launcher.verify((m, n, k), input_keys, result_keys, l)
73
+
74
+ def test_no_lca(self):
75
+ """
76
+ The same variable z0 is used multiple times
77
+ """
78
+ def evt_no_lca(accum, bias):
79
+ E = relu(accum)
80
+ F = E + bias
81
+ tmp_2 = E + 2
82
+ D = tmp_2 + E
83
+ return D
84
+
85
+ for m, n, k, l in self.get_problem_sizes(8):
86
+ example_inputs = {
87
+ "accum": self.fake_tensor(self.element, (l, m, n)),
88
+ "D": self.fake_tensor(self.element, (l, m, n)),
89
+ "bias": self.fake_tensor(self.element, (m,1), stride=(1,0)),
90
+ }
91
+
92
+ launcher = EVTTestBed(self.element, evt_no_lca, example_inputs)
93
+ input_keys = ["accum", "bias"]
94
+ result_keys = ["D"]
95
+ launcher.verify((m, n, k), input_keys, result_keys, l)
96
+
97
+ def test_mixed_dag(self):
98
+ def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
99
+ F = alpha * accum + (beta * C + aux)
100
+ F_row_max = max(F, dim=[0, 1])
101
+ E = relu(F + 1) + cbias + rbias
102
+ E_col_max = max(E, dim=[0, 2])
103
+ D = E + F
104
+ return D, F, F_row_max, E_col_max
105
+
106
+ if device_cc() == 80:
107
+ alignments = [2, 4, 8]
108
+ else:
109
+ # Sm90 EVT currently only supports 128-bit alignment
110
+ alignments = [8,]
111
+ for align in alignments:
112
+ for m, n, k, l in self.get_problem_sizes(align):
113
+ example_inputs = {
114
+ "accum": self.fake_tensor(self.element, (l, m, n)),
115
+ "alpha": 1.0,
116
+ "C": self.fake_tensor(self.element, (l, m, n)),
117
+ "beta": 1.0,
118
+ "aux": self.fake_tensor(self.element, (l, m, n)),
119
+ "cbias": self.fake_tensor(self.element, (m, 1)),
120
+ "rbias": self.fake_tensor(self.element, (n,)),
121
+ "D": self.fake_tensor(self.element, (l, m, n)),
122
+ "F": self.fake_tensor(self.element, (l, m, n)),
123
+ "F_row_max": self.fake_tensor(DataType.f32, (n,)),
124
+ "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
125
+ }
126
+
127
+ launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
128
+ input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
129
+ result_keys = ["D", "F", "F_row_max", "E_col_max"]
130
+ launcher.verify((m, n, k), input_keys, result_keys, l)
131
+
132
+ @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
133
+ def test_mixed_dag_float(self):
134
+ def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
135
+ F = alpha * accum + (beta * C + aux)
136
+ F_row_max = max(F, dim=[0, 1])
137
+ E = relu(F + 1) + cbias + rbias
138
+ E_col_max = max(E, dim=[0, 2])
139
+ D = E + F
140
+ return D, F, F_row_max, E_col_max
141
+
142
+ for align in [3, 2, 4]:
143
+ for m, n, k, l in self.get_problem_sizes(align):
144
+ example_inputs = {
145
+ "accum": self.fake_tensor(np.float32, (l, m, n)),
146
+ "alpha": 1.0,
147
+ "C": self.fake_tensor(np.float32, (l, m, n)),
148
+ "beta": 1.0,
149
+ "aux": self.fake_tensor(np.float32, (l, m, n)),
150
+ "cbias": self.fake_tensor(np.float32, (m, 1)),
151
+ "rbias": self.fake_tensor(np.float32, (n,)),
152
+ "D": self.fake_tensor(np.float32, (l, m, n)),
153
+ "F": self.fake_tensor(np.float32, (l, m, n)),
154
+ "F_row_max": self.fake_tensor(np.float32, (n,)),
155
+ "E_col_max": self.fake_tensor(np.float32, (m, 1))
156
+ }
157
+ launcher = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
158
+ input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
159
+ result_keys = ["D", "F", "F_row_max", "E_col_max"]
160
+ launcher.verify((m, n, k), input_keys, result_keys, l)
161
+
162
+ @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
163
+ def test_mixed_dag_stage2(self):
164
+ def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
165
+ F = alpha * accum + (beta * C + aux)
166
+ F_row_max = max(F, dim=[0, 1])
167
+ E = relu(F + 1) + cbias + rbias
168
+ E_col_max = max(E, dim=[0, 2])
169
+ D = E + F
170
+ return D, F, F_row_max, E_col_max
171
+
172
+ for m, n, k, l in self.get_problem_sizes(8):
173
+ example_inputs = {
174
+ "accum": self.fake_tensor(self.element, (l, m, n)),
175
+ "alpha": 1.0,
176
+ "C": self.fake_tensor(self.element, (l, m, n)),
177
+ "beta": 1.0,
178
+ "aux": self.fake_tensor(self.element, (l, m, n)),
179
+ "cbias": self.fake_tensor(self.element, (m, 1)),
180
+ "rbias": self.fake_tensor(self.element, (n,)),
181
+ "D": self.fake_tensor(self.element, (l, m, n)),
182
+ "F": self.fake_tensor(self.element, (l, m, n)),
183
+ "F_row_max": self.fake_tensor(DataType.f32, (n,)),
184
+ "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
185
+ }
186
+
187
+ launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
188
+ input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
189
+ result_keys = ["D", "F", "F_row_max", "E_col_max"]
190
+ launcher.verify((m, n, k), input_keys, result_keys, l)
191
+
192
+ @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
193
+ def test_mixed_dag_partition_k(self):
194
+ def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
195
+ F = alpha * accum + (beta * C + aux)
196
+ F_row_max = max(F, dim=[0, 1])
197
+ E = relu(F + 1) + cbias + rbias
198
+ E_col_max = max(E, dim=[0, 2])
199
+ D = E + F
200
+ return D, F, F_row_max, E_col_max
201
+
202
+ for m, n, k, l in self.get_problem_sizes(8):
203
+ example_inputs = {
204
+ "accum": self.fake_tensor(self.element, (l, m, n)),
205
+ "alpha": 1.0,
206
+ "C": self.fake_tensor(self.element, (l, m, n)),
207
+ "beta": 1.0,
208
+ "aux": self.fake_tensor(self.element, (l, m, n)),
209
+ "cbias": self.fake_tensor(self.element, (m, 1)),
210
+ "rbias": self.fake_tensor(self.element, (n,)),
211
+ "D": self.fake_tensor(self.element, (l, m, n)),
212
+ "F": self.fake_tensor(self.element, (l, m, n)),
213
+ "F_row_max": self.fake_tensor(DataType.f32, (n,)),
214
+ "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
215
+ }
216
+
217
+ tile_description = {
218
+ "threadblock_shape": [128, 128, 64],
219
+ "warp_count": [2, 2, 2]
220
+ }
221
+
222
+ launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, tile_description=tile_description, epilogue_stages=2)
223
+ input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
224
+ result_keys = ["D", "F", "F_row_max", "E_col_max"]
225
+ launcher.verify((m, n, k), input_keys, result_keys, l)
226
+
227
+ @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
228
+ def test_mixed_dag_stream_k(self):
229
+ def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
230
+ F = alpha * accum + (beta * C + aux)
231
+ F_row_max = max(F, dim=[0, 1])
232
+ E = relu(F + 1) + cbias + rbias
233
+ E_col_max = max(E, dim=[0, 2])
234
+ D = E + F
235
+ return D, F, F_row_max, E_col_max
236
+
237
+ # High per-sm occupancy tile_description
238
+ tile_description = {
239
+ "threadblock_shape": [128, 128, 32],
240
+ "warp_count": [2, 2, 1],
241
+ "stages": 3
242
+ }
243
+ tds = [None, tile_description]
244
+ for td in tds:
245
+ for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
246
+ if l == 1:
247
+ example_inputs = {
248
+ "accum": self.fake_tensor(self.element, (m, n)),
249
+ "alpha": 1.0,
250
+ "C": self.fake_tensor(self.element, (m, n)),
251
+ "beta": 1.0,
252
+ "aux": self.fake_tensor(self.element, (m, n)),
253
+ "cbias": self.fake_tensor(self.element, (m, 1)),
254
+ "rbias": self.fake_tensor(self.element, (n,)),
255
+ "D": self.fake_tensor(self.element, (m, n)),
256
+ "F": self.fake_tensor(self.element, (m, n)),
257
+ "F_row_max": self.fake_tensor(DataType.f32, (n,)),
258
+ "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
259
+ }
260
+ else:
261
+ example_inputs = {
262
+ "accum": self.fake_tensor(self.element, (l, m, n)),
263
+ "alpha": 1.0,
264
+ "C": self.fake_tensor(self.element, (l, m, n)),
265
+ "beta": 1.0,
266
+ "aux": self.fake_tensor(self.element, (l, m, n)),
267
+ "cbias": self.fake_tensor(self.element, (m, 1)),
268
+ "rbias": self.fake_tensor(self.element, (n,)),
269
+ "D": self.fake_tensor(self.element, (l, m, n)),
270
+ "F": self.fake_tensor(self.element, (l, m, n)),
271
+ "F_row_max": self.fake_tensor(DataType.f32, (n,)),
272
+ "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
273
+ }
274
+
275
+ if td is not None:
276
+ launcher = EVTTestBed(
277
+ self.element, evt_mixed_dag, example_inputs,
278
+ tile_description=td,
279
+ swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
280
+ else:
281
+ launcher = EVTTestBed(
282
+ self.element, evt_mixed_dag, example_inputs,
283
+ swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
284
+
285
+ input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
286
+ result_keys = ["D", "F", "F_row_max", "E_col_max"]
287
+ launcher.verify((m, n, k), input_keys, result_keys, l)
288
+
289
+ def test_mixed_dag_no_batch(self):
290
+ def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
291
+ F = alpha * accum + (beta * C + aux)
292
+ F_row_max = max(F, dim=[0, 1])
293
+ E = relu(F + 1) + cbias + rbias
294
+ E_col_max = max(E, dim=[0, 2])
295
+ D = E + F
296
+ return D, F, F_row_max, E_col_max
297
+
298
+ for m, n, k, _ in self.get_problem_sizes(8):
299
+ example_inputs = {
300
+ "accum": self.fake_tensor(self.element, (m, n)),
301
+ "alpha": 1.0,
302
+ "C": self.fake_tensor(self.element, (m, n)),
303
+ "beta": 1.0,
304
+ "aux": self.fake_tensor(self.element, (m, n)),
305
+ "cbias": self.fake_tensor(self.element, (m, 1)),
306
+ "rbias": self.fake_tensor(self.element, (n,)),
307
+ "D": self.fake_tensor(self.element, (m, n)),
308
+ "F": self.fake_tensor(self.element, (m, n)),
309
+ "F_row_max": self.fake_tensor(DataType.f32, (n,)),
310
+ "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
311
+ }
312
+
313
+ launcher = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
314
+ input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
315
+ result_keys = ["D", "F", "F_row_max", "E_col_max"]
316
+ launcher.verify((m, n, k), input_keys, result_keys, 1)
317
+
318
+ if __name__ == '__main__':
319
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_store_sm80_90.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ ################################################################################
32
+
33
+ """
34
+ Unit test for store nodes in SM80/90
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ from cutlass_cppgen.backend import *
42
+ from cutlass_cppgen.epilogue import *
43
+
44
+ from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
45
+
46
+ cutlass_cppgen.set_log_level(logging.WARNING)
47
+
48
+
49
+ @unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
50
+ class TestEVTStore(EVTTestCaseBase):
51
+
52
+ @unittest.skipIf(device_cc() != 90, "This test is only for CC 90")
53
+ def test_invalid_store(self):
54
+ """
55
+ Test invalid store
56
+ """
57
+ def evt_invalid_store(accum):
58
+ D = accum
59
+ F = D + 1 # D has users, which is not allowed on SM90 or higher
60
+ return D, F
61
+
62
+ for m, n, k, l in self.get_problem_sizes(8):
63
+ example_inputs = {
64
+ "accum": self.fake_tensor(self.element, (l, m, n)),
65
+ "D": self.fake_tensor(self.element, (l, m, n)),
66
+ "F": self.fake_tensor(self.element, (l, m, n))
67
+ }
68
+ with self.assertRaisesRegex(
69
+ RuntimeError,
70
+ r"On SM90 or higher, D is expected to be a output node with 0 users "
71
+ r"to enable smem reuse between C and D, but got 1"
72
+ ):
73
+ launcher = EVTTestBed(self.element, evt_invalid_store, example_inputs)
74
+
75
+ break # Only need to test once
76
+
77
+ def test_aux_store(self):
78
+ """
79
+ Returning a tensor with shape [m, n]
80
+ """
81
+ def evt_aux_store(accum, alpha, C):
82
+ F = alpha * accum
83
+ D = F + C
84
+ return D, F
85
+
86
+ for m, n, k, l in self.get_problem_sizes(8):
87
+ example_inputs = {
88
+ "accum": self.fake_tensor(self.element, (l, m, n)),
89
+ "alpha": 0.5,
90
+ "C": self.fake_tensor(self.element, (l, m, n)),
91
+ "F": self.fake_tensor(self.element, (l, m, n)),
92
+ "D": self.fake_tensor(self.element, (l, m, n)),
93
+ }
94
+
95
+ launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
96
+ input_keys = ["C", "alpha"]
97
+ result_keys = ["D", "F"]
98
+ launcher.verify((m, n, k), input_keys, result_keys, l)
99
+
100
+ def test_col_reduce(self):
101
+ """
102
+ Reduction [m, n] -> [m, 1]
103
+ """
104
+ def evt_row_reduce(accum, alpha, C):
105
+ acc_row_max = max(accum, dim=[2,])
106
+ F = alpha * accum
107
+ F_row_max = max(F, dim=[0, 2])
108
+ D = F + C
109
+ return D, F_row_max, acc_row_max
110
+
111
+ for m, n, k, l in self.get_problem_sizes(8):
112
+ example_inputs = {
113
+ "accum": self.fake_tensor(self.element, (l, m, n)),
114
+ "alpha": 2.0,
115
+ "C": self.fake_tensor(self.element, (l, m, n)),
116
+ "F_row_max": self.fake_tensor(np.float32, (m, 1)),
117
+ "acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
118
+ "D": self.fake_tensor(self.element, (l, m, n)),
119
+ }
120
+
121
+ launcher = EVTTestBed(self.element, evt_row_reduce, example_inputs)
122
+ input_keys = ["C", "alpha"]
123
+ result_keys = ["D", "F_row_max", "acc_row_max"]
124
+ launcher.verify((m, n, k), input_keys, result_keys, l)
125
+
126
+ def test_row_reduce(self):
127
+ """
128
+ Reduction [m, n] -> [n]
129
+ """
130
+ def evt_col_reduce(accum, alpha, C):
131
+ acc_col_max = max(accum, dim=[1,])
132
+ F = alpha * accum
133
+ F_col_max = max(F, dim=[0, 1])
134
+ D = F + C
135
+ return D, F_col_max, acc_col_max
136
+
137
+ for m, n, k, l in self.get_problem_sizes(8):
138
+ example_inputs = {
139
+ "accum": self.fake_tensor(self.element, (l, m, n)),
140
+ "alpha": 2.0,
141
+ "C": self.fake_tensor(self.element, (l, m, n)),
142
+ "F_col_max": self.fake_tensor(np.float32, (n,)),
143
+ "acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
144
+ "D": self.fake_tensor(self.element, (l, m, n)),
145
+ }
146
+
147
+ launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
148
+ input_keys = ["C", "alpha"]
149
+ result_keys = ["D", "F_col_max", "acc_col_max"]
150
+ launcher.verify((m, n, k), input_keys, result_keys, l)
151
+
152
+ def test_scalar_reduce(self):
153
+ """
154
+ Reduction [m, n] -> [1,]
155
+ """
156
+ def evt_scalar_reduce(accum, alpha, C):
157
+ acc_max = max(accum, dim=[1, 2])
158
+ F = alpha * accum
159
+ F_max = max(F, dim=[0, 1, 2])
160
+ D = F + C
161
+ return D, F_max, acc_max
162
+
163
+ for m, n, k, l in self.get_problem_sizes(8):
164
+ example_inputs = {
165
+ "accum": self.fake_tensor(self.element, (l, m, n)),
166
+ "alpha": 2.0,
167
+ "C": self.fake_tensor(self.element, (l, m, n)),
168
+ "acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
169
+ "F_max": self.fake_tensor(np.float32, (1,)),
170
+ "D": self.fake_tensor(self.element, (l, m, n)),
171
+ }
172
+
173
+ launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
174
+ input_keys = ["C", "alpha"]
175
+ result_keys = ["D", "F_max", "acc_max"]
176
+ launcher.verify((m, n, k), input_keys, result_keys, l)
177
+
178
+
179
+ if __name__ == '__main__':
180
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/run_all_tests.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ import pathlib
34
+ import unittest
35
+
36
+
37
+ if __name__ == '__main__':
38
+ loader = unittest.TestLoader()
39
+ script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
40
+ tests = loader.discover(script_dir, 'evt_*.py')
41
+ testRunner = unittest.runner.TextTestRunner()
42
+ results = testRunner.run(tests)
43
+ if not results.wasSuccessful():
44
+ raise Exception('Test cases failed')
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/evt/utils/evt_testbed.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ ################################################################################
32
+
33
+ """
34
+ Testbed classes of EVT
35
+ """
36
+
37
+ import torch
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ from cutlass_cppgen import Tensor
42
+ import cutlass_cppgen.backend.evt
43
+ from cutlass_cppgen.shape import GemmCoord
44
+ from cutlass_cppgen.utils.datatypes import torch_type
45
+ from cutlass_cppgen.utils.profiler import CUDAEventProfiler
46
+
47
+
48
+ class EVTReferenceModule:
49
+ def __init__(self, layout_A, layout_B, layout_C, epilogue_visitor):
50
+ self.layout_A = layout_A
51
+ self.layout_B = layout_B
52
+ self.layout_C = layout_C
53
+ self.epilogue_visitor = epilogue_visitor
54
+
55
+ def run(self, A, B, C, problem_size, alpha, beta, batch=1):
56
+ if self.layout_A == cutlass_cppgen.LayoutType.RowMajor:
57
+ A_row = A.view((batch, problem_size.m, problem_size.k))
58
+ else:
59
+ A_col = A.view((batch, problem_size.k, problem_size.m))
60
+ A_row = torch.permute(A_col, (0, 2, 1))
61
+
62
+ if self.layout_B == cutlass_cppgen.LayoutType.RowMajor:
63
+ B_row = B.view((batch, problem_size.k, problem_size.n))
64
+ else:
65
+ B_col = B.view((batch, problem_size.n, problem_size.k))
66
+ B_row = torch.permute(B_col, (0, 2, 1))
67
+
68
+ if self.layout_C == cutlass_cppgen.LayoutType.RowMajor:
69
+ C_row = C.view((batch, problem_size.m, problem_size.n))
70
+ else:
71
+ C_col = C.view((batch, problem_size.n, problem_size.m))
72
+ C_row = torch.permute(C_col, (0, 2, 1))
73
+
74
+ out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
75
+
76
+ if self.layout_C == cutlass_cppgen.LayoutType.ColumnMajor:
77
+ out = torch.permute(out_row, (0, 2, 1))
78
+ else:
79
+ out = out_row
80
+
81
+ return torch.flatten(out)
82
+
83
+ def __call__(self, A, B, C, problem_size, batch=1, epilogue_args=None):
84
+ # Running the mainloop
85
+ accum = self.run(
86
+ A, B, C, problem_size, 1.0, 0.0, batch=batch
87
+ ).reshape(batch, problem_size.m, problem_size.n)
88
+
89
+ # Running the epilogue
90
+ epilogue_args["accum"] = accum
91
+ references = self.epilogue_visitor(**epilogue_args)
92
+
93
+ # Return the results
94
+ if not isinstance(references, tuple):
95
+ references = (references,)
96
+ return references
97
+
98
+
99
+ class EVTTestBed:
100
+ """
101
+ Epilogue Visitor Testbed
102
+ """
103
+ def __init__(self, element, evt_fn, example_inputs, profile=False, **kwargs) -> None:
104
+ self.element = element
105
+ layout = cutlass_cppgen.LayoutType.RowMajor
106
+ self.example_inputs = example_inputs
107
+
108
+ # Create the Gemm plan
109
+ self.plan = cutlass_cppgen.op.Gemm(element=element, layout=layout, element_accumulator=torch.float32)
110
+
111
+ if "tile_description" in kwargs:
112
+ self.plan.tile_description = kwargs["tile_description"]
113
+
114
+ if "swizzling_functor" in kwargs:
115
+ self.plan.swizzling_functor = kwargs["swizzling_functor"]
116
+
117
+ # Compile the epilogue visitor
118
+ epilogue_visitor = cutlass_cppgen.epilogue.trace(evt_fn, example_inputs)
119
+ if "epilogue_stages" in kwargs:
120
+ epilogue_visitor.epilogue_stages = kwargs["epilogue_stages"]
121
+ self.plan.epilogue_visitor = epilogue_visitor
122
+
123
+ # Reference model
124
+ self.reference_fn = EVTReferenceModule(layout, layout, layout, epilogue_visitor)
125
+
126
+ self.profile = profile
127
+
128
+ def get_torch_tensor(self, shape, dtype=None, fill=None):
129
+ if dtype is None:
130
+ dtype = self.element
131
+
132
+ dtype = torch_type(dtype)
133
+ if fill is None:
134
+ return torch.ceil(
135
+ torch.empty(size=shape, dtype=dtype, device="cuda").uniform_(-4.5, 3.5)
136
+ )
137
+ else:
138
+ return torch.full(shape, fill, dtype=dtype, device="cuda")
139
+
140
+ def verify(self, problem_size, input_keys, result_keys, batch_count=1):
141
+ """
142
+ Verify the results
143
+ """
144
+ problem_size = GemmCoord(*problem_size)
145
+
146
+ # Initiate the GEMM arguments
147
+ tensor_A = self.get_torch_tensor((batch_count, problem_size.m, problem_size.k))
148
+ tensor_B = self.get_torch_tensor((batch_count, problem_size.k, problem_size.n))
149
+
150
+ # Initialize the epilogue args
151
+ epilogue_args = {}
152
+ for key in self.example_inputs.keys():
153
+ if key in input_keys:
154
+ tensor = self.example_inputs[key]
155
+ if isinstance(tensor, Tensor):
156
+ epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element)
157
+ else:
158
+ epilogue_args[key] = tensor
159
+ elif key in result_keys:
160
+ tensor = self.example_inputs[key]
161
+ if isinstance(tensor, Tensor):
162
+ if "max" in key:
163
+ fill = -1000
164
+ else:
165
+ fill = 0
166
+ epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element, fill=fill)
167
+ else:
168
+ epilogue_args[key] = tensor
169
+
170
+ tensor_D = epilogue_args["D"]
171
+ if "C" in epilogue_args:
172
+ tensor_C = epilogue_args["C"]
173
+ else:
174
+ tensor_C = tensor_D
175
+ # Run the device kernel
176
+ self.plan.run(tensor_A, tensor_B, tensor_C, tensor_D, visitor_args=epilogue_args)
177
+
178
+ # Run the host reference
179
+ evt_args_inputs = {}
180
+ for key in input_keys:
181
+ evt_args_inputs[key] = epilogue_args[key]
182
+
183
+ reference_results = self.reference_fn(
184
+ tensor_A, tensor_B, tensor_C, problem_size, batch_count, evt_args_inputs)
185
+
186
+ # Compare the results
187
+ for result, ref in zip(result_keys, reference_results):
188
+ assert torch.equal(
189
+ epilogue_args[result].flatten(),
190
+ ref.masked_fill(torch.isnan(ref), float('inf')).flatten())
191
+
192
+ # Run profile
193
+ if self.profile:
194
+ profiler = CUDAEventProfiler(
195
+ self.plan, 100, 100, tensor_A, tensor_B, tensor_C, tensor_D,
196
+ visitor_args = epilogue_args
197
+ )
198
+ print(f"Cutlass Python Duration: {profiler()}")
199
+
200
+
201
+ class EVTTestCaseBase(unittest.TestCase):
202
+ """
203
+ Base class for EVT Unittest
204
+ """
205
+ def __init__(self, methodName: str = "runTest", lmnk=(6, 512, 256, 128)) -> None:
206
+ super().__init__(methodName)
207
+
208
+ self.element = cutlass_cppgen.DataType.f16
209
+ self.l, self.m, self.n, self.k = lmnk
210
+
211
+ self.problem_size = (self.m, self.n, self.k)
212
+
213
+ torch.random.manual_seed(42)
214
+
215
+ def fake_tensor(self, element, shape, stride=None):
216
+ if stride is None:
217
+ return Tensor(element=element, shape=shape, layout_tag=cutlass_cppgen.LayoutType.RowMajor)
218
+ else:
219
+ return Tensor(element=element, shape=shape, stride=stride)
220
+
221
+ def get_problem_sizes(self, alignment, k=None, batch_count=[3,]):
222
+ k = k if k else self.k
223
+ problem_size_m = [alignment, 512 - 3 * alignment]
224
+ problem_size_n = [alignment, 512 - alignment]
225
+ if alignment % 8 == 0:
226
+ problem_size_m.append(768)
227
+ problem_size_n.append(768)
228
+ problem_size_l = batch_count
229
+ problem_sizes = []
230
+ for m in problem_size_m:
231
+ for n in problem_size_n:
232
+ for l in problem_size_l:
233
+ problem_sizes.append((m, n, k, l))
234
+
235
+ return problem_sizes
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_batched.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ High-level tests for running batched GEMMs
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ from math import prod
40
+ import unittest
41
+
42
+ import cutlass_cppgen
43
+ from cutlass_cppgen.backend.utils.device import device_cc
44
+ import torch
45
+
46
+ from utils import LayoutCombination
47
+
48
+ cutlass_cppgen.set_log_level(logging.WARNING)
49
+
50
+ torch.manual_seed(2023)
51
+
52
+
53
+ def pytorch_reference(A, B, C, alpha, beta):
54
+ # Get the batch count. Assume that any of A, B, and C
55
+ # with a batch dimension have matching batch count. Thus,
56
+ # we break out of the loop once we have found the first
57
+ # tensor containing a batch dimension.
58
+ batch_count = (1,)
59
+ for tensor in [A, B, C]:
60
+ if len(tensor.shape) > 2:
61
+ batch_count = tensor.shape[:-2]
62
+ break
63
+
64
+ int_batch_count = prod(batch_count)
65
+
66
+ def add_batch(tensor):
67
+ if len(tensor.shape) == 2:
68
+ return tensor.unsqueeze(0).repeat(int_batch_count, 1, 1)
69
+ else:
70
+ return tensor.reshape(-1, tensor.size(-2), tensor.size(-1))
71
+
72
+ # Reshape tensors to have batch dimension
73
+ A = add_batch(A)
74
+ B = add_batch(B)
75
+ C = add_batch(C)
76
+
77
+ ret = (torch.bmm(A, B) * alpha) + (C * beta)
78
+ reshape_vals = batch_count + C.shape[-2:]
79
+ return ret.reshape(*reshape_vals)
80
+
81
+
82
+ def initialize(rows, cols, batch):
83
+ tensor = torch.randint(-3, 3, size=(rows*cols*prod(batch),), device='cuda').half()
84
+ if len(batch) > 0 and prod(batch) > 1:
85
+ reshape_vals = batch + (rows, cols)
86
+ return tensor.reshape(*reshape_vals)
87
+ else:
88
+ return tensor.reshape(rows, cols)
89
+
90
+
91
+ class GemmF16Batched(unittest.TestCase):
92
+ def run_batched(self, batch_count: tuple, batch_A: bool, batch_B: bool, batch_C: bool):
93
+ M = 512
94
+ N = 256
95
+ K = 128
96
+ alpha = 1.
97
+ beta = 2.
98
+
99
+ A = initialize(M, K, batch_count if batch_A else (1,))
100
+ B = initialize(K, N, batch_count if batch_B else (1,))
101
+ C = initialize(M, N, batch_count if batch_C else (1,))
102
+ D = initialize(M, N, batch_count)
103
+
104
+ plan = cutlass_cppgen.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=cutlass_cppgen.DataType.f32)
105
+ plan.run(A, B, C, D, alpha, beta)
106
+ reference = pytorch_reference(A, B, C, alpha, beta)
107
+ assert reference.equal(D)
108
+
109
+ def test_batched_ABC(self):
110
+ self.run_batched((3,), True, True, True)
111
+ self.run_batched((2, 3), True, True, True)
112
+
113
+ def test_batched_AB(self):
114
+ self.run_batched((3,), True, True, False)
115
+ self.run_batched((2, 3), True, True, False)
116
+
117
+ def test_batched_AC(self):
118
+ self.run_batched((3,), True, False, True)
119
+ self.run_batched((2, 3), True, False, True)
120
+
121
+ def test_batched_BC(self):
122
+ self.run_batched((3,), False, True, True)
123
+ self.run_batched((2, 3), False, True, True)
124
+
125
+ def test_batched_A(self):
126
+ self.run_batched((3,), True, False, False)
127
+ self.run_batched((2, 3), True, False, False)
128
+
129
+ def test_batched_B(self):
130
+ self.run_batched((3,), False, True, False)
131
+ self.run_batched((2, 3), False, True, False)
132
+
133
+ if __name__ == '__main__':
134
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f16_sm80.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with F16 operands on SM80
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 80
49
+ dtype = cutlass_cppgen.DataType.f16
50
+
51
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
52
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
53
+ class GemmF16Sm80(unittest.TestCase):
54
+ """
55
+ Wrapper class to which tests will be added dynamically in __main__
56
+ """
57
+ pass
58
+
59
+
60
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
61
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
62
+ class GemmF16Sm80StreamK(unittest.TestCase):
63
+ """
64
+ Wrapper class to which tests will be added dynamically in __main__
65
+ """
66
+ pass
67
+
68
+ add_test_specialized = partial(add_test_gemm, element=dtype, cc=cc, cluster_shape=[1, 1, 1])
69
+
70
+ # Tests using TensorOp
71
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
72
+
73
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
74
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
75
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
76
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
77
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
78
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
79
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
80
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
81
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
82
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
83
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
84
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
85
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
86
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
87
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
88
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
89
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
90
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
91
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
92
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 64, 32], warp_count=[2, 1, 1], stages=3)
93
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
94
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=3)
95
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
96
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
97
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
98
+ element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
99
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
100
+ element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
101
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
102
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)
103
+ add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
104
+ element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
105
+
106
+ # Tests using SIMT
107
+ add_test_simt = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.Simt)
108
+
109
+ add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
110
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
111
+ add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
112
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
113
+ add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
114
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
115
+ add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
116
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
117
+ add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
118
+ element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
119
+
120
+ # Stream K tests
121
+ add_test_streamk = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp, swizzle=cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK)
122
+ add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
123
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
124
+ add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
125
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)
126
+
127
+ if __name__ == '__main__':
128
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f16_sm90.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with F16 operands on SM90
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 90
49
+ dtype = cutlass_cppgen.DataType.f16
50
+
51
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
52
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
53
+ class GemmF16Sm90(unittest.TestCase):
54
+ """
55
+ Wrapper class to which tests will be added dynamically in __main__
56
+ """
57
+ pass
58
+
59
+
60
+ add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=dtype,
61
+ warp_count=None, compilation_modes=['nvcc'])
62
+
63
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
64
+
65
+ # Tests with 1x1x1 clusters
66
+ add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])
67
+ add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
68
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], stages=3)
69
+ add_test_unit_cluster(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
70
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
71
+ add_test_unit_cluster(layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
72
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
73
+ add_test_unit_cluster(layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
74
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
75
+ add_test_unit_cluster(layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
76
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
77
+ add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass_cppgen.DataType.f16,
78
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
79
+ add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass_cppgen.DataType.f16,
80
+ element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
81
+ add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
82
+ element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
83
+ add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
84
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 64, 64], stages=5)
85
+ add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass_cppgen.DataType.f16,
86
+ element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
87
+
88
+ # Tests with different cluster shapes
89
+ add_test_cluster_shape = partial(add_test_tensorop, threadblock_shape=[64, 128, 64], stages=None)
90
+ add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
91
+ element_accumulator=cutlass_cppgen.DataType.f16, cluster_shape=[2, 2, 1])
92
+ add_test_cluster_shape(layouts=LayoutCombination.TNN, alignments=[8, 8, 4], element_output=cutlass_cppgen.DataType.f32,
93
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[2, 2, 1])
94
+ add_test_cluster_shape(layouts=LayoutCombination.NTN, alignments=[8, 8, 4], element_output=cutlass_cppgen.DataType.f32,
95
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[2, 2, 1])
96
+ add_test_cluster_shape(layouts=LayoutCombination.NNN, alignments=[8, 8, 4], element_output=cutlass_cppgen.DataType.f32,
97
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[2, 2, 1])
98
+ add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass_cppgen.DataType.f32,
99
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[1, 4, 1])
100
+ add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass_cppgen.DataType.f32,
101
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[2, 4, 1])
102
+ add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass_cppgen.DataType.f32,
103
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[4, 1, 1])
104
+ add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass_cppgen.DataType.f32,
105
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[4, 2, 1])
106
+
107
+ # Tests for different schedule modes
108
+ add_test_schedule = partial(add_test_specialized, layouts=LayoutCombination.TTN, alignments=[8, 8, 4],
109
+ element_output=cutlass_cppgen.DataType.f32, element_accumulator=cutlass_cppgen.DataType.f32,
110
+ opclass=cutlass_cppgen.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64], stages=None)
111
+ add_test_schedule(
112
+ cluster_shape=[1, 1, 1],
113
+ kernel_schedule=cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong,
114
+ epilogue_schedule=cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized
115
+ )
116
+ add_test_schedule(
117
+ cluster_shape=[1, 1, 1],
118
+ kernel_schedule=cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative,
119
+ epilogue_schedule=cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative
120
+ )
121
+ add_test_schedule(
122
+ cluster_shape=[2, 1, 1],
123
+ kernel_schedule=cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong,
124
+ epilogue_schedule=cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized
125
+ )
126
+ add_test_schedule(
127
+ cluster_shape=[2, 1, 1],
128
+ kernel_schedule=cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative,
129
+ epilogue_schedule=cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative
130
+ )
131
+
132
+ # Tests using SIMT
133
+ add_test_simt = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.Simt, alignments=[1, 1, 1], cluster_shape=[1, 1, 1], stages=2)
134
+ add_test_simt(layouts=LayoutCombination.NNN, element_output=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 8])
135
+ add_test_simt(layouts=LayoutCombination.TNN, element_output=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 128, 8])
136
+ add_test_simt(layouts=LayoutCombination.NTN, element_output=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 64, 8])
137
+ add_test_simt(layouts=LayoutCombination.TTN, element_output=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[ 64, 64, 8])
138
+ add_test_simt(layouts=LayoutCombination.NNT, element_output=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f16, threadblock_shape=[128, 128, 8])
139
+
140
+ # Tests with void-C kernels
141
+ add_test_cluster_shape(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass_cppgen.DataType.f16,
142
+ element_accumulator=cutlass_cppgen.DataType.f32, threadblock_shape=[128, 128, 32], stages=None,
143
+ cluster_shape=[2, 1, 1], element_C=cutlass_cppgen.DataType.void)
144
+
145
+ if __name__ == '__main__':
146
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f32_sm80.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with F32 operands on SM80
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 80
49
+ dtype = cutlass_cppgen.DataType.f32
50
+
51
+
52
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
53
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
54
+ class GemmF32Sm80(unittest.TestCase):
55
+ """
56
+ Wrapper class to which tests will be added dynamically in __main__
57
+ """
58
+ pass
59
+
60
+
61
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
62
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
63
+ class GemmF32Sm80StreamK(unittest.TestCase):
64
+ """
65
+ Wrapper class to which tests will be added dynamically in __main__
66
+ """
67
+ pass
68
+
69
+
70
+ add_test_specialized = partial(add_test_gemm, element=dtype, cc=cc, cluster_shape=[1, 1, 1])
71
+
72
+ # Tests using TensorOp
73
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
74
+
75
+ add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[4, 4, 4], element_output=dtype, element_C=dtype,
76
+ element_accumulator=dtype, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
77
+ add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[4, 4, 4], element_output=dtype, element_C=dtype,
78
+ element_accumulator=dtype, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
79
+ add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=dtype, element_C=dtype,
80
+ element_accumulator=dtype, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
81
+ add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=dtype, element_C=dtype,
82
+ element_accumulator=dtype, threadblock_shape=[ 64, 64, 32], warp_count=[1, 1, 1], stages=4)
83
+ # Tests using SIMT
84
+ add_test_simt = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.Simt)
85
+
86
+ add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
87
+ element_accumulator=dtype, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
88
+ add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
89
+ element_accumulator=dtype, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
90
+ add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
91
+ element_accumulator=dtype, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
92
+ add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
93
+ element_accumulator=dtype, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
94
+ add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
95
+ element_accumulator=dtype, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
96
+
97
+ # Stream K tests
98
+ add_test_streamk = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp, swizzle=cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK)
99
+ add_test_streamk(cls=GemmF32Sm80StreamK, layouts=LayoutCombination.TTN, alignments=[4, 4, 4], element_output=dtype, element_C=dtype,
100
+ element_accumulator=dtype, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
101
+
102
+
103
+ if __name__ == '__main__':
104
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f64_sm80.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with F64 operands on SM80
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 80
49
+ dtype = cutlass_cppgen.DataType.f64
50
+
51
+
52
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
53
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
54
+ class GemmF64Sm80(unittest.TestCase):
55
+ """
56
+ Wrapper class to which tests will be added dynamically in __main__
57
+ """
58
+ pass
59
+
60
+
61
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
62
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
63
+ class GemmF64Sm80StreamK(unittest.TestCase):
64
+ """
65
+ Wrapper class to which tests will be added dynamically in __main__
66
+ """
67
+ pass
68
+
69
+
70
+ add_test_specialized = partial(add_test_gemm, element=dtype, cc=cc, cluster_shape=[1, 1, 1])
71
+
72
+ # Tests using TensorOp
73
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
74
+
75
+ add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
76
+ element_accumulator=dtype, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
77
+ add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
78
+ element_accumulator=dtype, threadblock_shape=[ 64, 64, 16], warp_count=[2, 2, 1], stages=4)
79
+ add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
80
+ element_accumulator=dtype, threadblock_shape=[ 32, 32, 16], warp_count=[2, 1, 1], stages=5)
81
+
82
+ # Tests using SIMT
83
+ add_test_simt = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.Simt)
84
+
85
+ add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
86
+ element_accumulator=dtype, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
87
+ add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
88
+ element_accumulator=dtype, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
89
+ add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
90
+ element_accumulator=dtype, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
91
+ add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
92
+ element_accumulator=dtype, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
93
+ add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
94
+ element_accumulator=dtype, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
95
+
96
+ # Stream K tests
97
+ add_test_streamk = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp, swizzle=cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK)
98
+ add_test_streamk(cls=GemmF64Sm80StreamK, layouts=LayoutCombination.NTT, alignments=[1, 1, 1], element_output=dtype, element_C=dtype,
99
+ element_accumulator=dtype, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
100
+
101
+
102
+ if __name__ == '__main__':
103
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f64_sm90.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with F64 operands on SM90
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 90
49
+ dtype = cutlass_cppgen.DataType.f64
50
+
51
+
52
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
53
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
54
+ class GemmF64Sm90(unittest.TestCase):
55
+ """
56
+ Wrapper class to which tests will be added dynamically in __main__
57
+ """
58
+ pass
59
+
60
+
61
+ add_test_specialized = partial(add_test_gemm, cls=GemmF64Sm90, alignments=[1, 1, 1], cluster_shape=[1, 1, 1],
62
+ element=dtype, element_output=dtype, element_accumulator=dtype, compilation_modes=['nvcc'])
63
+
64
+ add_test_specialized(opclass=cutlass_cppgen.OpcodeClass.TensorOp, layouts=LayoutCombination.NNT, threadblock_shape=[128, 128, 32], stages=3)
65
+ add_test_specialized(opclass=cutlass_cppgen.OpcodeClass.TensorOp, layouts=LayoutCombination.TNN, threadblock_shape=[128, 128, 32], stages=3)
66
+ add_test_specialized( opclass=cutlass_cppgen.OpcodeClass.Simt, layouts=LayoutCombination.NNN, threadblock_shape=[128, 128, 8], stages=2)
67
+ add_test_specialized( opclass=cutlass_cppgen.OpcodeClass.Simt, layouts=LayoutCombination.TTT, threadblock_shape=[ 64, 128, 8], stages=2)
68
+
69
+
70
+ if __name__ == '__main__':
71
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_f8_sm90.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with S8 operands on SM90
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 90
49
+ dtype = cutlass_cppgen.DataType.e4m3
50
+
51
+
52
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
53
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
54
+ class GemmF8E4M3Sm90(unittest.TestCase):
55
+ """
56
+ Wrapper class to which tests will be added dynamically in __main__
57
+ """
58
+ pass
59
+
60
+
61
+ add_test_specialized = partial(add_test_gemm, cls=GemmF8E4M3Sm90, element=dtype, compilation_modes=['nvcc'])
62
+
63
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
64
+
65
+ # Test with 1x1x1 clusters
66
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.e4m3,
67
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
68
+
69
+ # Tests with different cluster shapes
70
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.e4m3,
71
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
72
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.e4m3,
73
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)
74
+
75
+ # Tests with warp-specialized ping-pong schedule
76
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.e4m3,
77
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
78
+ kernel_schedule=cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong,
79
+ epilogue_schedule=cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized)
80
+
81
+ # Tests for SIMT
82
+ add_test_simt = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.Simt)
83
+ add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.e4m3,
84
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)
85
+
86
+
87
+ #
88
+ # Add a test for E5M2
89
+ #
90
+ dtype = cutlass_cppgen.DataType.e5m2
91
+
92
+
93
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
94
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
95
+ class GemmF8E5M2Sm90(unittest.TestCase):
96
+ """
97
+ Wrapper class to which tests will be added dynamically in __main__
98
+ """
99
+ pass
100
+
101
+
102
+ add_test_specialized = partial(add_test_gemm, cls=GemmF8E5M2Sm90, element=dtype, compilation_modes=['nvcc'])
103
+
104
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
105
+
106
+ # Tests with 1x1x1 clusters
107
+ add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=dtype,
108
+ element_accumulator=cutlass_cppgen.DataType.f32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
109
+
110
+
111
+ if __name__ == '__main__':
112
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_mixed_sm80.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with mixed operands on SM80
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 80
49
+ dtype =cutlass_cppgen.DataType.f16
50
+
51
+
52
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
53
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
54
+ class GemmMixedSm80(unittest.TestCase):
55
+ """
56
+ Wrapper class to which tests will be added dynamically in __main__
57
+ """
58
+ pass
59
+
60
+
61
+ add_test_mixed = partial(add_test_gemm, cls=GemmMixedSm80, element=dtype, cc=cc, cluster_shape=[1, 1, 1],
62
+ opclass=cutlass_cppgen.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64],
63
+ warp_count=[2, 2, 1], stages=3, element_accumulator=cutlass_cppgen.DataType.f32)
64
+
65
+ # Test with upcast on A
66
+ add_test_mixed(element_A=cutlass_cppgen.DataType.s8, alignments=[16, 8, 8], layouts=LayoutCombination.TNT)
67
+ add_test_mixed(element_A=cutlass_cppgen.DataType.s8, alignments=[16, 8, 8], layouts=LayoutCombination.TNN)
68
+
69
+ # Test with upcast on B
70
+ add_test_mixed(element_B=cutlass_cppgen.DataType.s8, alignments=[8, 16, 8], layouts=LayoutCombination.TNT)
71
+ add_test_mixed(element_B=cutlass_cppgen.DataType.s8, alignments=[8, 16, 8], layouts=LayoutCombination.TNN)
72
+
73
+
74
+ if __name__ == '__main__':
75
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_s8_sm80.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with S8 operands on SM80
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 80
49
+ dtype = cutlass_cppgen.DataType.s8
50
+
51
+
52
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
53
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
54
+ class GemmS8Sm80(unittest.TestCase):
55
+ """
56
+ Wrapper class to which tests will be added dynamically in __main__
57
+ """
58
+ pass
59
+
60
+
61
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
62
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
63
+ class GemmS8Sm80StreamK(unittest.TestCase):
64
+ """
65
+ Wrapper class to which tests will be added dynamically in __main__
66
+ """
67
+ pass
68
+
69
+
70
+ add_test_specialized = partial(add_test_gemm, element=dtype, cc=cc, cluster_shape=[1, 1, 1])
71
+
72
+ # Tests using TensorOp
73
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
74
+
75
+ add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8, element_C=cutlass_cppgen.DataType.s8,
76
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1], stages=3)
77
+ add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8, element_C=cutlass_cppgen.DataType.s8,
78
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
79
+ add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 4], element_output=cutlass_cppgen.DataType.s32, element_C=cutlass_cppgen.DataType.s32,
80
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=4)
81
+
82
+ # Tests using SIMT
83
+ add_test_simt = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.Simt)
84
+
85
+ add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.s8, element_C=cutlass_cppgen.DataType.s8,
86
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
87
+ add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.s8, element_C=cutlass_cppgen.DataType.s8,
88
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
89
+ add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.s8, element_C=cutlass_cppgen.DataType.s8,
90
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
91
+ add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.s32, element_C=cutlass_cppgen.DataType.s32,
92
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
93
+ add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.s32, element_C=cutlass_cppgen.DataType.s32,
94
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
95
+
96
+ # Stream K tests
97
+ add_test_streamk = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp, swizzle=cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK)
98
+ add_test_streamk(cls=GemmS8Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8, element_C=cutlass_cppgen.DataType.s8,
99
+ element_accumulator=cutlass_cppgen.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
100
+
101
+
102
+ if __name__ == '__main__':
103
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_s8_sm90.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Low-level functionality tests for GEMM with S8 operands on SM90
35
+ """
36
+
37
+ from functools import partial
38
+ import logging
39
+ import unittest
40
+
41
+ import cutlass_cppgen
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+
44
+ from utils import LayoutCombination, add_test_gemm
45
+
46
+
47
+ cutlass_cppgen.set_log_level(logging.WARNING)
48
+ cc = 90
49
+ dtype = cutlass_cppgen.DataType.s8
50
+
51
+
52
+ @unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
53
+ @unittest.skipIf(cutlass_cppgen.utils.datatypes.torch_type(dtype) is None, f'Version of torch installed does not contain a datatype match for {dtype}')
54
+ class GemmS8Sm90(unittest.TestCase):
55
+ """
56
+ Wrapper class to which tests will be added dynamically in __main__
57
+ """
58
+ pass
59
+
60
+
61
+ add_test_specialized = partial(add_test_gemm, cls=GemmS8Sm90, element=dtype, compilation_modes=['nvcc'])
62
+
63
+ add_test_tensorop = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.TensorOp)
64
+
65
+ # Tests with 1x1x1 clusters
66
+ add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8,
67
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
68
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8,
69
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
70
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 8], element_output=cutlass_cppgen.DataType.s8,
71
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
72
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8,
73
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 128, 128], stages=None)
74
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8,
75
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 64, 32], stages=None)
76
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[ 4, 4, 16], element_output=cutlass_cppgen.DataType.s8,
77
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
78
+
79
+ # Tests with different cluster shapes
80
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8,
81
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
82
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8,
83
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)
84
+
85
+ # Tests with warp-specialized ping-pong schedule
86
+ add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass_cppgen.DataType.s8,
87
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
88
+ kernel_schedule=cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong,
89
+ epilogue_schedule=cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized)
90
+
91
+ # Tests for SIMT
92
+ add_test_simt = partial(add_test_specialized, opclass=cutlass_cppgen.OpcodeClass.Simt)
93
+ add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass_cppgen.DataType.s8,
94
+ element_accumulator=cutlass_cppgen.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)
95
+
96
+
97
+ if __name__ == '__main__':
98
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/gemm_testbed.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ from math import prod
34
+ import os
35
+ import re
36
+ import subprocess
37
+
38
+ import torch
39
+
40
+ from cutlass_library import (
41
+ DataType,
42
+ DataTypeSize,
43
+ GemmUniversalMode,
44
+ LayoutType,
45
+ OpcodeClass,
46
+ ShortDataTypeNames,
47
+ SwizzlingFunctor
48
+ )
49
+
50
+ from cutlass_cppgen.backend import compiler
51
+ from cutlass_cppgen.backend.gemm_operation import GemmArguments, GemmOperationUniversal
52
+ from cutlass_cppgen.backend.reduction_operation import ReductionArguments, ReductionOperation
53
+ from cutlass_cppgen.shape import GemmCoord, MatrixCoord
54
+ from cutlass_cppgen.utils.datatypes import torch_type
55
+
56
+
57
+ class GemmUniversalLauncher:
58
+ def __init__(
59
+ self,
60
+ operation,
61
+ seed=2080,
62
+ verification=True,
63
+ iterations=500,
64
+ compiler_mode= "nvcc",
65
+ **kwargs,
66
+ ) -> None:
67
+ self.math_operation = operation.tile_description.math_instruction.math_operation
68
+ self.verification = verification
69
+
70
+ if compiler_mode == "nvcc":
71
+ compiler.nvcc()
72
+ elif compiler_mode == "nvrtc":
73
+ compiler.nvrtc()
74
+ else:
75
+ raise Exception(f"Unexpected compiler string {compiler_mode}")
76
+
77
+ op_list = [operation]
78
+ if operation.arch < 90:
79
+ # Split K via Python is currently only supported for pre-SM90 kernels
80
+ self.reduction_operation: ReductionOperation = ReductionOperation(
81
+ shape=MatrixCoord(4, 32 * operation.C.alignment),
82
+ C=operation.C,
83
+ element_accumulator=operation.tile_description.math_instruction.element_accumulator,
84
+ element_compute=operation.epilogue_functor.element_epilogue,
85
+ epilogue_functor=operation.epilogue_functor,
86
+ count=operation.C.alignment,
87
+ )
88
+ op_list.append(self.reduction_operation)
89
+
90
+ compiler.add_module(op_list, bypass_cache=False)
91
+
92
+ self.operation = operation
93
+
94
+ self.dtype_A = torch_type(operation.A.element if not self.operation.switched else self.operation.B.element)
95
+ self.dtype_B = torch_type(operation.B.element if not self.operation.switched else self.operation.A.element)
96
+ self.dtype_C = torch_type(operation.C.element)
97
+ self.dtype_D = torch_type(operation.epilogue_functor.element_output)
98
+
99
+ element_size = min(DataTypeSize[operation.A.element], DataTypeSize[operation.B.element])
100
+
101
+ if element_size == 1:
102
+ self.rand_max = 1
103
+ self.rand_min = 0
104
+ elif element_size <= 8:
105
+ self.rand_max = 1
106
+ self.rand_min = -1
107
+ elif element_size == 16:
108
+ self.rand_max = 4
109
+ self.rand_min = -4
110
+ else:
111
+ self.rand_max = 8
112
+ self.rand_min = -8
113
+
114
+ self.seed = seed
115
+
116
+ self.compute_type = operation.epilogue_functor.element_epilogue
117
+ self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
118
+
119
+ def print_problem_size(self, p, mode, batch_count):
120
+ if mode == GemmUniversalMode.Gemm:
121
+ mode = "Gemm"
122
+ elif mode == GemmUniversalMode.Batched:
123
+ mode = "GemmBatched"
124
+ elif mode == GemmUniversalMode.GemmSplitKParallel:
125
+ mode = "GemmSplitKParallel"
126
+ print(f"problem: {p.m}, {p.n}, {p.k}\n batch_count: {batch_count}\n mode: {mode}")
127
+
128
+ def uniform_init(self, shape, dtype, layout):
129
+ size = prod(shape)
130
+ if dtype.is_floating_point:
131
+ # Initialize data in FP32 and call convert to the data type we desire.
132
+ # This is a workaround for the following error that occurs when attempting to
133
+ # call uniform_ on a tensor with torch.float8_e4m3fn data:
134
+ # RuntimeError: "check_uniform_bounds" not implemented for 'Float8_e4m3fn'
135
+ data = torch.ceil(
136
+ torch.empty(size=(size,), dtype=torch.float32, device="cuda").uniform_(
137
+ self.rand_min - 0.5, self.rand_max - 0.5)
138
+ ).to(dtype)
139
+ else:
140
+ # PyTorch does not currently support integer-typed matrix multiplications on GPU.
141
+ # Fall back to CPU for integer type references.
142
+ data = torch.empty(size=(size,), dtype=dtype, device="cpu").random_(self.rand_min, self.rand_max + 1)
143
+
144
+ is_fp8 = dtype == getattr(torch, "float8_e4m3fn", -1) or dtype == dtype == getattr(torch, "float8_e5m2", -1)
145
+
146
+ if dtype == torch.float64 or dtype == torch.float32 or is_fp8:
147
+ data = data.to("cpu")
148
+
149
+ data_ref = data.reshape(shape)
150
+
151
+ if layout == LayoutType.RowMajor:
152
+ data_cutlass = data_ref
153
+ else:
154
+ data_cutlass = data_ref.transpose(-1, -2).contiguous()
155
+
156
+ data_cutlass = data_cutlass.to("cuda")
157
+
158
+ # As of this writing, few operations in PyTorch are supported with FP8 data.
159
+ # Thus, we perform computation in FP32 for FP8 reference checks.
160
+ if is_fp8:
161
+ data_ref = data_ref.to(torch.float32)
162
+
163
+ return data_cutlass, data_ref
164
+
165
+ def reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
166
+ # If any tensor is on CPU, place all tensors on CPU unless only
167
+ # tensor C is on CPU
168
+ # Handle mixed-input cases by casting to the larger data type and overriding
169
+ # to whatever the data type of the larger type is
170
+ if self.dtype_A != self.dtype_B:
171
+ if DataTypeSize[self.operation.A.element] < DataTypeSize[self.operation.B.element]:
172
+ tensor_A = tensor_A.to(self.dtype_B).to(tensor_B.device)
173
+ else:
174
+ tensor_B = tensor_B.to(self.dtype_A).to(tensor_A.device)
175
+
176
+ devices = [x.device.type for x in [tensor_A, tensor_B]]
177
+ if tensor_C is not None:
178
+ devices.append(tensor_C.device.type)
179
+
180
+ if "cpu" in devices and devices != ["cuda", "cuda", "cpu"]:
181
+ device = torch.device("cpu")
182
+ else:
183
+ device = tensor_A.device
184
+
185
+ tensor_A = tensor_A.to(device)
186
+ tensor_B = tensor_B.to(device)
187
+ if tensor_C is not None:
188
+ tensor_C = tensor_C.to(device)
189
+
190
+ dtype = torch_type(self.compute_type)
191
+ alpha_torch = torch.tensor([alpha], device=device).to(dtype)
192
+ beta_torch = torch.tensor([beta], device=device).to(dtype)
193
+
194
+ tmp = tensor_A @ tensor_B
195
+ tensor_D_ref = (alpha_torch * tmp)
196
+ if tensor_C is not None:
197
+ tensor_D_ref += (tensor_C * beta_torch)
198
+ return tensor_D_ref.to(self.dtype_D)
199
+
200
+ def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
201
+ torch.random.manual_seed(self.seed)
202
+
203
+ # Assign an actual batch count in cases where we are not running in batched mode.
204
+ # This is to differentiate between the number of split K slices and the batch count,
205
+ # which are overloaded within the single `batch_count` variable.
206
+ if mode == GemmUniversalMode.Batched:
207
+ true_batch_count = batch_count
208
+ else:
209
+ true_batch_count = 1
210
+
211
+ def transpose(layout):
212
+ if layout == LayoutType.RowMajor:
213
+ return LayoutType.ColumnMajor
214
+ else:
215
+ return LayoutType.RowMajor
216
+
217
+ tensor_A, tensor_A_ref = self.uniform_init(
218
+ (true_batch_count, problem_size.m, problem_size.k),
219
+ self.dtype_A,
220
+ self.operation.A.layout if not self.operation.switched else transpose(self.operation.B.layout),
221
+ )
222
+ tensor_B, tensor_B_ref = self.uniform_init(
223
+ (true_batch_count, problem_size.k, problem_size.n),
224
+ self.dtype_B,
225
+ self.operation.B.layout if not self.operation.switched else transpose(self.operation.A.layout),
226
+ )
227
+ if self.dtype_C is not None:
228
+ tensor_C, tensor_C_ref = self.uniform_init(
229
+ (true_batch_count, problem_size.m, problem_size.n),
230
+ self.dtype_C,
231
+ self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
232
+ )
233
+ else:
234
+ tensor_C = None
235
+ tensor_C_ref = None
236
+
237
+ tensor_D, _ = self.uniform_init(
238
+ (true_batch_count, problem_size.m, problem_size.n),
239
+ self.dtype_D,
240
+ self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
241
+ )
242
+ tensor_D = torch.zeros_like(tensor_D)
243
+
244
+ if self.compute_type in [DataType.s8, DataType.s32, DataType.u8, DataType.u32]:
245
+ alpha = int(alpha)
246
+ beta = int(beta)
247
+
248
+ #
249
+ # Launch kernel
250
+ #
251
+
252
+ arguments = GemmArguments(
253
+ operation=self.operation,
254
+ problem_size=problem_size,
255
+ A=tensor_A,
256
+ B=tensor_B,
257
+ C=tensor_C,
258
+ D=tensor_D,
259
+ output_op=self.operation.epilogue_type(alpha, beta),
260
+ gemm_mode=mode,
261
+ split_k_slices=split_k_slices,
262
+ batch=batch_count,
263
+ )
264
+
265
+ if mode == GemmUniversalMode.GemmSplitKParallel:
266
+ reduction_arguments = ReductionArguments(
267
+ self.reduction_operation,
268
+ problem_size=[problem_size.m, problem_size.n],
269
+ partitions=split_k_slices,
270
+ workspace=arguments.ptr_D,
271
+ destination=tensor_D,
272
+ source=tensor_C,
273
+ output_op=self.reduction_operation.epilogue_type(alpha, beta),
274
+ )
275
+
276
+ self.operation.run(arguments)
277
+
278
+ if mode == GemmUniversalMode.GemmSplitKParallel:
279
+ self.reduction_operation.run(reduction_arguments)
280
+
281
+ passed = True
282
+
283
+ if self.verification:
284
+ if mode == GemmUniversalMode.GemmSplitKParallel:
285
+ reduction_arguments.sync()
286
+
287
+ # Free memory allocated by args because we are not
288
+ # calling `arguments.sync()` in this case (which will free memory)
289
+ arguments.free()
290
+ else:
291
+ arguments.sync()
292
+ tensor_D_ref = self.reference(
293
+ problem_size,
294
+ tensor_A_ref,
295
+ tensor_B_ref,
296
+ tensor_C_ref,
297
+ alpha,
298
+ beta,
299
+ )
300
+
301
+ tensor_D_ref = tensor_D_ref.to('cuda')
302
+
303
+ if self.operation.switched or self.operation.C.layout == LayoutType.ColumnMajor:
304
+ tensor_D = tensor_D.transpose(-1, -2).contiguous()
305
+
306
+ passed = tensor_D.equal(tensor_D_ref)
307
+
308
+ try:
309
+ assert passed
310
+ except AssertionError:
311
+ self.print_problem_size(problem_size, mode, batch_count)
312
+ del arguments
313
+ if mode == GemmUniversalMode.GemmSplitKParallel:
314
+ del reduction_arguments
315
+
316
+ return passed
317
+
318
+
319
+ def test_all_gemm(operation: "GemmOperationUniversal", testcase="universal", compilation_mode="nvcc"):
320
+ passed = True
321
+
322
+ minimum_operand_element_size = min(
323
+ DataTypeSize[operation.A.element], DataTypeSize[operation.B.element]
324
+ )
325
+ opcode_class = operation.tile_description.math_instruction.opcode_class
326
+
327
+ if opcode_class == OpcodeClass.Simt:
328
+ alignment = 1
329
+ else:
330
+ alignment = 128 // minimum_operand_element_size
331
+
332
+ alignment_m = alignment
333
+ alignment_n = alignment
334
+ alignment_k = alignment
335
+
336
+ # INT8 alignment constraints
337
+ if opcode_class == OpcodeClass.Simt:
338
+ A_is_s8 = operation.A.element == DataType.s8
339
+ B_is_s8 = operation.B.element == DataType.s8
340
+
341
+ if A_is_s8 and operation.A.layout == LayoutType.ColumnMajor:
342
+ alignment_m = 4
343
+ if B_is_s8 == DataType.s8 and operation.A.layout == LayoutType.RowMajor:
344
+ alignment_n = 4
345
+ if A_is_s8 and B_is_s8 and (operation.A.layout == LayoutType.RowMajor or operation.B.layout == LayoutType.ColumnMajor):
346
+ alignment_k = 4
347
+
348
+ threadblock_k = operation.tile_description.threadblock_shape[2]
349
+
350
+ assert testcase != "interleaved"
351
+
352
+ supports_split_k = operation.arch < 90 and not operation.swizzling_functor == SwizzlingFunctor.StreamK
353
+
354
+ if testcase == "multistage":
355
+ modes = [GemmUniversalMode.Gemm]
356
+ problem_size_m = [16, 528]
357
+ problem_size_n = [16, 528]
358
+ problem_size_k = [
359
+ threadblock_k,
360
+ threadblock_k * operation.tile_description.stages
361
+ + operation.tile_description.math_instruction.instruction_shape[2],
362
+ ]
363
+ problem_alpha = [1.0]
364
+ problem_beta = [0.0]
365
+ batch_counts = [1]
366
+ else:
367
+ modes = [GemmUniversalMode.Gemm]
368
+ batch_counts = [1, 2, 3, 5, 7]
369
+ if supports_split_k:
370
+ modes.append(GemmUniversalMode.GemmSplitKParallel)
371
+
372
+ problem_size_m = [alignment_m, 512 - 3 * alignment_m]
373
+ problem_size_n = [alignment_n, 512 - 2 * alignment_n]
374
+ if operation.tile_description.stages is None:
375
+ stages_for_k_calc = 7
376
+ else:
377
+ stages_for_k_calc = operation.tile_description.stages
378
+ problem_size_k = [
379
+ alignment_k,
380
+ threadblock_k * stages_for_k_calc - alignment_k,
381
+ threadblock_k * stages_for_k_calc * 3 - alignment_k,
382
+ ]
383
+ problem_alpha = [1.0]
384
+ problem_beta = [2.0]
385
+
386
+ testbed = GemmUniversalLauncher(operation, compiler_mode=compilation_mode)
387
+
388
+ for mode in modes:
389
+ for m in problem_size_m:
390
+ for n in problem_size_n:
391
+ for k in problem_size_k:
392
+ for batch_count in batch_counts:
393
+ for alpha in problem_alpha:
394
+ for beta in problem_beta:
395
+ # skip very small K problems
396
+ if testcase == "universal":
397
+ if k // batch_count < 2 * threadblock_k:
398
+ continue
399
+
400
+ problem_size = GemmCoord(m, n, k)
401
+
402
+ if supports_split_k:
403
+ split_k_slices = batch_count
404
+ else:
405
+ split_k_slices = 1
406
+
407
+ overridden_mode = mode
408
+ if mode == GemmUniversalMode.Gemm and batch_count > 1:
409
+ overridden_mode = GemmUniversalMode.Batched
410
+
411
+ passed = testbed.run(
412
+ overridden_mode,
413
+ problem_size,
414
+ batch_count,
415
+ split_k_slices,
416
+ alpha,
417
+ beta,
418
+ )
419
+
420
+ if not passed:
421
+ return False
422
+
423
+ return passed
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/run_all_tests.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
import pathlib
import unittest


if __name__ == '__main__':
    # Discover every gemm_*.py test module living beside this script and run
    # the aggregated suite, turning an unsuccessful result into an exception
    # so CI pipelines see a non-zero exit.
    this_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    suite = unittest.TestLoader().discover(this_dir, 'gemm_*.py')
    outcome = unittest.runner.TextTestRunner().run(suite)
    if not outcome.wasSuccessful():
        raise Exception('Test cases failed')
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/gemm/utils.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ from cutlass_library import SubstituteTemplate
34
+
35
+ import cutlass_cppgen
36
+ from cutlass_library import (
37
+ DataTypeNames,
38
+ EpilogueScheduleSuffixes,
39
+ KernelScheduleSuffixes,
40
+ LayoutType,
41
+ OpcodeClassNames,
42
+ ShortDataTypeNames,
43
+ ShortLayoutTypeNames
44
+ )
45
+ from cutlass_cppgen.backend import library
46
+
47
+ from gemm_testbed import test_all_gemm
48
+
49
+
50
class Layout:
    """
    Shorthand aliases translating BLAS-style transpose notation into CUTLASS
    layout types: ``T`` (transposed) maps to row-major and ``N``
    (non-transposed) maps to column-major.
    """

    T = LayoutType.RowMajor
    N = LayoutType.ColumnMajor
+
58
+
59
class LayoutCombination:
    """
    Pre-built ``(layout_A, layout_B, layout_C)`` triples covering every
    combination of row-major (T) and column-major (N) operand layouts
    for a GEMM.
    """

    NNN = (Layout.N, Layout.N, Layout.N)
    NNT = (Layout.N, Layout.N, Layout.T)
    NTN = (Layout.N, Layout.T, Layout.N)
    NTT = (Layout.N, Layout.T, Layout.T)
    TNN = (Layout.T, Layout.N, Layout.N)
    TNT = (Layout.T, Layout.N, Layout.T)
    TTN = (Layout.T, Layout.T, Layout.N)
    TTT = (Layout.T, Layout.T, Layout.T)
+
73
+
74
def get_name(
    layouts,
    alignments,
    element_output,
    element_accumulator,
    element_epilogue,
    cluster_shape,
    threadblock_shape,
    stages,
    element_a,
    element_b,
    element_c,
    arch,
    opclass,
    kernel_schedule=None,
    epilogue_schedule=None,
    suffix="",
):
    """
    Generates a procedural name for a test case.

    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param element_c: data type of operand C
    :param arch: compute capability of kernel being generated
    :type arch: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_cppgen.OpcodeClass
    :param kernel_schedule: kernel_schedule type
    :type kernel_schedule: cutlass_cppgen.KernelScheduleType
    :param epilogue_schedule: epilogue_schedule type
    :type epilogue_schedule: cutlass_cppgen.EpilogueScheduleType
    :param suffix: additional string to add to the suffix of the name
    :type suffix: str

    :return: str
    """
    # Template placeholders are filled via cutlass_library.SubstituteTemplate.
    name_format = "test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${k}${e}${suffix}"

    # Precompute the per-operand pieces so the mapping below stays flat.
    short_layouts = [ShortLayoutTypeNames[layout] for layout in layouts]
    tb = [str(dim) for dim in threadblock_shape]
    cluster = [str(dim) for dim in cluster_shape]
    aligns = [str(align) for align in alignments]

    substitutions = {
        "arch": str(arch),
        "eA": DataTypeNames[element_a],
        "eB": DataTypeNames[element_b],
        "eC": DataTypeNames[element_c],
        "lA": short_layouts[0],
        "lB": short_layouts[1],
        "lC": short_layouts[2],
        "opclass": OpcodeClassNames[opclass],
        "acc": DataTypeNames[element_accumulator],
        "cM": cluster[0],
        "cN": cluster[1],
        "cK": cluster[2],
        "tbM": tb[0],
        "tbN": tb[1],
        "tbK": tb[2],
        # 'auto' marks stage counts chosen automatically by the generator.
        "stages": "auto" if stages is None else str(stages),
        "aA": aligns[0],
        "aB": aligns[1],
        "aC": aligns[2],
        "k": KernelScheduleSuffixes[kernel_schedule] if kernel_schedule is not None else "",
        "e": EpilogueScheduleSuffixes[epilogue_schedule] if epilogue_schedule is not None else "",
        "suffix": suffix if suffix is not None else "",
    }
    return SubstituteTemplate(name_format, substitutions)
148
+
149
+
150
def add_test_gemm(
    cls=None,
    cc=None,
    element=None,
    layouts=None,
    alignments=None,
    element_output=None,
    element_accumulator=None,
    cluster_shape=None,
    threadblock_shape=None,
    warp_count=None,
    stages=None,
    opclass=None,
    swizzle=None,
    kernel_schedule=None,
    epilogue_schedule=None,
    compilation_modes=('nvcc', 'nvrtc'),
    element_A=None,
    element_B=None,
    element_C=None):
    """
    Create test-running functions with the given specification and set them as methods of ``cls``.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param cc: compute capability to compile for
    :type cc: int
    :param element: data type of A and B operands
    :type element: cutlass_cppgen.DataType.f16
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass_cppgen.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass_cppgen.DataType
    :param cluster_shape: dimensions of clusters
    :type cluster_shape: list or tuple
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_cppgen.OpcodeClass
    :param swizzle: threadblock swizzling functor
    :param kernel_schedule: kernel schedule to use
    :type kernel_schedule: cutlass_cppgen.KernelScheduleType
    :param epilogue_schedule: epilogue schedule to use
    :type epilogue_schedule: cutlass_cppgen.EpilogueScheduleType
    :param compilation_modes: list of compilers to be used in testing the kernel (options: 'nvrtc', 'nvcc')
    :type compilation_modes: list or tuple
    :param element_A: data type of operand A. If set, overrides ``element``
    :type element_A: cutlass_cppgen.DataType
    :param element_B: data type of operand B. If set, overrides ``element``
    :type element_B: cutlass_cppgen.DataType
    :param element_C: data type of operand C. If set, overrides ``element``
    :type element_C: cutlass_cppgen.DataType
    """

    # ``element`` acts as a fallback for any operand type not set explicitly.
    if element_A is None:
        element_A = element
    if element_B is None:
        element_B = element
    if element_C is None:
        element_C = element
    if element_output is None:
        element_output = element
    if element_accumulator is None:
        element_accumulator = element

    def _make_run(compilation_mode):
        """
        Builds the test method for a single compilation mode.

        Using a factory (rather than defining ``run`` directly inside the loop
        below) binds ``compilation_mode`` at creation time. A closure over the
        loop variable would late-bind it, silently making every generated test
        compile with the *last* entry of ``compilation_modes``.
        """
        def run(self):
            """
            Dynamically-generated function that constructs a GEMM operation and verifies it against
            multiple test cases.
            """
            layout_A, layout_B, layout_C = layouts
            alignment_A, alignment_B, alignment_C = alignments

            # Build the GEMM plan for the requested types, layouts, and CC.
            plan = cutlass_cppgen.op.Gemm(element_A=element_A, element_B=element_B,
                                          element_C=element_C, element_D=element_output,
                                          layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                                          element_accumulator=element_accumulator,
                                          kernel_cc=cc)

            plan.opclass = opclass
            if swizzle is not None:
                plan.swizzling_functor = swizzle

            # Start from the first valid tile description and override the
            # knobs this test pins down.
            td = plan.tile_descriptions()[0]
            if warp_count is not None:
                td.warp_count = warp_count
            td.threadblock_shape = threadblock_shape
            td.stages = stages
            td.cluster_shape = cluster_shape

            op = plan.construct(tile_description=td, alignment_A=alignment_A,
                                alignment_B=alignment_B, alignment_C=alignment_C)
            self.assertTrue(test_all_gemm(op, 'universal', compilation_mode=compilation_mode))

        return run

    for compilation_mode in compilation_modes:
        element_epilogue = element_accumulator
        name = get_name(
            layouts=layouts, alignments=alignments, element_output=element_output, element_accumulator=element_accumulator,
            element_epilogue=element_epilogue, cluster_shape=cluster_shape, threadblock_shape=threadblock_shape,
            stages=stages, element_a=element_A, element_b=element_B, element_c=element_C, arch=cc, opclass=opclass,
            kernel_schedule=kernel_schedule, epilogue_schedule=epilogue_schedule, suffix=f'_{compilation_mode}')

        setattr(cls, name, _make_run(compilation_mode))
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/installation.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Tests for a successful installation of the CUTLASS Python interface
35
+ """
36
+
37
+ import os
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ import cutlass_library
42
+
43
+
44
class InstallationTest(unittest.TestCase):
    def test_cutlass_source_paths(self):
        """
        Verifies that the CUTLASS C++ sources are bundled with both the
        cutlass_cppgen and cutlass_library packages.
        """
        src_file = 'include/cutlass/cutlass.h'
        # The header must be reachable under both installation roots.
        for root in (cutlass_library.source_path, cutlass_cppgen.CUTLASS_PATH):
            candidate = os.path.join(root, src_file)
            assert os.path.isfile(candidate), f"Unable to locate file {candidate}. Installation has not succeeded."


if __name__ == "__main__":
    unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/conv2d_interface.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Tests the high-level Conv2d interface
35
+ """
36
+
37
+ from math import ceil
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ import cutlass_cppgen.utils.datatypes as datatypes
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+ from utils import ExpectException
44
+ import os
45
+
46
+
47
class Conv2dEquivalence:
    """
    Helper class for testing the equivalence of different constructions of the Conv2d interface.

    A reference plan and operation are built in ``__init__`` from a fully
    explicit specification; each ``*_test`` method rebuilds the plan through an
    alternative construction path (generic element, numpy/torch tensor
    frontends, implicit accumulator) and asserts the emitted kernel matches.
    """
    def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
                 alignment_A, alignment_B, alignment_C):
        # Record the full specification so alternative constructions can reuse it.
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
        self.element_D = element_D
        self.element_accumulator = element_accumulator
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C

        # One of "fprop", "wgrad", or "dgrad" (see registration loop below).
        self.conv_kind = conv_kind

        # Reference plan/operation against which all other constructions compare.
        self.plan = cutlass_cppgen.op.Conv2d(
            kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
            element_D=element_D, element_accumulator=element_accumulator)

        self.op = self.plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)

    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal.

        :param other_plan: plan to compare against the default Conv2d
        :type other_plan: cutlass_cppgen.op.Conv2d

        :return: whether `other_plan` is equivalent to `self.plan`
        :rtype: bool
        """
        other_op = other_plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)

        # Equivalence is judged on the emitted C++ kernel source.
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
        and layouts for constructing the Conv2d interface.
        """
        # NOTE(review): this guard mirrors numpy_test even though no numpy
        # frontend is exercised below — possibly copy-paste; confirm intent.
        if not datatypes.is_numpy_available():
            return

        # Test when specifying all parameters
        plan_other = cutlass_cppgen.op.Conv2d(
            kind=self.conv_kind,
            element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A (A falls back to the generic ``element``)
        plan_other = cutlass_cppgen.op.Conv2d(
            kind=self.conv_kind,
            element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A and B, using generic element and output
        plan_other = cutlass_cppgen.op.Conv2d(
            kind=self.conv_kind,
            element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)

        # Test without explicit accumulator. Only run if the type of C and the accumulator are equal
        if self.element_C == self.element_accumulator:
            plan_other = cutlass_cppgen.op.Conv2d(
                kind=self.conv_kind,
                element_C=self.element_C,
                element_D=self.element_D,
                element=self.element_A)
            assert self._plans_equal(plan_other)

        # Test with only the generic types. Only run if the types of A, B, C, and D are the same
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
            and self.element_A == self.element_accumulator):
            plan_other = cutlass_cppgen.op.Conv2d(kind=self.conv_kind, element=self.element_A)
            assert self._plans_equal(plan_other)

    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend.
        Silently skipped when numpy is not installed.
        """
        if not datatypes.is_numpy_available():
            return

        import numpy as np
        type_A = datatypes.numpy_type(self.element_A)
        type_B = datatypes.numpy_type(self.element_B)
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)

        # Tiny placeholder tensors: only their dtypes matter for plan construction.
        size = (2, 2)
        A = np.zeros(size, dtype=type_A)
        B = np.zeros(size, dtype=type_B)
        C = np.zeros(size, dtype=type_C)
        D = np.zeros(size, dtype=type_D)

        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def torch_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend.
        Silently skipped when torch is not installed.
        """
        if not datatypes.is_torch_available():
            return

        import torch
        type_A = datatypes.torch_type(self.element_A)
        type_B = datatypes.torch_type(self.element_B)
        type_C = datatypes.torch_type(self.element_C)
        type_D = datatypes.torch_type(self.element_D)
        type_accum = datatypes.torch_type(self.element_accumulator)

        # Tiny placeholder tensors: only their dtypes matter for plan construction.
        size = (2, 2)

        A = torch.empty(size, dtype=type_A)
        B = torch.empty(size, dtype=type_B)
        C = torch.empty(size, dtype=type_C)
        D = torch.empty(size, dtype=type_D)

        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
        """
        Shared body of numpy_test/torch_test: checks equivalence when element
        types are inferred from framework tensors instead of given explicitly.
        """
        # Test when specifying all parameters via tensors
        plan_np = cutlass_cppgen.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A as tensors
        plan_np = cutlass_cppgen.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        if type_A == type_B:
            plan_np = cutlass_cppgen.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
            assert self._plans_equal(plan_np)

        # Test without explicit accumulator. Only run if the type of C and the accumulator match.
        if type_C == type_accum:
            plan_np = cutlass_cppgen.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
            plan_np = cutlass_cppgen.op.Conv2d(kind=self.conv_kind, element=type_A)
            assert self._plans_equal(plan_np)

    def test_all(self):
        """
        Runs all equivalence tests on the Conv2d interface.
        """
        self.generic_test()
        self.numpy_test()
        self.torch_test()
211
+
212
+
213
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class ConvEquivalenceTest(unittest.TestCase):
    """
    Empty container class; per-type-combination equivalence test methods are
    attached dynamically via add_test in this module.
    """
    pass
219
+
220
# Maximal vectorized-access alignment (in elements) per data type.
type2alignment = {
    cutlass_cppgen.DataType.f16: 8,
    cutlass_cppgen.DataType.f32: 4
}

def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):
    # Builds one test method exercising Conv2dEquivalence for a single type
    # combination and attaches it to ConvEquivalenceTest under a descriptive name.
    test_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"

    def run(self):
        checker = Conv2dEquivalence(
            conv_kind=conv_kind,
            element_A=element_A, element_B=element_B,
            element_C=element_C, element_D=element_D,
            element_accumulator=element_accumulator,
            alignment_A=type2alignment[element_A],
            alignment_B=type2alignment[element_B],
            alignment_C=type2alignment[element_C]
        )
        checker.test_all()

    setattr(ConvEquivalenceTest, test_name, run)

# Register every conv kind against a representative set of
# (A, B, C, D, accumulator) type combinations.
for conv_kind in ["fprop", "wgrad", "dgrad"]:
    for types in [
        [cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16],
        [cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32],
        [cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16],
        [cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32],
        [cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32]
    ]:
        add_test(conv_kind, *types)
251
+
252
+
253
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class Conv2dErrorTests(unittest.TestCase):
    """
    Exercises error scenarios that arise with the high-level Conv2d interface
    """

    def test_alignment(self):
        """
        Constructing with an alignment that F16 does not support must fail.
        """
        plan = cutlass_cppgen.op.Conv2d(kind="fprop", element=cutlass_cppgen.DataType.f16)

        with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
            plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)

    def test_invalid_tile_description(self):
        """
        Compiling with a malformed threadblock shape must fail for this CC.
        """
        plan = cutlass_cppgen.op.Conv2d(kind="fprop", element=cutlass_cppgen.DataType.f16)

        td = plan.tile_descriptions()[0]
        td.threadblock_shape = [17, 32, 5]
        plan.tile_description = td

        with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
            plan.compile()

        # Clean up the error log emitted by the failed device compilation.
        os.remove("./cutlass_python_compilation_device_error.txt")


if __name__ == '__main__':
    unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/evt_interface.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Test the EVT interface
35
+ """
36
+
37
+ import numpy as np
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ from cutlass_cppgen import LayoutType, Tensor
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+ from cutlass_cppgen.epilogue import reshape, permute
44
+
45
+ from utils import ExpectException
46
+
47
+
48
+ @unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
49
+ class EVTErrorTests(unittest.TestCase):
50
+ """
51
+ Tests various error scenarios that arise with the EVT interface
52
+ """
53
+ @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT requires root node be 'D'")
54
+ def test_root_not_d(self):
55
+ """
56
+ Test when "D" does not exist in Sm90 EVT
57
+ """
58
+ def evt_root_not_d(accum, alpha):
59
+ F = accum * alpha
60
+ return F
61
+
62
+ example_tensors = {
63
+ "accum": self.fake_tensor(np.float16, (6, 512, 512)),
64
+ "alpha": 1.2,
65
+ "F": self.fake_tensor(np.float16, (6, 512, 512))
66
+ }
67
+
68
+ with ExpectException(device_cc() == 90,
69
+ "SyntaxError: Sm90 EVT requires the epilogue to have a returned tensor D, "
70
+ "but the variable 'D' is not found in the return values.", True):
71
+
72
+ cutlass_cppgen.epilogue.trace(evt_root_not_d, example_tensors)
73
+
74
+ def test_no_accum(self):
75
+ """
76
+ Test when "accum" is not in input arguments
77
+ """
78
+ def evt_no_accum(alpha, C):
79
+ D = alpha * C
80
+ return D
81
+
82
+ example_tensors = {
83
+ "C": self.fake_tensor(np.float16, (6, 512, 512)),
84
+ "alpha": 1.2,
85
+ "D": self.fake_tensor(np.float16, (6, 512, 512))
86
+ }
87
+
88
+ with ExpectException(True, "SyntaxError: Cannot find 'accum' in the argument list.", True):
89
+ cutlass_cppgen.epilogue.trace(evt_no_accum, example_tensors)
90
+
91
+ @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT has concern on smem size")
92
+ def test_too_much_shared_memory(self):
93
+ """
94
+ Test when the epilogue consumes too much shared memory
95
+ """
96
+ def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5, C6, C7, C8):
97
+ D1 = accum + C1
98
+ D2 = D1 + C2
99
+ D3 = D2 + C3
100
+ D4 = D3 + C4
101
+ D5 = D4 + C5
102
+ D6 = D5 + C6
103
+ D7 = D6 + C7
104
+ D = D7 + C8
105
+ return D, D1, D2, D3, D4, D5, D6, D7
106
+
107
+ example_tensors = {
108
+ "accum": self.fake_tensor(np.float16, (6, 512, 512)),
109
+ "C1": self.fake_tensor(np.float16, (6, 512, 512)),
110
+ "C2": self.fake_tensor(np.float16, (6, 512, 512)),
111
+ "C3": self.fake_tensor(np.float16, (6, 512, 512)),
112
+ "C4": self.fake_tensor(np.float16, (6, 512, 512)),
113
+ "C5": self.fake_tensor(np.float16, (6, 512, 512)),
114
+ "C6": self.fake_tensor(np.float16, (6, 512, 512)),
115
+ "C7": self.fake_tensor(np.float16, (6, 512, 512)),
116
+ "C8": self.fake_tensor(np.float16, (6, 512, 512)),
117
+ "D1": self.fake_tensor(np.float16, (6, 512, 512)),
118
+ "D2": self.fake_tensor(np.float16, (6, 512, 512)),
119
+ "D3": self.fake_tensor(np.float16, (6, 512, 512)),
120
+ "D4": self.fake_tensor(np.float16, (6, 512, 512)),
121
+ "D5": self.fake_tensor(np.float16, (6, 512, 512)),
122
+ "D6": self.fake_tensor(np.float16, (6, 512, 512)),
123
+ "D7": self.fake_tensor(np.float16, (6, 512, 512)),
124
+ "D": self.fake_tensor(np.float16, (6, 512, 512))
125
+ }
126
+
127
+ epilogue_visitor = cutlass_cppgen.epilogue.trace(evt_too_much_shared_memory, example_tensors)
128
+
129
+ plan = cutlass_cppgen.op.Gemm(
130
+ element=np.float16, layout=cutlass_cppgen.LayoutType.RowMajor,
131
+ element_accumulator=np.float32
132
+ )
133
+
134
+ with ExpectException(True,
135
+ "RuntimeError: The epilogue consumes too much shared memory. "
136
+ "No valid tile description is found in the generator.", True):
137
+ plan.epilogue_visitor = epilogue_visitor
138
+
139
+ def test_not_ssa(self):
140
+ """
141
+ Test when the epilogue is not in SSA
142
+ """
143
+ def evt_redefine(accum, C, alpha):
144
+ F = accum + C
145
+ F = F * alpha
146
+ D = F
147
+ return D, F
148
+
149
+ example_tensors = {
150
+ "accum": self.fake_tensor(np.float16, (6, 512, 512)),
151
+ "C": self.fake_tensor(np.float16, (6, 512, 512)),
152
+ "alpha": 1.5,
153
+ "D": self.fake_tensor(np.float16, (6, 512, 512)),
154
+ "F": self.fake_tensor(np.float16, (6, 512, 512))
155
+ }
156
+
157
+ with ExpectException(True, "SyntaxError: Variable 'F' cannot be defined twice.", True):
158
+ cutlass_cppgen.epilogue.trace(evt_redefine, example_tensors)
159
+
160
+ def evt_undefine(accum, alpha):
161
+ F = accum + C
162
+ D = F * alpha
163
+ return D, F
164
+
165
+ example_tensors = {
166
+ "accum": self.fake_tensor(np.float16, (6, 512, 512)),
167
+ "alpha": 1.5,
168
+ "D": self.fake_tensor(np.float16, (6, 512, 512)),
169
+ "F": self.fake_tensor(np.float16, (6, 512, 512))
170
+ }
171
+
172
+ with ExpectException(True, "SyntaxError: Variable 'C' is undefined.", True):
173
+ cutlass_cppgen.epilogue.trace(evt_undefine, example_tensors)
174
+
175
+ def test_missing_example_tensor(self):
176
+ """
177
+ Test when the example tensor of an input/output variable is not provided
178
+ """
179
+ def evt_missing_example_tensor(accum, C):
180
+ D = accum + C
181
+ return D
182
+
183
+ example_tensors = {
184
+ "accum": self.fake_tensor(np.float16, (6, 512, 512)),
185
+ "C": self.fake_tensor(np.float16, (6, 512, 512)),
186
+ }
187
+
188
+ with ExpectException(True, "RuntimeError: Example input for D is not provided.", True):
189
+ cutlass_cppgen.epilogue.trace(evt_missing_example_tensor, example_tensors)
190
+
191
+ example_tensors = {
192
+ "accum": self.fake_tensor(np.float16, (6, 512, 512)),
193
+ "D": self.fake_tensor(np.float16, (6, 512, 512)),
194
+ }
195
+
196
+ with ExpectException(True, "RuntimeError: Example input for C is not provided.", True):
197
+ cutlass_cppgen.epilogue.trace(evt_missing_example_tensor, example_tensors)
198
+
199
+ def test_return_expression(self):
200
+ """
201
+ Test when the return value is an expression
202
+ """
203
+ def evt_return_expr(accum, C):
204
+ return accum + C
205
+
206
+ example_tensors = {
207
+ "accum": self.fake_tensor(np.float16, (6, 512, 512)),
208
+ "C": self.fake_tensor(np.float16, (6, 512, 512)),
209
+ }
210
+
211
+ with ExpectException(True, "SyntaxError: Return value cannot be an expression", True):
212
+ cutlass_cppgen.epilogue.trace(evt_return_expr, example_tensors)
213
+
214
+ def test_incompatible_shape(self):
215
+ """
216
+ Test when the shape of example tensors are incompatible
217
+ """
218
+ def evt_incompatible_shape(accum, C):
219
+ D = accum + C
220
+ return D
221
+
222
+ example_tensors = {
223
+ "accum": self.fake_tensor(np.float16, (6, 256, 512)),
224
+ "C": self.fake_tensor(np.float16, (6, 512, 512)),
225
+ "D": self.fake_tensor(np.float16, (6, 512, 512))
226
+ }
227
+
228
+ with ExpectException(True,
229
+ "RuntimeError: Dimension mismatch between accum(6, 256, 512), C(6, 512, 512).", True):
230
+ cutlass_cppgen.epilogue.trace(evt_incompatible_shape, example_tensors)
231
+
232
+ def test_no_matching_impl(self):
233
+ def evt_no_matching_impl(accum, bias):
234
+ D = accum + reshape(permute(bias, indices=(1, 0)), new_shape=(512, 1))
235
+ return D
236
+
237
+ example_tensors = {
238
+ "accum": self.fake_tensor(np.float16, (6, 512, 256)),
239
+ "bias": self.fake_tensor(np.float16, (16, 32)),
240
+ "D": self.fake_tensor(np.float16, (6, 512, 256))
241
+ }
242
+
243
+ with ExpectException(True, "NotImplementedError: No matching op for node bias with stride (0, (1, 32), 0).", True):
244
+ cutlass_cppgen.epilogue.trace(evt_no_matching_impl, example_tensors)
245
+ #
246
+ # Helper functions
247
+ #
248
+
249
+ def fake_tensor(self, element, shape):
250
+ return Tensor(element=element, shape=shape, layout_tag=LayoutType.RowMajor)
251
+
252
+
253
+ if __name__ == '__main__':
254
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/gemm_interface.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Tests the high-level GEMM interface
35
+ """
36
+
37
+ from math import ceil
38
+ import unittest
39
+
40
+ import cutlass_cppgen
41
+ import cutlass_cppgen.utils.datatypes as datatypes
42
+ from cutlass_cppgen.backend.utils.device import device_cc
43
+ from utils import ExpectException
44
+
45
+
46
+ class GemmEquivalence:
47
+ """
48
+ Helper class for testing the equivalence of different constructions of the Gemm interface
49
+ """
50
+ def __init__(self, element_A, element_B, element_C, element_D, element_accumulator,
51
+ layout_A, layout_B, layout_C, alignment_A, alignment_B, alignment_C):
52
+ self.element_A = element_A
53
+ self.element_B = element_B
54
+ self.element_C = element_C
55
+ self.element_D = element_D
56
+ self.element_accumulator = element_accumulator
57
+ self.layout_A = layout_A
58
+ self.layout_B = layout_B
59
+ self.layout_C = layout_C
60
+ self.alignment_A = alignment_A
61
+ self.alignment_B = alignment_B
62
+ self.alignment_C = alignment_C
63
+ self.plan = cutlass_cppgen.op.Gemm(element_A=element_A, element_B=element_B, element_C=element_C,
64
+ element_D=element_D, element_accumulator=element_accumulator,
65
+ layout_A=layout_A, layout_B=layout_B, layout_C=layout_C)
66
+ self.op = self.plan.construct(alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
67
+
68
+ def _plans_equal(self, other_plan) -> bool:
69
+ """
70
+ Compares whether two plans are equal
71
+
72
+ :param other_plan: plan to compare against the default GEMM
73
+ :type other_plan: cutlass_cppgen.op.Gemm
74
+
75
+ :return: whether `other_plan` is equivalent to `self.plan`
76
+ :rtype: bool
77
+ """
78
+ other_op = other_plan.construct(alignment_A=self.alignment_A, alignment_B=self.alignment_B, alignment_C=self.alignment_C)
79
+
80
+ # Compare whether the operations are equal by comparing the C++ code that would be emitted for them
81
+ return self.op.rt_module.emit() == other_op.rt_module.emit()
82
+
83
+ def generic_test(self):
84
+ """
85
+ Tests the equivalence of various constructions of the Gemm interface when using CUTLASS data types
86
+ and layouts for constructing the Gemm interface
87
+ """
88
+ if not datatypes.is_numpy_available():
89
+ return
90
+
91
+ # Test when specifying all parameters
92
+ plan_other = cutlass_cppgen.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
93
+ element_D=self.element_D, element_accumulator=self.element_accumulator,
94
+ layout_A=self.layout_A, layout_B=self.layout_B, layout_C=self.layout_C)
95
+ assert self._plans_equal(plan_other)
96
+
97
+ # Test when specifying all parameters but A
98
+ plan_other = cutlass_cppgen.op.Gemm(element_B=self.element_B, element_C=self.element_C,
99
+ element_D=self.element_D, element_accumulator=self.element_accumulator,
100
+ layout_B=self.layout_B, layout_C=self.layout_C,
101
+ element=self.element_A, layout=self.layout_A)
102
+ assert self._plans_equal(plan_other)
103
+
104
+ # Test when specifying all parameters but A and B as tensors and using generic element and output
105
+ # Only run this test if the layouts and types for A and B are equal.
106
+ if self.element_A == self.element_B and self.layout_A == self.layout_B:
107
+ plan_other = cutlass_cppgen.op.Gemm(element_C=self.element_C, element_D=self.element_D, element_accumulator=self.element_accumulator,
108
+ layout_C=self.layout_C, element=self.element_A, layout=self.layout_A)
109
+ assert self._plans_equal(plan_other)
110
+
111
+ # Test without explicit accumulator. Only run if the type of C and the accumulator.
112
+ if self.element_C == self.element_accumulator:
113
+ plan_other = cutlass_cppgen.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
114
+ element_D=self.element_D, layout_A=self.layout_A, layout_B=self.layout_B,
115
+ layout_C=self.layout_C)
116
+ assert self._plans_equal(plan_other)
117
+
118
+ # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
119
+ if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
120
+ and self.element_A == self.element_accumulator and
121
+ self.layout_A == self.layout_B and self.layout_A == self.layout_C):
122
+ plan_other = cutlass_cppgen.op.Gemm(element=self.element_A, layout=self.layout_A)
123
+ assert self._plans_equal(plan_other)
124
+
125
+ def numpy_test(self):
126
+ """
127
+ Tests the equivalence of various constructions of the Gemm interface when using numpy as a frontend
128
+ """
129
+ if not datatypes.is_numpy_available():
130
+ return
131
+
132
+ import numpy as np
133
+ type_A = datatypes.numpy_type(self.element_A)
134
+ type_B = datatypes.numpy_type(self.element_B)
135
+ type_C = datatypes.numpy_type(self.element_C)
136
+ type_D = datatypes.numpy_type(self.element_D)
137
+ type_accum = datatypes.numpy_type(self.element_accumulator)
138
+
139
+ layout_to_order = {
140
+ cutlass_cppgen.LayoutType.RowMajor: 'C',
141
+ cutlass_cppgen.LayoutType.ColumnMajor: 'F'
142
+ }
143
+ size = (2, 2)
144
+ A = np.zeros(size, order=layout_to_order[self.layout_A], dtype=type_A)
145
+ B = np.zeros(size, order=layout_to_order[self.layout_B], dtype=type_B)
146
+ C = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_C)
147
+ D = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_D)
148
+
149
+ # Test when specifying all parameters via tensors
150
+ plan_np = cutlass_cppgen.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=type_accum)
151
+ assert self._plans_equal(plan_np)
152
+
153
+ # Test when specifying all parameters but A as tensors
154
+ plan_np = cutlass_cppgen.op.Gemm(B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A, layout_A=self.layout_A)
155
+ assert self._plans_equal(plan_np)
156
+
157
+ # Test when specifying all parameters but A and B as tensors and using generic element and output
158
+ # Only run this test if the layouts and types for A and B are equal.
159
+ if type_A == type_B and self.layout_A == self.layout_B:
160
+ plan_np = cutlass_cppgen.op.Gemm(C=C, D=D, element_accumulator=type_accum, element=type_A, layout=self.layout_A)
161
+ assert self._plans_equal(plan_np)
162
+
163
+ # Test without explicit accumulator. Only run if the type of C and the accumulator.
164
+ if type_C == type_accum:
165
+ plan_np = cutlass_cppgen.op.Gemm(A=A, B=B, C=C, D=D)
166
+ assert self._plans_equal(plan_np)
167
+
168
+ # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
169
+ if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum and
170
+ self.layout_A == self.layout_B and self.layout_A == self.layout_C):
171
+ plan_np = cutlass_cppgen.op.Gemm(element=type_A, layout=self.layout_A)
172
+ assert self._plans_equal(plan_np)
173
+
174
+ def test_all(self):
175
+ """
176
+ Runs all tests on the Gemm interface
177
+ """
178
+ self.generic_test()
179
+ self.numpy_test()
180
+
181
+
182
+ class GemmEquivalenceTest(unittest.TestCase):
183
+ """
184
+ Tests the equivalence of different constructions of the Gemm interface
185
+ """
186
+ @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
187
+ def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_8_8_8(self):
188
+ gemm_eq = GemmEquivalence(
189
+ element_A=cutlass_cppgen.DataType.f16, element_B=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
190
+ element_D=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f16,
191
+ layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.RowMajor, layout_C=cutlass_cppgen.LayoutType.RowMajor,
192
+ alignment_A=8, alignment_B=8, alignment_C=8)
193
+ gemm_eq.test_all()
194
+
195
+ @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
196
+ def test_gemm_equivalence_f16_f16_f16_f16_f32_ntn_8_8_8(self):
197
+ gemm_eq = GemmEquivalence(
198
+ element_A=cutlass_cppgen.DataType.f16, element_B=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
199
+ element_D=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f32,
200
+ layout_A=cutlass_cppgen.LayoutType.ColumnMajor, layout_B=cutlass_cppgen.LayoutType.RowMajor, layout_C=cutlass_cppgen.LayoutType.ColumnMajor,
201
+ alignment_A=8, alignment_B=8, alignment_C=8)
202
+ gemm_eq.test_all()
203
+
204
+ @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
205
+ def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_4_4_4(self):
206
+ gemm_eq = GemmEquivalence(
207
+ element_A=cutlass_cppgen.DataType.f16, element_B=cutlass_cppgen.DataType.f16, element_C=cutlass_cppgen.DataType.f16,
208
+ element_D=cutlass_cppgen.DataType.f16, element_accumulator=cutlass_cppgen.DataType.f16,
209
+ layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.RowMajor, layout_C=cutlass_cppgen.LayoutType.RowMajor,
210
+ alignment_A=8, alignment_B=8, alignment_C=8)
211
+ gemm_eq.test_all()
212
+
213
+ @unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for F64 Tensor Core tests.")
214
+ def test_gemm_equivalence_f64_f64_f64_f64_f64_tnt_1_1_1(self):
215
+ gemm_eq = GemmEquivalence(
216
+ element_A=cutlass_cppgen.DataType.f64, element_B=cutlass_cppgen.DataType.f64, element_C=cutlass_cppgen.DataType.f64,
217
+ element_D=cutlass_cppgen.DataType.f64, element_accumulator=cutlass_cppgen.DataType.f64,
218
+ layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.ColumnMajor, layout_C=cutlass_cppgen.LayoutType.RowMajor,
219
+ alignment_A=1, alignment_B=1, alignment_C=1)
220
+ gemm_eq.test_all()
221
+
222
+
223
+ class GemmErrorTests(unittest.TestCase):
224
+ """
225
+ Tests various error scenarios that arise with the high-level Gemm interface
226
+ """
227
+
228
+ def test_alignment(self):
229
+ """
230
+ Tests case in which the alignment specified is unsupported
231
+ """
232
+ plan = cutlass_cppgen.op.Gemm(element=cutlass_cppgen.DataType.f16, layout=cutlass_cppgen.LayoutType.RowMajor)
233
+
234
+ with ExpectException(True, 'Alignment 16 is not supported for F16. The construction should fail.'):
235
+ op = plan.construct(alignment_A=16, alignment_B=16, alignment_C=16)
236
+
237
+ def test_tensorop_availability(self):
238
+ """
239
+ Tests case in which only SIMT operations are available but TensorOp is requested
240
+ """
241
+ cc = device_cc()
242
+
243
+ # F64 Tensor Core operations are only avaiable on certain devices
244
+ supports_tensorop_f64 = cc in [80, 89, 90]
245
+ plan = cutlass_cppgen.op.Gemm(cc=cc, element=cutlass_cppgen.DataType.f64, layout=cutlass_cppgen.LayoutType.RowMajor)
246
+
247
+ error_msg = f'Incorrectly raised an exception for availability of TensorOp with F64 operands on SM{cc}'
248
+ with ExpectException(not supports_tensorop_f64, error_msg):
249
+ plan.opclass = cutlass_cppgen.OpcodeClass.TensorOp
250
+
251
+ expected_opclass = cutlass_cppgen.OpcodeClass.TensorOp if supports_tensorop_f64 else cutlass_cppgen.OpcodeClass.Simt
252
+ assert plan.opclass == expected_opclass, f'Expected opclass to be {expected_opclass}, but received {plan.opclass} for SM{cc}'
253
+
254
+ @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for F16 Tensor Core tests.")
255
+ def test_opclass_switch(self):
256
+ """
257
+ Tests cases in which the opcode class in question is switched (e.g., from TensorOp to SIMT)
258
+ """
259
+ plan = cutlass_cppgen.op.Gemm( element=cutlass_cppgen.DataType.f16, layout=cutlass_cppgen.LayoutType.RowMajor)
260
+ assert plan.opclass == cutlass_cppgen.OpcodeClass.TensorOp
261
+
262
+ # Ensure that all tile descriptions have opclass of TensorOp
263
+ for td in plan.tile_descriptions():
264
+ assert td.math_instruction.opcode_class == cutlass_cppgen.OpcodeClass.TensorOp
265
+
266
+ plan.opclass = cutlass_cppgen.OpcodeClass.Simt
267
+
268
+ # Ensure that all tile descriptions have opclass of Simt
269
+ for td in plan.tile_descriptions():
270
+ assert td.math_instruction.opcode_class == cutlass_cppgen.OpcodeClass.Simt
271
+
272
+ def test_invalid_tile_description(self):
273
+ """
274
+ Tests scenarios in which an invalid tile description is provided for a given CC
275
+ """
276
+ cc = device_cc()
277
+ plan = cutlass_cppgen.op.Gemm(cc=cc, element=cutlass_cppgen.DataType.f16, layout=cutlass_cppgen.LayoutType.RowMajor)
278
+ td = plan.tile_descriptions()[0]
279
+ stages = td.stages
280
+
281
+ # Zero stage count is valid for SM90+, as this is used to indicate that the builder's auto stage
282
+ # count should be used
283
+ with ExpectException(cc < 90, f'Requested zero stages'):
284
+ td.stages = 0
285
+ plan.construct(td)
286
+
287
+ if cc < 90:
288
+ with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
289
+ td.stages = 3
290
+ plan.construct(td)
291
+ elif cc == 90:
292
+ original_kschedule = td.kernel_schedule
293
+ original_eschedule = td.epilogue_schedule
294
+ with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
295
+ td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong
296
+ td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.NoSmemWarpSpecialized
297
+ td.stages = 3
298
+ plan.construct(td)
299
+ # Reset schedules
300
+ td.kernel_schedule = original_kschedule
301
+ td.epilogue_schedule = original_eschedule
302
+ elif cc in [100, 101, 103]:
303
+ with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
304
+ td.stages = 3
305
+ plan.construct(td)
306
+
307
+ with ExpectException(True, f'Requested too many stages'):
308
+ td.stages = 100
309
+ plan.construct(td)
310
+
311
+ # Reset stage count
312
+ td.stages = stages
313
+
314
+ cluster_shape = td.cluster_shape
315
+ with ExpectException(cc < 90, f'Requested non-unit cluster shape on SM{cc}'):
316
+ td.cluster_shape = [2, 1, 1]
317
+ plan.construct(td)
318
+
319
+ # Reset cluster shape
320
+ td.cluster_shape = cluster_shape
321
+
322
+ with ExpectException(cc < 90, f'Requested a non-auto schedule on SM{cc}'):
323
+ td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong
324
+ td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized
325
+ plan.construct(td)
326
+
327
+ with ExpectException(cc == 90, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
328
+ td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedPingpong
329
+ td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.ScheduleAuto
330
+ plan.construct(td)
331
+
332
+ with ExpectException(cc == 90, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
333
+ td.kernel_schedule = cutlass_cppgen.KernelScheduleType.ScheduleAuto
334
+ td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized
335
+ plan.construct(td)
336
+
337
+ with ExpectException(cc < 90, f'Requested a tile scheduler on SM{cc}'):
338
+ td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative
339
+ td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative
340
+ td.tile_scheduler = cutlass_cppgen.TileSchedulerType.StreamK
341
+ plan.construct(td)
342
+
343
+ # Ensure that all returned tile descriptions are unique
344
+ ops = {}
345
+ for i, td in enumerate(plan.tile_descriptions()):
346
+ op = plan.construct(td)
347
+ code_str = op.rt_module.emit()
348
+ if code_str in ops:
349
+ conflicting_td = ops[code_str]
350
+ assert False, f'Multiple tile descriptions emitted {code_str}\nTile descriptions are:\n{td}\n{conflicting_td}'
351
+
352
+
353
+ if __name__ == '__main__':
354
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/cutlass/interface/utils.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Helper functions & classes for interface test
35
+ """
36
+ class ExpectException:
37
+ """
38
+ Utility class to assert that an exception was raised when expected
39
+
40
+ Example:
41
+
42
+ .. highlight:: python
43
+ .. code-block:: python
44
+
45
+ with ExceptionExpected(True, 'Division by zero'):
46
+ x = 1.0 / 0.0
47
+
48
+ :param exception_expected: whether an exception is expected to be raised
49
+ :type exception_expected: bool
50
+ :param message: message to print if an exception is raised when not expected or vice versa
51
+ :type message: str
52
+ """
53
+ def __init__(self, exception_expected: bool, message: str = '', verify_msg=False):
54
+ self.exception_expected = exception_expected
55
+ self.message = message
56
+ self.verify_msg = verify_msg
57
+
58
+ def __enter__(self):
59
+ return self
60
+
61
+ def __exit__(self, exc_type, exc_val, traceback):
62
+ exception_raised = exc_type is not None
63
+ assert self.exception_expected == exception_raised, self.message
64
+ if self.verify_msg:
65
+ exc_message = f"{exc_type.__name__}: {exc_val}"
66
+ assert exc_message == self.message, f"expect error message {self.message}, got {exc_message}"
67
+
68
+ # Suppress the exception
69
+ return True
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/run_all_tests.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Utility script for discovering and running all PyCuTe tests
35
+ """
36
+
37
+ import argparse
38
+ import logging
39
+ import pathlib
40
+ import unittest
41
+
42
+
43
+ def numeric_log_level(log_level: str) -> int:
44
+ """
45
+ Converts the string identifier of the log level into the numeric identifier used
46
+ in setting the log level
47
+
48
+ :param x: string representation of log level (e.g., 'INFO', 'DEBUG')
49
+ :type x: str
50
+
51
+ :return: numeric representation of log level
52
+ :rtype: int
53
+ """
54
+ numeric_level = getattr(logging, log_level.upper(), None)
55
+ if not isinstance(numeric_level, int):
56
+ raise ValueError(f"Invalid log level: {log_level}")
57
+ return numeric_level
58
+
59
+
60
+ if __name__ == "__main__":
61
+ parser = argparse.ArgumentParser()
62
+ parser.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
63
+ help='Logging level to be used by the generator script')
64
+ args = parser.parse_args()
65
+
66
+ # Set the logging level based on the user-provided `--log-level` command-line option
67
+ logging.basicConfig(level=args.log_level)
68
+
69
+ loader = unittest.TestLoader()
70
+ script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
71
+ tests = loader.discover(script_dir, "test_*.py")
72
+ test_runner = unittest.runner.TextTestRunner()
73
+ results = test_runner.run(tests)
74
+ if not results.wasSuccessful():
75
+ raise Exception("Test cases failed")
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_coalesce.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Unit tests for pycute.coalesce
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ from pycute import *
41
+
42
+ _LOGGER = logging.getLogger(__name__)
43
+
44
+
45
+ class TestCoalesce(unittest.TestCase):
46
+ def helper_test_coalesce(self, layout):
47
+ layoutR = coalesce(layout)
48
+
49
+ _LOGGER.debug(f"{layout} => {layoutR}")
50
+
51
+ self.assertEqual(size(layoutR), size(layout))
52
+
53
+ for i in range(size(layout)):
54
+ self.assertEqual(layoutR(i), layout(i))
55
+
56
+ def test_coalesce(self):
57
+ layout = Layout(1,0)
58
+ self.helper_test_coalesce(layout)
59
+
60
+ layout = Layout(1,1)
61
+ self.helper_test_coalesce(layout)
62
+
63
+ layout = Layout((2,4))
64
+ self.helper_test_coalesce(layout)
65
+
66
+ layout = Layout((2,4,6))
67
+ self.helper_test_coalesce(layout)
68
+
69
+ layout = Layout((2,4,6), (1,6,2))
70
+ self.helper_test_coalesce(layout)
71
+
72
+ layout = Layout((2,1,6), (1,7,2))
73
+ self.helper_test_coalesce(layout)
74
+
75
+ layout = Layout((2,1,6), (4,7,8))
76
+ self.helper_test_coalesce(layout)
77
+
78
+ layout = Layout((2,(4,6)))
79
+ self.helper_test_coalesce(layout)
80
+
81
+ layout = Layout((2,4), (4,1))
82
+ self.helper_test_coalesce(layout)
83
+
84
+ layout = Layout((2,4,6), (24,6,1))
85
+ self.helper_test_coalesce(layout)
86
+
87
+ layout = Layout((2,1,3), (2,4,4))
88
+ self.helper_test_coalesce(layout)
89
+
90
+ layout = Layout(((2,2),(2,2)), ((1,4),(8,32)))
91
+ self.helper_test_coalesce(layout)
92
+
93
+
94
+ if __name__ == "__main__":
95
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_complement.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Unit tests for pycute.complement
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ from pycute import *
41
+
42
+ _LOGGER = logging.getLogger(__name__)
43
+
44
+
45
+ class TestComplement(unittest.TestCase):
46
+ def helper_test_complement(self, layout):
47
+ layoutR = complement(layout)
48
+
49
+ _LOGGER.debug(f"{layout} => {layoutR}")
50
+
51
+ # Post-condition: test disjointness of the codomains
52
+ for a in range(size(layout)):
53
+ for b in range(size(layoutR)):
54
+ assert (layout(a) != layoutR(b)) or (layout(a) == 0 and layoutR(b) == 0)
55
+
56
+ def test_complement(self):
57
+ test = Layout(1,0)
58
+ self.helper_test_complement(test)
59
+
60
+ test = Layout(1,1)
61
+ self.helper_test_complement(test)
62
+
63
+ test = Layout(4,0)
64
+ self.helper_test_complement(test)
65
+
66
+ test = Layout((2,4),(1,2))
67
+ self.helper_test_complement(test)
68
+
69
+ test = Layout((2,3),(1,2))
70
+ self.helper_test_complement(test)
71
+
72
+ test = Layout((2,4),(1,4))
73
+ self.helper_test_complement(test)
74
+
75
+ test = Layout((2,4,8),(8,1,64))
76
+ self.helper_test_complement(test)
77
+
78
+ test = Layout(((2,2),(2,2)),((1,4),(8,32)))
79
+ self.helper_test_complement(test)
80
+
81
+ test = Layout((2,(3,4)),(3,(1,6)))
82
+ self.helper_test_complement(test)
83
+
84
+ test = Layout((4,6),(1,6))
85
+ self.helper_test_complement(test)
86
+
87
+ test = Layout((4,10),(1,10))
88
+ self.helper_test_complement(test)
89
+
90
+
91
+ if __name__ == "__main__":
92
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_composition.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Unit tests for pycute.composition
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ from pycute import *
41
+
42
+ _LOGGER = logging.getLogger(__name__)
43
+
44
+
45
+ class TestComposition(unittest.TestCase):
46
+ def helper_test_composition(self, layoutA, layoutB):
47
+ layoutR = composition(layoutA, layoutB)
48
+
49
+ _LOGGER.debug(f"{layoutA} o {layoutB} => {layoutR}")
50
+
51
+ # True post-condition: Every coordinate c of layoutB with L1D(c) < size(layoutR) is a coordinate of layoutR.
52
+
53
+ # Test that R(c) = A(B(c)) for all coordinates c in layoutR
54
+ for i in range(size(layoutR)):
55
+ self.assertEqual(layoutR(i), layoutA(layoutB(i)))
56
+
57
+ def test_composition(self):
58
+ layoutA = Layout(1,0)
59
+ layoutB = Layout(1,0)
60
+ self.helper_test_composition(layoutA, layoutB)
61
+
62
+ layoutA = Layout(1,0)
63
+ layoutB = Layout(1,1)
64
+ self.helper_test_composition(layoutA, layoutB)
65
+
66
+ layoutA = Layout(1,1)
67
+ layoutB = Layout(1,0)
68
+ self.helper_test_composition(layoutA, layoutB)
69
+
70
+ layoutA = Layout(1,1)
71
+ layoutB = Layout(1,1)
72
+ self.helper_test_composition(layoutA, layoutB)
73
+
74
+ layoutA = Layout((4))
75
+ layoutB = Layout((4))
76
+ self.helper_test_composition(layoutA, layoutB)
77
+
78
+ layoutA = Layout((4), (2))
79
+ layoutB = Layout((4))
80
+ self.helper_test_composition(layoutA, layoutB)
81
+
82
+ layoutA = Layout((4))
83
+ layoutB = Layout((4), (2))
84
+ self.helper_test_composition(layoutA, layoutB)
85
+
86
+ layoutA = Layout((4), (0))
87
+ layoutB = Layout((4))
88
+ self.helper_test_composition(layoutA, layoutB)
89
+
90
+ layoutA = Layout((4))
91
+ layoutB = Layout((4), (0))
92
+ self.helper_test_composition(layoutA, layoutB)
93
+
94
+ layoutA = Layout((1), (0))
95
+ layoutB = Layout((4))
96
+ self.helper_test_composition(layoutA, layoutB)
97
+
98
+ layoutA = Layout((4))
99
+ layoutB = Layout((1), (0))
100
+ self.helper_test_composition(layoutA, layoutB)
101
+
102
+ layoutA = Layout((4))
103
+ layoutB = Layout((2))
104
+ self.helper_test_composition(layoutA, layoutB)
105
+
106
+ layoutA = Layout((4), (2))
107
+ layoutB = Layout((2))
108
+ self.helper_test_composition(layoutA, layoutB)
109
+
110
+ layoutA = Layout((4))
111
+ layoutB = Layout((2), (2))
112
+ self.helper_test_composition(layoutA, layoutB)
113
+
114
+ layoutA = Layout((4), (2))
115
+ layoutB = Layout((2), (2))
116
+ self.helper_test_composition(layoutA, layoutB)
117
+
118
+ layoutA = Layout((12))
119
+ layoutB = Layout((4,3))
120
+ self.helper_test_composition(layoutA, layoutB)
121
+
122
+ layoutA = Layout((12), (2))
123
+ layoutB = Layout((4,3))
124
+ self.helper_test_composition(layoutA, layoutB)
125
+
126
+ layoutA = Layout((12))
127
+ layoutB = Layout((4,3), (3,1))
128
+ self.helper_test_composition(layoutA, layoutB)
129
+
130
+ layoutA = Layout((12), (2))
131
+ layoutB = Layout((4,3), (3,1))
132
+ self.helper_test_composition(layoutA, layoutB)
133
+
134
+ layoutA = Layout((12))
135
+ layoutB = Layout((2,3), (2,4))
136
+ self.helper_test_composition(layoutA, layoutB)
137
+
138
+ layoutA = Layout((4,3))
139
+ layoutB = Layout((4,3))
140
+ self.helper_test_composition(layoutA, layoutB)
141
+
142
+ layoutA = Layout((4,3))
143
+ layoutB = Layout((12))
144
+ self.helper_test_composition(layoutA, layoutB)
145
+
146
+ layoutA = Layout((4,3))
147
+ layoutB = Layout((6), (2))
148
+ self.helper_test_composition(layoutA, layoutB)
149
+
150
+ layoutA = Layout((4,3))
151
+ layoutB = Layout((6,2), (2,1))
152
+ self.helper_test_composition(layoutA, layoutB)
153
+
154
+ layoutA = Layout((4,3), (3,1))
155
+ layoutB = Layout((4,3))
156
+ self.helper_test_composition(layoutA, layoutB)
157
+
158
+ layoutA = Layout((4,3), (3,1))
159
+ layoutB = Layout((12))
160
+ self.helper_test_composition(layoutA, layoutB)
161
+
162
+ layoutA = Layout((4,3), (3,1))
163
+ layoutB = Layout((6), (2))
164
+ self.helper_test_composition(layoutA, layoutB)
165
+
166
+ layoutA = Layout((4,3), (3,1))
167
+ layoutB = Layout((6,2), (2,1))
168
+ self.helper_test_composition(layoutA, layoutB)
169
+
170
+ layoutA = Layout((8,8))
171
+ layoutB = Layout(((2,2,2), (2,2,2)),((1,16,4), (8,2,32)))
172
+ self.helper_test_composition(layoutA, layoutB)
173
+
174
+ layoutA = Layout((8,8), (8,1))
175
+ layoutB = Layout(((2,2,2), (2,2,2)),((1,16,4), (8,2,32)))
176
+ self.helper_test_composition(layoutA, layoutB)
177
+
178
+ layoutA = Layout(((2,2,2), (2,2,2)),((1,16,4), (8,2,32)))
179
+ layoutB = Layout(8, 4)
180
+ self.helper_test_composition(layoutA, layoutB)
181
+
182
+ layoutA = Layout(((4,2)), ((1,16)))
183
+ layoutB = Layout((4,2), (2,1))
184
+ self.helper_test_composition(layoutA, layoutB)
185
+
186
+ layoutA = Layout((2,2), (2,1))
187
+ layoutB = Layout((2,2), (2,1))
188
+ self.helper_test_composition(layoutA, layoutB)
189
+
190
+ layoutA = Layout((4,8,2))
191
+ layoutB = Layout((2,2,2), (2,8,1))
192
+ self.helper_test_composition(layoutA, layoutB)
193
+
194
+ layoutA = Layout((4,8,2), (2,8,1))
195
+ layoutB = Layout((2,2,2), (1,8,2))
196
+ self.helper_test_composition(layoutA, layoutB)
197
+
198
+ layoutA = Layout((4,8,2), (2,8,1))
199
+ layoutB = Layout((4,2,2), (2,8,1))
200
+ self.helper_test_composition(layoutA, layoutB)
201
+
202
+ # Pre-coalesced LHS
203
+ layoutA = Layout((4,6,8),(1,4,7))
204
+ layoutB = Layout((6),(1))
205
+ self.helper_test_composition(layoutA, layoutB)
206
+
207
+ # Mid-layout truncation
208
+ layoutA = Layout((4,6,8,10),(2,3,5,7))
209
+ layoutB = Layout(6,12)
210
+ self.helper_test_composition(layoutA, layoutB)
211
+
212
+ if __name__ == "__main__":
213
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_int_tuple.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Unit tests for pycute.int_tuple
35
+ """
36
+
37
+ import unittest
38
+
39
+ from pycute import *
40
+
41
+
42
+ class TestIntTuple(unittest.TestCase):
43
+ def test_product(self):
44
+ self.assertEqual(product(2), 2)
45
+
46
+ self.assertEqual(product((3,2)), 6)
47
+
48
+ self.assertEqual(product(product(((2,3),4))), 24)
49
+
50
+ def test_inner_product(self):
51
+ self.assertEqual(inner_product(2, 3), 6)
52
+
53
+ self.assertEqual(inner_product((1,2), (3,2)), 7)
54
+
55
+ self.assertEqual(inner_product(((2,3),4), ((2,1),2)), 15)
56
+
57
+ def test_shape_div(self):
58
+ self.assertEqual(shape_div((3,4), 6), (1,2))
59
+
60
+ self.assertEqual(shape_div((3,4), 12), (1,1))
61
+
62
+ self.assertEqual(shape_div((3,4), 36), (1,1))
63
+
64
+ self.assertEqual(shape_div(((3,4),6), 36), ((1,1),2))
65
+
66
+ self.assertEqual(shape_div((6,(3,4)), 36), (1,(1,2)))
67
+
68
+ def test_prefix_product(self):
69
+ self.assertEqual(prefix_product(2), 1)
70
+
71
+ self.assertEqual(prefix_product((3,2)), (1,3))
72
+
73
+ self.assertEqual(prefix_product((3,2,4)), (1,3,6))
74
+
75
+ self.assertEqual(prefix_product(((2,3),4)), ((1,2),6))
76
+
77
+ self.assertEqual(prefix_product(((2,3),(2, 1, 2),( 5, 2, 1))),
78
+ ((1,2),(6,12,12),(24,120,240)))
79
+
80
+
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_left_inverse.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Unit tests for pycute.left_inverse
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ from pycute import *
41
+
42
+ _LOGGER = logging.getLogger(__name__)
43
+
44
+
45
+ class TestLeftInverse(unittest.TestCase):
46
+ def helper_test_left_inverse(self, layout):
47
+ inv_layout = left_inverse(layout)
48
+
49
+ _LOGGER.debug(f"{layout} => {inv_layout}")
50
+
51
+ for i in range(size(layout)):
52
+ self.assertEqual(inv_layout(layout(i)), i)
53
+
54
+ def test_left_inverse(self):
55
+ test = Layout(1,0)
56
+ self.helper_test_left_inverse(test)
57
+
58
+ test = Layout((1,1),(0,0))
59
+ self.helper_test_left_inverse(test)
60
+
61
+ test = Layout(1,1)
62
+ self.helper_test_left_inverse(test)
63
+
64
+ test = Layout(4,1)
65
+ self.helper_test_left_inverse(test)
66
+
67
+ test = Layout(4,2)
68
+ self.helper_test_left_inverse(test)
69
+
70
+ test = Layout((8,4),(1,8))
71
+ self.helper_test_left_inverse(test)
72
+
73
+ test = Layout((8,4),(4,1))
74
+ self.helper_test_left_inverse(test)
75
+
76
+ test = Layout((2,4,6),(1,2,8))
77
+ self.helper_test_left_inverse(test)
78
+
79
+ test = Layout((2,4,6),(4,1,8))
80
+ self.helper_test_left_inverse(test)
81
+
82
+ test = Layout((4,2),(1,16))
83
+ self.helper_test_left_inverse(test)
84
+
85
+
86
+ if __name__ == "__main__":
87
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_right_inverse.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Unit tests for pycute.left_inverse
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+
40
+ from pycute import *
41
+
42
+ _LOGGER = logging.getLogger(__name__)
43
+
44
+
45
+ class TestRightInverse(unittest.TestCase):
46
+ def helper_test_right_inverse(self, layout):
47
+ inv_layout = right_inverse(layout)
48
+
49
+ _LOGGER.debug(f"{layout} => {inv_layout}")
50
+
51
+ for i in range(size(inv_layout)):
52
+ self.assertEqual(layout(inv_layout(i)), i)
53
+
54
+ def test_right_inverse(self):
55
+ test = Layout(1,0)
56
+ self.helper_test_right_inverse(test)
57
+
58
+ test = Layout((1,1),(0,0))
59
+ self.helper_test_right_inverse(test)
60
+
61
+ test = Layout((3,7),(0,0))
62
+ self.helper_test_right_inverse(test)
63
+
64
+ test = Layout(1,1)
65
+ self.helper_test_right_inverse(test)
66
+
67
+ test = Layout(4,0)
68
+ self.helper_test_right_inverse(test)
69
+
70
+ test = Layout(4,1)
71
+ self.helper_test_right_inverse(test)
72
+
73
+ test = Layout(4,2)
74
+ self.helper_test_right_inverse(test)
75
+
76
+ test = Layout((2,4),(0,2))
77
+ self.helper_test_right_inverse(test)
78
+
79
+ test = Layout((8,4),(1,8))
80
+ self.helper_test_right_inverse(test)
81
+
82
+ test = Layout((8,4),(4,1))
83
+ self.helper_test_right_inverse(test)
84
+
85
+ test = Layout((2,4,6),(1,2,8))
86
+ self.helper_test_right_inverse(test)
87
+
88
+ test = Layout((2,4,6),(4,1,8))
89
+ self.helper_test_right_inverse(test)
90
+
91
+ test = Layout((4,2),(1,16))
92
+ self.helper_test_right_inverse(test)
93
+
94
+
95
+ if __name__ == "__main__":
96
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/python/pycute/test_typing.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #################################################################################################
2
+ #
3
+ # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright notice, this
10
+ # list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the copyright holder nor the names of its
17
+ # contributors may be used to endorse or promote products derived from
18
+ # this software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ #
31
+ #################################################################################################
32
+
33
+ """
34
+ Unit tests for pycute.typing
35
+ """
36
+
37
+ import logging
38
+ import unittest
39
+ from pycute import *
40
+
41
+ _LOGGER = logging.getLogger(__name__)
42
+
43
+
44
+ class TestTyping(unittest.TestCase):
45
+ def helper_test_typing(self, _cls, _obj, cls, expected: bool):
46
+ _LOGGER.debug(f"issubclass({_cls}, {cls})")
47
+ _LOGGER.debug(f"isinstance({_obj}, {cls})")
48
+
49
+ self.assertEqual(expected, issubclass(_cls, cls))
50
+ self.assertEqual(expected, isinstance(_obj, cls))
51
+
52
+ def test_typing(self):
53
+ self.helper_test_typing(int, 1, Integer, True)
54
+ self.helper_test_typing(float, 1., Integer, False)
55
+ self.helper_test_typing(str, 'hi', Integer, False)
56
+ self.helper_test_typing(bool, False, Integer, False)
57
+
58
+ if __name__ == '__main__':
59
+ unittest.main()
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/common/cutlass_unit_test.h ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+ #pragma warning (disable : 4068 ) /* disable unknown pragma warnings for visual studio */
34
+
35
+ #pragma nv_diag_suppress boolean_controlling_expr_is_constant
36
+ #include <gtest/gtest.h>
37
+ #pragma nv_diag_warning boolean_controlling_expr_is_constant
38
+ #pragma warning( disable : 4503)
39
+
40
+ #include <cstdlib>
41
+ #include <string>
42
+
43
+ #include <cuda_runtime_api.h>
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ /// Gets a CUDA device
48
+ cudaDeviceProp GetCudaDevice();
49
+
50
+ /// Prints device properties
51
+ std::ostream &operator<<(std::ostream &out, cudaDeviceProp const &device);
52
+
53
+ /////////////////////////////////////////////////////////////////////////////////////////////////
54
+
55
+ /// Sets flags for Unit test
56
+ void FilterArchitecture();
57
+
58
+ /////////////////////////////////////////////////////////////////////////////////////////////////
59
+
60
+ /// Reads environment variable `CUTLASS_UNIT_TEST_PROBLEM_COUNT` to control the number and order
61
+ // of problem sizes run by CUTLASS unit tests
62
+ int CutlassUnitTestProblemCount();
63
+
64
+ /////////////////////////////////////////////////////////////////////////////////////////////////
65
+
66
+ // active test macro
67
+ #define CUTLASS_TEST_LEVEL_ACTIVE(LEVEL,NAME_STATIC,NAME_DYNAMIC,...) \
68
+ TEST(NAME_STATIC,L##LEVEL##_##NAME_DYNAMIC) __VA_ARGS__
69
+
70
+ // disabled test macro
71
+ #define CUTLASS_TEST_LEVEL_DISABLED(LEVEL,NAME_STATIC,NAME_DYNAMIC,...) \
72
+ TEST(NAME_STATIC,DISABLED_L##LEVEL##_##NAME_DYNAMIC) {}
73
+
74
+ #if CUTLASS_TEST_LEVEL == 0
75
+ #define CUTLASS_TEST_L0(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(0,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
76
+ #define CUTLASS_TEST_L1(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_DISABLED(1,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
77
+ #define CUTLASS_TEST_L2(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_DISABLED(2,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
78
+ #elif CUTLASS_TEST_LEVEL == 1
79
+ #define CUTLASS_TEST_L0(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(0,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
80
+ #define CUTLASS_TEST_L1(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(1,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
81
+ #define CUTLASS_TEST_L2(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_DISABLED(2,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
82
+ #else
83
+ #define CUTLASS_TEST_L0(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(0,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
84
+ #define CUTLASS_TEST_L1(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(1,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
85
+ #define CUTLASS_TEST_L2(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(2,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__)
86
+ #endif
87
+
88
+ #if !defined(CUTLASS_TEST_UNIT_ENABLE_WARNINGS)
89
+ #define CUTLASS_TEST_UNIT_ENABLE_WARNINGS false
90
+ #endif
91
+
92
+ #if (__CUDACC_VER_MAJOR__ >= 12)
93
+ #define CUDA_12_0_SM90_FEATURES_SUPPORTED true
94
+ #else
95
+ #define CUDA_12_0_SM90_FEATURES_SUPPORTED false
96
+ #endif
97
+
98
+ #include <cutlass/cutlass.h>
99
+ #include <cutlass/numeric_types.h>
100
+ #include <cutlass/trace.h>
101
+
102
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/cache_testbed_output.h ADDED
@@ -0,0 +1,907 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Helper to construct cached name for
33
+ */
34
+ #pragma once
35
+
36
+ #include <typeinfo>
37
+ #include <fstream>
38
+ #include <list>
39
+ #include <utility>
40
+ #include <sstream>
41
+
42
+ #include "cutlass/cutlass.h"
43
+ #include "cutlass/layout/matrix.h"
44
+ #include "cutlass/conv/convolution.h"
45
+ #include "cutlass/conv/conv2d_problem_size.h"
46
+
47
+ #include "cutlass/conv/conv3d_problem_size.h"
48
+ #include "cutlass/core_io.h"
49
+ #include "cutlass/util/tensor_view_io.h"
50
+
51
+ #include "thrust/universal_vector.h"
52
+
53
+ #ifndef CUTLASS_TEST_ENABLE_CACHED_RESULTS
54
+ #define CUTLASS_TEST_ENABLE_CACHED_RESULTS false
55
+ #endif
56
+
57
+ /////////////////////////////////////////////////////////////////////////////////////////////////
58
+
59
+ namespace test::conv::device {
60
+
61
+ /////////////////////////////////////////////////////////////////////////////////////////////////
62
+
63
+ /// Result of a test
64
+ struct CachedTestKey {
65
+
66
+ std::string op; ///< Concatenated string representation of operation performed
67
+ std::string problem; ///< Concatenated string representation of problem description
68
+ std::string types; ///< Concatenated string representation of operand types
69
+ uint32_t A; ///< Hashed result of tensor A
70
+ uint32_t B; ///< Hashed result of tensor B
71
+ uint32_t C; ///< Hashed result of tensor C
72
+
73
+ //
74
+ // Methods
75
+ //
76
+ inline CachedTestKey(): A(), B(), C() { }
77
+
78
+ inline CachedTestKey(
79
+ std::string op, ///< Concatenated string representation of operation performed
80
+ std::string problem, ///< Concatenated string representation of problem description
81
+ std::string types, ///< Concatenated string representation of operand types
82
+ uint32_t A, ///< Hashed result of tensor A
83
+ uint32_t B, ///< Hashed result of tensor B
84
+ uint32_t C ///< Hashed result of tensor C
85
+ ):
86
+ op(op), problem(problem), types(types), A(A), B(B), C(C)
87
+ { }
88
+
89
+ /// Checks for equality of the problem
90
+ bool operator==(CachedTestKey const &rhs) const {
91
+ return op == rhs.op && problem == rhs.problem && types == rhs.types && A == rhs.A && B == rhs.B && C == rhs.C;
92
+ }
93
+ };
94
+
95
+ /////////////////////////////////////////////////////////////////////////////////////////////////
96
+
97
+ inline std::istream &operator>>(std::istream &in, CachedTestKey &result) {
98
+
99
+ in >> result.op;
100
+ in >> result.problem;
101
+ in >> result.types;
102
+ in >> result.A;
103
+ in >> result.B;
104
+ in >> result.C;
105
+
106
+ return in;
107
+ }
108
+
109
+ inline std::ostream &operator<<(std::ostream &out, CachedTestKey const &result) {
110
+
111
+ out << result.op << " ";
112
+ out << result.problem << " ";
113
+ out << result.types << " ";
114
+ out << result.A << " ";
115
+ out << result.B << " ";
116
+ out << result.C << " ";
117
+
118
+ return out;
119
+ }
120
+
121
+ /////////////////////////////////////////////////////////////////////////////////////////////////
122
+
123
+ struct CachedTestResult {
124
+ uint32_t D;
125
+ //
126
+ // Methods
127
+ //
128
+
129
+ CachedTestResult(): D()
130
+ { }
131
+
132
+ CachedTestResult(uint32_t D): D(D)
133
+ { }
134
+
135
+ operator bool() const {
136
+ return bool(D);
137
+ }
138
+ };
139
+
140
+ /////////////////////////////////////////////////////////////////////////////////////////////////
141
+
142
+ inline std::istream &operator>>(std::istream &in, CachedTestResult &result) {
143
+ in >> result.D;
144
+ return in;
145
+ }
146
+
147
+ inline std::ostream &operator<<(std::ostream &out, CachedTestResult const &result) {
148
+ out << result.D;
149
+ return out;
150
+ }
151
+
152
+ /////////////////////////////////////////////////////////////////////////////////////////////////
153
+
154
+ struct CachedTestResultListing {
155
+
156
+ std::list<std::pair<CachedTestKey, CachedTestResult>> results;
157
+
158
+ //
159
+ // Methods
160
+ //
161
+
162
+ inline CachedTestResultListing(std::string const &path) {
163
+ std::ifstream file(path);
164
+
165
+ while (file.good()) {
166
+ CachedTestKey key;
167
+ file >> key;
168
+
169
+ CachedTestResult result;
170
+ file >> result;
171
+
172
+ if (result) {
173
+ results.push_back(std::make_pair(key, result));
174
+ }
175
+ }
176
+ }
177
+
178
+ /// Returns the cached result
179
+ std::pair<bool, CachedTestResult> find(CachedTestKey const &rhs) const {
180
+ for (auto const & result : results) {
181
+ if (result.first == rhs) {
182
+ return std::make_pair(true, result.second);
183
+ }
184
+ }
185
+ return std::make_pair(false, CachedTestResult());
186
+ }
187
+
188
+ /// Appends an entry
189
+ void append(CachedTestKey const &key, CachedTestResult const &result) {
190
+ if (result) {
191
+ results.push_back(std::make_pair(key, result));
192
+ }
193
+ }
194
+
195
+ /// Writes the entire listing to a file
196
+ bool write(std::string const &path) {
197
+ std::ofstream file(path);
198
+ if (!file.good()) {
199
+ return false;
200
+ }
201
+
202
+ for (auto const &result : results) {
203
+ file << result.first << result.second << std::endl;
204
+ }
205
+
206
+ return true;
207
+ }
208
+ };
209
+
210
+ /////////////////////////////////////////////////////////////////////////////////////////////////
211
+
212
+ template <typename Element>
213
+ struct ScalarEncoder {
214
+ Element scalar;
215
+
216
+ ScalarEncoder(Element s): scalar(s) { }
217
+
218
+ std::string str() const {
219
+ std::stringstream ss;
220
+ Element s = scalar;
221
+ if (s < Element()) {
222
+ s = -s;
223
+ ss << "n";
224
+ }
225
+ ss << s;
226
+ return ss.str();
227
+ }
228
+ };
229
+
230
+ template <typename Element>
231
+ ScalarEncoder<Element> EncodeScalar(Element a) {
232
+ return ScalarEncoder<Element>(a);
233
+ }
234
+
235
+ template <typename Element>
236
+ struct ScalarEncoder<cutlass::complex<Element>> {
237
+ cutlass::complex<Element> scalar;
238
+
239
+ ScalarEncoder(cutlass::complex<Element> s): scalar(s) { }
240
+
241
+ std::string str() const {
242
+ std::stringstream ss;
243
+ ss << EncodeScalar<Element>(scalar.real()) << "_" << EncodeScalar<Element>(scalar.imag()) << "i";
244
+ return ss.str();
245
+ }
246
+ };
247
+
248
+ template <typename Element>
249
+ std::ostream &operator<<(std::ostream &out, ScalarEncoder<Element> const &scalar) {
250
+ out << scalar.str();
251
+ return out;
252
+ }
253
+
254
+ /////////////////////////////////////////////////////////////////////////////////////////////////
255
+
256
+ inline char const *EncodeOperator(cutlass::conv::Operator conv_op) {
257
+ switch (conv_op) {
258
+ case cutlass::conv::Operator::kFprop: return "fprop";
259
+ case cutlass::conv::Operator::kDgrad: return "dgrad";
260
+ case cutlass::conv::Operator::kWgrad: return "wgrad";
261
+ case cutlass::conv::Operator::kDeconv: return "deconv";
262
+ }
263
+ return "conv_unknown";
264
+ }
265
+
266
+ /////////////////////////////////////////////////////////////////////////////////////////////////
267
+
268
+ // Encode GemmCoord (Gemm problem size)
269
+ inline std::ostream &EncodeProblemSize(
270
+ std::ostream &out,
271
+ cutlass::gemm::GemmCoord const &problem) {
272
+
273
+ out << problem.m() << "x" << problem.n() << "x" << problem.k() << "_";
274
+
275
+ return out;
276
+ }
277
+
278
+ /////////////////////////////////////////////////////////////////////////////////////////////////
279
+ // Encode Conv2dProblemSize
280
+ inline std::ostream &EncodeProblemSize(
281
+ std::ostream &out,
282
+ cutlass::conv::Conv2dProblemSize const &problem) {
283
+
284
+ out << problem.N << "x" << problem.H << "x" << problem.W << "x" << problem.C << "_"
285
+ << problem.P << "x" << problem.Q << "_" << problem.K << "x" << problem.R << "x" << problem.S << "_";
286
+
287
+ out << "pad_h" << problem.pad_h << "w" << problem.pad_w << "_";
288
+ out << "stride_h" << problem.stride_h << "w" << problem.stride_w << "_";
289
+ out << "dil_h" << problem.dilation_h << "w" << problem.dilation_w << "_";
290
+
291
+ switch (problem.mode) {
292
+ case cutlass::conv::Mode::kCrossCorrelation:
293
+ out << "corr";
294
+ break;
295
+ case cutlass::conv::Mode::kConvolution:
296
+ out << "conv";
297
+ break;
298
+ }
299
+
300
+ return out;
301
+ }
302
+
303
+ /////////////////////////////////////////////////////////////////////////////////////////////////
304
+
305
+ // Encode Conv3dProblemSize
306
+ inline std::ostream &EncodeProblemSize(
307
+ std::ostream &out,
308
+ cutlass::conv::Conv3dProblemSize const &problem) {
309
+
310
+ out << problem.N << "x" << problem.D << "x" << problem.H << "x" << problem.W << "x" << problem.C << "_"
311
+ << problem.Z << problem.P << "x" << problem.Q << "_" << problem.K << "x" << problem.R << "x" << problem.S << "_";
312
+
313
+ out << "pad_d" << problem.pad_h << "h" << problem.pad_h << "w" << problem.pad_w << "_";
314
+ out << "stride_d" << problem.stride_d << "h" << problem.stride_h << "w" << problem.stride_w << "_";
315
+ out << "dil_d" << problem.dilation_d << "h" << problem.dilation_h << "w" << problem.dilation_w << "_";
316
+
317
+ switch (problem.mode) {
318
+ case cutlass::conv::Mode::kCrossCorrelation:
319
+ out << "corr";
320
+ break;
321
+ case cutlass::conv::Mode::kConvolution:
322
+ out << "conv";
323
+ break;
324
+ }
325
+
326
+ return out;
327
+ }
328
+
329
+ /////////////////////////////////////////////////////////////////////////////////////////////////
330
+ // Encode 3.x ConvNd ProblemShape
331
+ template <class ProblemShape>
332
+ inline std::ostream &EncodeProblemSize(
333
+ std::ostream &out,
334
+ ProblemShape const& problem_shape) {
335
+
336
+ out << problem_shape.shape_A << "_";
337
+ out << problem_shape.shape_B << "_";
338
+
339
+ out << "padl" << problem_shape.lower_padding << "_";
340
+ out << "padu" << problem_shape.upper_padding << "_";
341
+ out << "str" << problem_shape.traversal_stride << "_";
342
+ out << "dil" << problem_shape.dilation << "_";
343
+
344
+ switch (problem_shape.mode) {
345
+ case cutlass::conv::Mode::kCrossCorrelation:
346
+ out << "corr";
347
+ break;
348
+ case cutlass::conv::Mode::kConvolution:
349
+ out << "conv";
350
+ break;
351
+ }
352
+
353
+ return out;
354
+ }
355
+
356
+ /////////////////////////////////////////////////////////////////////////////////////////////////
357
+
358
+ template <typename Element>
359
+ inline std::string ElementTypeName() {
360
+ return std::string(typeid(Element).name());
361
+ }
362
+
363
+ template <>
364
+ inline std::string ElementTypeName<cutlass::half_t>() {
365
+ return "h";
366
+ }
367
+
368
+ template <>
369
+ inline std::string ElementTypeName<cutlass::complex<cutlass::half_t>>() {
370
+ return "ch";
371
+ }
372
+
373
+ template <>
374
+ inline std::string ElementTypeName<cutlass::bfloat16_t>() {
375
+ return "bf16";
376
+ }
377
+
378
+ template <>
379
+ inline std::string ElementTypeName<cutlass::complex<cutlass::bfloat16_t>>() {
380
+ return "cbf16";
381
+ }
382
+
383
+ template <>
384
+ inline std::string ElementTypeName<cutlass::tfloat32_t>() {
385
+ return "tf32";
386
+ }
387
+
388
+ template <>
389
+ inline std::string ElementTypeName<cutlass::complex<cutlass::tfloat32_t>>() {
390
+ return "ctf32";
391
+ }
392
+
393
+ template <>
394
+ inline std::string ElementTypeName<cutlass::complex<float>>() {
395
+ return "c";
396
+ }
397
+
398
+ template <>
399
+ inline std::string ElementTypeName<cutlass::complex<double>>() {
400
+ return "z";
401
+ }
402
+
403
+ template <>
404
+ inline std::string ElementTypeName<cutlass::Quaternion<float>>() {
405
+ return "q";
406
+ }
407
+
408
+ template <>
409
+ inline std::string ElementTypeName<int8_t>() {
410
+ return "s8";
411
+ }
412
+
413
+ template <>
414
+ inline std::string ElementTypeName<uint8_t>() {
415
+ return "u8";
416
+ }
417
+
418
+ template <>
419
+ inline std::string ElementTypeName<cutlass::int4b_t>() {
420
+ return "s4";
421
+ }
422
+
423
+ template <>
424
+ inline std::string ElementTypeName<cutlass::uint4b_t>() {
425
+ return "u4";
426
+ }
427
+
428
+ /////////////////////////////////////////////////////////////////////////////////////////////////
429
+
430
+ template <typename Layout>
431
+ inline std::string LayoutTypeName() {
432
+ return std::string(typeid(Layout).name());
433
+ }
434
+
435
+ template <>
436
+ inline std::string LayoutTypeName<cutlass::layout::ColumnMajor>() {
437
+ return "n";
438
+ }
439
+
440
+ template <>
441
+ inline std::string LayoutTypeName<cutlass::layout::RowMajor>() {
442
+ return "t";
443
+ }
444
+
445
+ template <>
446
+ inline std::string LayoutTypeName<cutlass::layout::TensorNHWC>() {
447
+ return "nhwc";
448
+ }
449
+
450
+ template <>
451
+ inline std::string LayoutTypeName<cutlass::layout::TensorNCxHWx<32>>() {
452
+ return "nc32hw32";
453
+ }
454
+
455
+ template <>
456
+ inline std::string LayoutTypeName<cutlass::layout::TensorNCxHWx<64>>() {
457
+ return "nc64hw64";
458
+ }
459
+
460
+ template <>
461
+ inline std::string LayoutTypeName<cutlass::layout::TensorCxRSKx<32>>() {
462
+ return "c32rsk32";
463
+ }
464
+
465
+ template <>
466
+ inline std::string LayoutTypeName<cutlass::layout::TensorCxRSKx<64>>() {
467
+ return "c64rsk64";
468
+ }
469
+
470
+ template <>
471
+ inline std::string LayoutTypeName<cutlass::layout::TensorNDHWC>() {
472
+ return "ndhwc";
473
+ }
474
+
475
+ /////////////////////////////////////////////////////////////////////////////////////////////////
476
+
477
+ template <typename Element, typename Layout>
478
+ inline std::string TensorTypeName() {
479
+ std::stringstream ss;
480
+ ss << ElementTypeName<Element>() << LayoutTypeName<Layout>();
481
+ return ss.str();
482
+ }
483
+
484
+ template <typename Element>
485
+ inline std::string TensorTypeName() {
486
+ std::stringstream ss;
487
+ ss << ElementTypeName<Element>();
488
+ return ss.str();
489
+ }
490
+ /////////////////////////////////////////////////////////////////////////////////////////////////
491
+
492
+ /// Hash function on a byte array
493
+ struct CRC32 {
494
+
495
+ uint32_t table[256];
496
+
497
+ //
498
+ // Methods
499
+ //
500
+
501
+ CRC32() {
502
+
503
+ uint32_t rem;
504
+ int i, j;
505
+
506
+ for (i = 0; i < 256; i++) {
507
+ rem = i;
508
+ for (j = 0; j < 8; j++) {
509
+ if (rem & 1) {
510
+ rem >>= 1;
511
+ rem ^= 0xedb88320;
512
+ } else
513
+ rem >>= 1;
514
+ }
515
+ table[i] = rem;
516
+ }
517
+ }
518
+
519
+ /// Computes the CRC of an array of bytes
520
+ uint32_t operator()(void const *start, size_t length, uint32_t crc = uint32_t()) const {
521
+ uint8_t const *p = static_cast<uint8_t const *>(start);
522
+ uint8_t const *q = static_cast<uint8_t const *>(start) + length;
523
+
524
+ crc = ~crc;
525
+
526
+ for (; p != q; ++p) {
527
+ uint8_t octet = *p;
528
+ crc = (crc >> 8) ^ table[(crc & 0xff) ^ octet];
529
+ }
530
+
531
+ return ~crc;
532
+ }
533
+ };
534
+
535
+ /////////////////////////////////////////////////////////////////////////////////////////////////
536
+
537
+ template <
538
+ typename Element, typename Layout
539
+ >
540
+ uint32_t TensorHash(
541
+ cutlass::TensorView<Element, Layout> view,
542
+ CRC32 const &hash = CRC32(),
543
+ uint32_t crc = uint32_t()
544
+ ) {
545
+
546
+ return hash(view.data(), view.capacity() * cutlass::sizeof_bits<Element>::value / 8, crc);
547
+ }
548
+
549
+ template <typename Element>
550
+ uint32_t TensorHash(
551
+ thrust::universal_vector<Element>& tensor,
552
+ CRC32 const &hash = CRC32(),
553
+ uint32_t crc = uint32_t()
554
+ ) {
555
+
556
+ return hash(tensor.data().get(), tensor.size() * cutlass::sizeof_bits<Element>::value / 8, crc);
557
+ }
558
+
559
+ /////////////////////////////////////////////////////////////////////////////////////////////////
560
+
561
+ template <
562
+ typename ElementA, typename LayoutA,
563
+ typename ElementB, typename LayoutB,
564
+ typename ElementC, typename LayoutC,
565
+ typename ElementAccumulator,
566
+ typename ElementCompute
567
+ >
568
+ inline std::ostream &EncodeTypes(
569
+ std::ostream &out
570
+ ) {
571
+
572
+ out << TensorTypeName<ElementA, LayoutA>() << "_"
573
+ << TensorTypeName<ElementB, LayoutB>() << "_"
574
+ << TensorTypeName<ElementC, LayoutC>() << "_"
575
+ << ElementTypeName<ElementAccumulator>() << "_"
576
+ << ElementTypeName<ElementCompute>();
577
+
578
+ return out;
579
+ }
580
+
581
+ template <
582
+ typename ElementA,
583
+ typename ElementB,
584
+ typename ElementC,
585
+ typename ElementD
586
+ >
587
+ inline std::ostream &EncodeTypes(
588
+ std::ostream &out
589
+ ) {
590
+
591
+ out << TensorTypeName<ElementA>() << "_"
592
+ << TensorTypeName<ElementB>() << "_"
593
+ << TensorTypeName<ElementC>() << "_"
594
+ << ElementTypeName<ElementD>();
595
+
596
+ return out;
597
+ }
598
+ /////////////////////////////////////////////////////////////////////////////////////////////////
599
+
600
+ template <
601
+ typename ElementA, typename LayoutA,
602
+ typename ElementB, typename LayoutB,
603
+ typename ElementC, typename LayoutC,
604
+ typename ElementAccumulator,
605
+ typename ElementCompute
606
+ >
607
+ inline CachedTestKey CreateCachedGemmTestKey(
608
+ cutlass::gemm::GemmCoord const &problem,
609
+ ElementCompute alpha,
610
+ ElementCompute beta,
611
+ cutlass::TensorView<ElementA, LayoutA> A,
612
+ cutlass::TensorView<ElementB, LayoutB> B,
613
+ cutlass::TensorView<ElementC, LayoutC> C
614
+ ) {
615
+
616
+ CachedTestKey key;
617
+
618
+ // Encode gemm operator and problem sizes
619
+ key.op = "gemm";
620
+
621
+ std::stringstream ss_problem;
622
+ EncodeProblemSize(ss_problem, problem);
623
+ ss_problem << "_alpha" << EncodeScalar(alpha) << "_beta" << EncodeScalar(beta);
624
+ key.problem = ss_problem.str();
625
+
626
+ // Encode problem data types
627
+ std::stringstream ss_types;
628
+ EncodeTypes<
629
+ ElementA, LayoutA,
630
+ ElementB, LayoutB,
631
+ ElementC, LayoutC,
632
+ ElementAccumulator,
633
+ ElementCompute>(ss_types);
634
+ key.types = ss_types.str();
635
+
636
+ // Encode hash for problem data
637
+ CRC32 crc_hash;
638
+ key.A = TensorHash(A, crc_hash);
639
+ key.B = TensorHash(B, crc_hash);
640
+ key.C = TensorHash(C, crc_hash);
641
+
642
+ return key;
643
+ }
644
+
645
+ /////////////////////////////////////////////////////////////////////////////////////////////////
646
+
647
+
648
+ template <
649
+ typename ElementA, typename LayoutA,
650
+ typename ElementB, typename LayoutB,
651
+ typename ElementC, typename LayoutC,
652
+ typename ElementAccumulator,
653
+ typename ElementCompute
654
+ >
655
+ inline CachedTestKey CreateCachedConv2dTestKey(
656
+
657
+ cutlass::conv::Operator conv_operator,
658
+ cutlass::conv::Conv2dProblemSize const &problem,
659
+ ElementCompute alpha,
660
+ ElementCompute beta,
661
+ cutlass::TensorView<ElementA, LayoutA> A,
662
+ cutlass::TensorView<ElementB, LayoutB> B,
663
+ cutlass::TensorView<ElementC, LayoutC> C
664
+ ) {
665
+
666
+ CachedTestKey key;
667
+
668
+ // Encode conv2d operator and problem sizes
669
+ key.op = "conv2d";
670
+
671
+ std::stringstream ss_problem;
672
+ ss_problem << EncodeOperator(conv_operator) << "_";
673
+ EncodeProblemSize(ss_problem, problem);
674
+ ss_problem << "_alpha" << EncodeScalar(alpha) << "_beta" << EncodeScalar(beta);
675
+
676
+ key.problem = ss_problem.str();
677
+
678
+ // Encode problem data types
679
+ std::stringstream ss_types;
680
+ EncodeTypes<
681
+ ElementA, LayoutA,
682
+ ElementB, LayoutB,
683
+ ElementC, LayoutC,
684
+ ElementAccumulator,
685
+ ElementCompute>(ss_types);
686
+ key.types = ss_types.str();
687
+
688
+ // Encode hash for problem data
689
+ CRC32 crc_hash;
690
+
691
+ key.A = TensorHash(A, crc_hash);
692
+ key.B = TensorHash(B, crc_hash);
693
+ key.C = TensorHash(C, crc_hash);
694
+
695
+ return key;
696
+ }
697
+
698
+ /////////////////////////////////////////////////////////////////////////////////////////////////
699
+
700
+ template <
701
+ typename ElementA, typename LayoutA,
702
+ typename ElementB, typename LayoutB,
703
+ typename ElementC, typename LayoutC,
704
+ typename ElementAccumulator,
705
+ typename ElementCompute
706
+ >
707
+ inline CachedTestKey CreateCachedConv2dWithBroadcastTestKey(
708
+
709
+ cutlass::conv::Operator conv_operator,
710
+ cutlass::conv::Conv2dProblemSize const &problem,
711
+ ElementCompute alpha,
712
+ ElementCompute beta,
713
+ cutlass::TensorView<ElementA, LayoutA> A,
714
+ cutlass::TensorView<ElementB, LayoutB> B,
715
+ cutlass::TensorView<ElementC, LayoutC> C
716
+ ) {
717
+
718
+ CachedTestKey key;
719
+
720
+ // Encode conv2d operator and problem sizes
721
+ key.op = "conv2d_with_broadcast";
722
+
723
+ std::stringstream ss_problem;
724
+ ss_problem << EncodeOperator(conv_operator) << "_";
725
+ EncodeProblemSize(ss_problem, problem);
726
+ ss_problem << "_alpha" << EncodeScalar(alpha) << "_beta" << EncodeScalar(beta);
727
+
728
+ key.problem = ss_problem.str();
729
+
730
+ // Encode problem data types
731
+ std::stringstream ss_types;
732
+ EncodeTypes<
733
+ ElementA, LayoutA,
734
+ ElementB, LayoutB,
735
+ ElementC, LayoutC,
736
+ ElementAccumulator,
737
+ ElementCompute>(ss_types);
738
+ key.types = ss_types.str();
739
+
740
+ // Encode hash for problem data
741
+ CRC32 crc_hash;
742
+
743
+ key.A = TensorHash(A, crc_hash);
744
+ key.B = TensorHash(B, crc_hash);
745
+ key.C = TensorHash(C, crc_hash);
746
+
747
+ return key;
748
+ }
749
+
750
+ /////////////////////////////////////////////////////////////////////////////////////////////////
751
+
752
+ template <
753
+ typename ElementA, typename LayoutA,
754
+ typename ElementB, typename LayoutB,
755
+ typename ElementC, typename LayoutC,
756
+ typename ElementAccumulator,
757
+ typename ElementCompute
758
+ >
759
+ inline CachedTestKey CreateCachedConv2dWithReductionTestKey(
760
+
761
+ cutlass::conv::Operator conv_operator,
762
+ cutlass::conv::Conv2dProblemSize const &problem,
763
+ ElementCompute alpha,
764
+ ElementCompute beta,
765
+ cutlass::TensorView<ElementA, LayoutA> A,
766
+ cutlass::TensorView<ElementB, LayoutB> B,
767
+ cutlass::TensorView<ElementC, LayoutC> C
768
+ ) {
769
+
770
+ CachedTestKey key;
771
+
772
+ // Encode conv2d operator and problem sizes
773
+ key.op = "conv2d_with_reduction";
774
+
775
+ std::stringstream ss_problem;
776
+ ss_problem << EncodeOperator(conv_operator) << "_";
777
+ EncodeProblemSize(ss_problem, problem);
778
+ ss_problem << "_alpha" << EncodeScalar(alpha) << "_beta" << EncodeScalar(beta);
779
+
780
+ key.problem = ss_problem.str();
781
+
782
+ // Encode problem data types
783
+ std::stringstream ss_types;
784
+ EncodeTypes<
785
+ ElementA, LayoutA,
786
+ ElementB, LayoutB,
787
+ ElementC, LayoutC,
788
+ ElementAccumulator,
789
+ ElementCompute>(ss_types);
790
+ key.types = ss_types.str();
791
+
792
+ // Encode hash for problem data
793
+ CRC32 crc_hash;
794
+
795
+ key.A = TensorHash(A, crc_hash);
796
+ key.B = TensorHash(B, crc_hash);
797
+ key.C = TensorHash(C, crc_hash);
798
+
799
+ return key;
800
+ }
801
+
802
+ /////////////////////////////////////////////////////////////////////////////////////////////////
803
+
804
+ template <
805
+ typename ElementA, typename LayoutA,
806
+ typename ElementB, typename LayoutB,
807
+ typename ElementC, typename LayoutC,
808
+ typename ElementAccumulator,
809
+ typename ElementCompute
810
+ >
811
+ inline CachedTestKey CreateCachedConv3dTestKey(
812
+ cutlass::conv::Operator conv_operator,
813
+ cutlass::conv::Conv3dProblemSize const &problem,
814
+ ElementCompute alpha,
815
+ ElementCompute beta,
816
+ cutlass::TensorView<ElementA, LayoutA> A,
817
+ cutlass::TensorView<ElementB, LayoutB> B,
818
+ cutlass::TensorView<ElementC, LayoutC> C
819
+ ) {
820
+
821
+ CachedTestKey key;
822
+
823
+ // Encode conv3d operator and problem sizes
824
+ key.op = "conv3d";
825
+
826
+ std::stringstream ss_problem;
827
+
828
+ ss_problem << EncodeOperator(conv_operator) << "_";
829
+ EncodeProblemSize(ss_problem, problem);
830
+ ss_problem << "_alpha" << EncodeScalar(alpha) << "_beta" << EncodeScalar(beta);
831
+
832
+ key.problem = ss_problem.str();
833
+
834
+ // Encode problem data types
835
+ std::stringstream ss_types;
836
+ EncodeTypes<
837
+ ElementA, LayoutA,
838
+ ElementB, LayoutB,
839
+ ElementC, LayoutC,
840
+ ElementAccumulator,
841
+ ElementCompute>(ss_types);
842
+ key.types = ss_types.str();
843
+
844
+ // Encode problem data
845
+ CRC32 crc_hash;
846
+ key.A = TensorHash(A, crc_hash);
847
+ key.B = TensorHash(B, crc_hash);
848
+ key.C = TensorHash(C, crc_hash);
849
+
850
+ return key;
851
+ }
852
+
853
+ /////////////////////////////////////////////////////////////////////////////////////////////////
854
+
855
+ template <
856
+ class ProblemShape,
857
+ typename ElementA,
858
+ typename ElementB,
859
+ typename ElementC,
860
+ typename ElementD
861
+ >
862
+ inline CachedTestKey CreateCachedConvNd3xTestKey(
863
+ cutlass::conv::Operator conv_operator,
864
+ ProblemShape const& problem_shape,
865
+ double alpha,
866
+ double beta,
867
+ thrust::universal_vector<ElementA> A,
868
+ thrust::universal_vector<ElementB> B,
869
+ thrust::universal_vector<ElementC> C
870
+ ) {
871
+
872
+ CachedTestKey key;
873
+
874
+ // Encode convNd operator and problem sizes
875
+ std::stringstream ss_op;
876
+ ss_op << "conv" << ProblemShape::RankS << "d";
877
+ key.op = ss_op.str();
878
+
879
+ std::stringstream ss_problem;
880
+ ss_problem << EncodeOperator(conv_operator) << "_";
881
+ EncodeProblemSize(ss_problem, problem_shape);
882
+ ss_problem << "_alpha" << EncodeScalar(alpha) << "_beta" << EncodeScalar(beta);
883
+ key.problem = ss_problem.str();
884
+
885
+ // Encode problem data types
886
+ std::stringstream ss_types;
887
+ EncodeTypes<
888
+ ElementA,
889
+ ElementB,
890
+ ElementC,
891
+ ElementD>(ss_types);
892
+ key.types = ss_types.str();
893
+
894
+ // Encode problem data
895
+ CRC32 crc_hash;
896
+ key.A = TensorHash(A, crc_hash);
897
+ key.B = TensorHash(B, crc_hash);
898
+ key.C = TensorHash(C, crc_hash);
899
+
900
+ return key;
901
+ }
902
+
903
+ /////////////////////////////////////////////////////////////////////////////////////////////////
904
+
905
+ } // namespace test::conv::device
906
+
907
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_problems.h ADDED
@@ -0,0 +1,927 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM testbed sizes for Conv2d problem
33
+ */
34
+ #pragma once
35
+
36
+ #include <vector>
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/layout/matrix.h"
40
+ #include "cutlass/conv/convolution.h"
41
+ #include "cutlass/conv/conv2d_problem_size.h"
42
+
43
+ namespace test {
44
+ namespace conv {
45
+ namespace device {
46
+
47
+ using Conv2dProblemVector = std::vector<cutlass::conv::Conv2dProblemSize>;
48
+
49
+ //
50
+ // Structures to prune items from Conv2dProblemVector
51
+ //
52
+ // Specification template for pruning items for convolution problem lists
53
+ template <typename T> struct Specification
54
+ {
55
+ virtual ~Specification() = default;
56
+ virtual bool is_satisfied(T item) const = 0;
57
+ };
58
+
59
+ // input size (NHWC) specification
60
+ struct InputSizeSpecification : Specification<cutlass::conv::Conv2dProblemSize>
61
+ {
62
+ cutlass::Tensor4DCoord input_size;
63
+
64
+ InputSizeSpecification(cutlass::Tensor4DCoord input_size_) : input_size(input_size_) {}
65
+
66
+ bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override {
67
+ return ((input_size.n() == item.N) && (input_size.h() == item.H) && (input_size.w() == item.W) && (input_size.c() == item.C));
68
+ }
69
+ };
70
+
71
+ // stride (stride_h, stride_w) specification
72
+ struct StrideSpecification : Specification<cutlass::conv::Conv2dProblemSize>
73
+ {
74
+ cutlass::MatrixCoord stride;
75
+
76
+ StrideSpecification(cutlass::MatrixCoord stride_) : stride(stride_) {}
77
+
78
+ bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override {
79
+ return ((stride.row() == item.stride_h) && (stride.column() == item.stride_h));
80
+ }
81
+ };
82
+
83
+ // channel (C,K) specification, must be multiple of minimum channel
84
+ struct ChannelDivisibilitySpecification : Specification<cutlass::conv::Conv2dProblemSize>
85
+ {
86
+ int channel_multiple;
87
+
88
+ ChannelDivisibilitySpecification(int channel_multiple_) : channel_multiple(channel_multiple_) {}
89
+
90
+ bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override {
91
+ return ((item.K % channel_multiple == 0) && (item.C % channel_multiple == 0));
92
+ }
93
+ };
94
+
95
+ //
96
+ // Pruning function for items from Conv2dProblemVector based on a Specification
97
+ //
98
+ inline Conv2dProblemVector prune(Conv2dProblemVector const &items,
99
+ Specification<cutlass::conv::Conv2dProblemSize> const &spec)
100
+ {
101
+ Conv2dProblemVector pruned_list;
102
+
103
+ for (auto& p : items)
104
+ if (spec.is_satisfied(p))
105
+ pruned_list.push_back(p);
106
+ return pruned_list;
107
+ }
108
+
109
+
110
+ ////////////////////////////////////////////////////////////////////////////
111
+ /// Structure TestbedConv2dProblemSizes initializes and holds conv default and
112
+ /// important network sizes
113
+ ////////////////////////////////////////////////////////////////////////////
114
+ struct TestbedConv2dProblemSizes {
115
+
116
+ //
117
+ // Data members
118
+ //
119
+ int minimum_channel_size;
120
+
121
+ Conv2dProblemVector conv2d_default_sizes;
122
+ Conv2dProblemVector conv2d_rigorous_sizes;
123
+ Conv2dProblemVector conv2d_resnet50_sizes;
124
+ Conv2dProblemVector conv2d_resnet50_sizes_perf;
125
+
126
+ //
127
+ // Methods
128
+ //
129
+ /// Default ctor
130
+ TestbedConv2dProblemSizes(int minimum_channel_size_ = 64): minimum_channel_size (minimum_channel_size_) {
131
+ initialize_conv2d_default_sizes();
132
+ initialize_conv2d_rigorous_sizes();
133
+ initialize_conv2d_resnet50_sizes(conv2d_resnet50_sizes, 1 /*batch-size*/);
134
+
135
+ initialize_conv2d_resnet50_sizes(conv2d_resnet50_sizes_perf, 34 /*batch-size*/);
136
+ filter_all();
137
+ }
138
+
139
+ /// Eliminates some illegal cases
140
+ void filter_all() {
141
+
142
+ Conv2dProblemVector *problems_vectors[] = {
143
+ &conv2d_default_sizes,
144
+ &conv2d_rigorous_sizes,
145
+ &conv2d_resnet50_sizes,
146
+ &conv2d_resnet50_sizes_perf
147
+ };
148
+
149
+ for (Conv2dProblemVector *problems : problems_vectors) {
150
+ Conv2dProblemVector filtered;
151
+
152
+ for (cutlass::conv::Conv2dProblemSize const & problem : *problems) {
153
+ if (!(problem.C % minimum_channel_size)) {
154
+ filtered.push_back(problem);
155
+ }
156
+ }
157
+
158
+ *problems = filtered;
159
+ }
160
+ }
161
+
162
+ // Add a few standard convolution problem sizes
163
+ void initialize_conv2d_default_sizes() {
164
+
165
+ ////////////////////////////////////////////////////////////////////////////////////////////
166
+ // Small input size x stride (1,1)
167
+ // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
168
+ ////////////////////////////////////////////////////////////////////////////////////////////
169
+
170
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
171
+ {1, 1, 1, minimum_channel_size}, // input size (NHWC)
172
+ {8, 1, 1, minimum_channel_size}, // filter size (KRSC)
173
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
174
+ {1, 1}, // stride (stride_h, stride_w)
175
+ {1, 1} // dilation (dilation_h, dilation_w)
176
+ ));
177
+
178
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
179
+ {1, 1, 8, minimum_channel_size}, // input size (NHWC)
180
+ {8, 1, 3, minimum_channel_size}, // filter size (KRSC)
181
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
182
+ {1, 1}, // stride (stride_h, stride_w)
183
+ {1, 1} // dilation (dilation_h, dilation_w)
184
+ ));
185
+
186
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
187
+ {1, 7, 8, minimum_channel_size}, // input size (NHWC)
188
+ {8, 3, 3, minimum_channel_size}, // filter size (KRSC)
189
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
190
+ {1, 1}, // stride (stride_h, stride_w)
191
+ {1, 1} // dilation (dilation_h, dilation_w)
192
+ ));
193
+
194
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
195
+ {1, 7, 9, minimum_channel_size}, // input size (NHWC)
196
+ {8, 4, 4, minimum_channel_size}, // filter size (KRSC)
197
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
198
+ {1, 1}, // stride (stride_h, stride_w)
199
+ {1, 1} // dilation (dilation_h, dilation_w)
200
+ ));
201
+
202
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
203
+ {2, 7, 9, minimum_channel_size}, // input size (NHWC)
204
+ {8, 5, 5, minimum_channel_size}, // filter size (KRSC)
205
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
206
+ {1, 1}, // stride (stride_h, stride_w)
207
+ {1, 1} // dilation (dilation_h, dilation_w)
208
+ ));
209
+
210
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
211
+ {3, 7, 9, minimum_channel_size}, // input size (NHWC)
212
+ {8, 6, 5, minimum_channel_size}, // filter size (KRSC)
213
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
214
+ {1, 1}, // stride (stride_h, stride_w)
215
+ {1, 1} // dilation (dilation_h, dilation_w)
216
+ ));
217
+
218
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
219
+ {3, 7, 9, minimum_channel_size}, // input size (NHWC)
220
+ {8, 6, 6, minimum_channel_size}, // filter size (KRSC)
221
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
222
+ {1, 1}, // stride (stride_h, stride_w)
223
+ {1, 1} // dilation (dilation_h, dilation_w)
224
+ ));
225
+
226
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
227
+ {3, 7, 9, minimum_channel_size}, // input size (NHWC)
228
+ {8, 7, 7, minimum_channel_size}, // filter size (KRSC)
229
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
230
+ {1, 1}, // stride (stride_h, stride_w)
231
+ {1, 1} // dilation (dilation_h, dilation_w)
232
+ ));
233
+
234
+ ////////////////////////////////////////////////////////////////////////////////////////////
235
+ // Small input size x stride (1,1) asymmetric paddings (1, 0, 1, 0)
236
+ // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
237
+ ////////////////////////////////////////////////////////////////////////////////////////////
238
+
239
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
240
+ {1, 1, 1, minimum_channel_size}, // input size (NHWC)
241
+ {8, 1, 1, minimum_channel_size}, // filter size (KRSC)
242
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
243
+ {1, 1}, // stride (stride_h, stride_w)
244
+ {1, 1} // dilation (dilation_h, dilation_w)
245
+ ));
246
+
247
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
248
+ {1, 1, 8, minimum_channel_size}, // input size (NHWC)
249
+ {8, 1, 3, minimum_channel_size}, // filter size (KRSC)
250
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
251
+ {1, 1}, // stride (stride_h, stride_w)
252
+ {1, 1} // dilation (dilation_h, dilation_w)
253
+ ));
254
+
255
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
256
+ {1, 7, 8, minimum_channel_size}, // input size (NHWC)
257
+ {8, 3, 3, minimum_channel_size}, // filter size (KRSC)
258
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
259
+ {1, 1}, // stride (stride_h, stride_w)
260
+ {1, 1} // dilation (dilation_h, dilation_w)
261
+ ));
262
+
263
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
264
+ {1, 7, 9, minimum_channel_size}, // input size (NHWC)
265
+ {8, 4, 4, minimum_channel_size}, // filter size (KRSC)
266
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
267
+ {1, 1}, // stride (stride_h, stride_w)
268
+ {1, 1} // dilation (dilation_h, dilation_w)
269
+ ));
270
+
271
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
272
+ {2, 7, 9, minimum_channel_size}, // input size (NHWC)
273
+ {8, 5, 5, minimum_channel_size}, // filter size (KRSC)
274
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
275
+ {1, 1}, // stride (stride_h, stride_w)
276
+ {1, 1} // dilation (dilation_h, dilation_w)
277
+ ));
278
+
279
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
280
+ {3, 7, 9, minimum_channel_size}, // input size (NHWC)
281
+ {8, 6, 5, minimum_channel_size}, // filter size (KRSC)
282
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
283
+ {1, 1}, // stride (stride_h, stride_w)
284
+ {1, 1} // dilation (dilation_h, dilation_w)
285
+ ));
286
+
287
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
288
+ {3, 7, 9, minimum_channel_size}, // input size (NHWC)
289
+ {8, 6, 6, minimum_channel_size}, // filter size (KRSC)
290
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
291
+ {1, 1}, // stride (stride_h, stride_w)
292
+ {1, 1} // dilation (dilation_h, dilation_w)
293
+ ));
294
+
295
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
296
+ {3, 7, 9, minimum_channel_size}, // input size (NHWC)
297
+ {8, 7, 7, minimum_channel_size}, // filter size (KRSC)
298
+ {1, 0, 1, 0}, // padding (pad_h, _, pad_w, _)
299
+ {1, 1}, // stride (stride_h, stride_w)
300
+ {1, 1} // dilation (dilation_h, dilation_w)
301
+ ));
302
+
303
+ ////////////////////////////////////////////////////////////////////////////////////////////
304
+ // Small input size x stride (2,2)
305
+ // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
306
+ ////////////////////////////////////////////////////////////////////////////////////////////
307
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
308
+ {1, 11, 7, minimum_channel_size}, // input size (NHWC)
309
+ {8, 1, 1, minimum_channel_size}, // filter size (KRSC)
310
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
311
+ {2, 2}, // stride (stride_h, stride_w)
312
+ {1, 1} // dilation (dilation_h, dilation_w)
313
+ ));
314
+
315
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
316
+ {1, 11, 7, minimum_channel_size}, // input size (NHWC)
317
+ {8, 3, 3, minimum_channel_size}, // filter size (KRSC)
318
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
319
+ {2, 2}, // stride (stride_h, stride_w)
320
+ {1, 1} // dilation (dilation_h, dilation_w)
321
+ ));
322
+
323
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
324
+ {1, 13, 11, minimum_channel_size}, // input size (NHWC)
325
+ {8, 1, 1, minimum_channel_size}, // filter size (KRSC)
326
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
327
+ {2, 2}, // stride (stride_h, stride_w)
328
+ {1, 1} // dilation (dilation_h, dilation_w)
329
+ ));
330
+
331
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
332
+ {1, 17, 19, minimum_channel_size}, // input size (NHWC)
333
+ {16, 2, 2, minimum_channel_size}, // filter size (KRSC)
334
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
335
+ {2, 2}, // stride (stride_h, stride_w)
336
+ {1, 1} // dilation (dilation_h, dilation_w)
337
+ ));
338
+
339
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
340
+ {1, 23, 5, minimum_channel_size}, // input size (NHWC)
341
+ {16, 3, 3, minimum_channel_size}, // filter size (KRSC)
342
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
343
+ {2, 2}, // stride (stride_h, stride_w)
344
+ {1, 1} // dilation (dilation_h, dilation_w)
345
+ ));
346
+
347
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
348
+ {1, 13, 17, 8}, // input size (NHWC)
349
+ {24, 3, 3, 8}, // filter size (KRSC)
350
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
351
+ {2, 2}, // stride (stride_h, stride_w)
352
+ {1, 1} // dilation (dilation_h, dilation_w)
353
+ ));
354
+
355
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
356
+ {1, 23, 21, 8}, // input size (NHWC)
357
+ {24, 3, 3, 8}, // filter size (KRSC)
358
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
359
+ {3, 3}, // stride (stride_h, stride_w)
360
+ {1, 1} // dilation (dilation_h, dilation_w)
361
+ ));
362
+
363
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
364
+ {1, 20, 24, 8}, // input size (NHWC)
365
+ {40, 3, 3, 8}, // filter size (KRSC)
366
+ {3, 3, 3, 3}, // padding (pad_h, _, pad_w, _)
367
+ {3, 3}, // stride (stride_h, stride_w)
368
+ {1, 1} // dilation (dilation_h, dilation_w)
369
+ ));
370
+
371
+ ////////////////////////////////////////////////////////////////////////////////////
372
+ // Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1)
373
+ ////////////////////////////////////////////////////////////////////////////////////
374
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
375
+ {1, 15, 19, 160}, // input size (NHWC)
376
+ {224, 1, 1, 160}, // filter size (KRSC)
377
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
378
+ {1, 1}, // stride (stride_h, stride_w)
379
+ {1, 1} // dilation (dilation_h, dilation_w)
380
+ ));
381
+
382
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
383
+ {1, 19, 37, 160}, // input size (NHWC)
384
+ {224, 3, 3, 160}, // filter size (KRSC)
385
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
386
+ {2, 2}, // stride (stride_h, stride_w)
387
+ {1, 1} // dilation (dilation_h, dilation_w)
388
+ ));
389
+
390
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
391
+ {1, 16, 16, 160}, // input size (NHWC)
392
+ {224, 2, 3, 160}, // filter size (KRSC)
393
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
394
+ {1, 1}, // stride (stride_h, stride_w)
395
+ {1, 1} // dilation (dilation_h, dilation_w)
396
+ ));
397
+
398
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
399
+ {1, 23, 21, 128}, // input size (NHWC)
400
+ {224, 3, 3, 128}, // filter size (KRSC)
401
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
402
+ {1, 1}, // stride (stride_h, stride_w)
403
+ {1, 1} // dilation (dilation_h, dilation_w)
404
+ ));
405
+
406
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
407
+ {1, 29, 37, 160}, // input size (NHWC)
408
+ {224, 5, 5, 160}, // filter size (KRSC)
409
+ {2, 2, 2, 2}, // padding (pad_h, _, pad_w, _)
410
+ {1, 1}, // stride (stride_h, stride_w)
411
+ {1, 1} // dilation (dilation_h, dilation_w)
412
+ ));
413
+
414
+ ////////////////////////////////////////////////////////////////////////////////////
415
+ // C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
416
+ ////////////////////////////////////////////////////////////////////////////////////
417
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
418
+ {1, 15, 19, 32 + minimum_channel_size}, // input size (NHWC)
419
+ {96, 3, 3, 32 + minimum_channel_size}, // filter size (KRSC)
420
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
421
+ {1, 1}, // stride (stride_h, stride_w)
422
+ {1, 1} // dilation (dilation_h, dilation_w)
423
+ ));
424
+
425
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
426
+ {1, 16, 24, 64 + minimum_channel_size}, // input size (NHWC)
427
+ {96, 3, 3, 64 + minimum_channel_size}, // filter size (KRSC)
428
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
429
+ {1, 1}, // stride (stride_h, stride_w)
430
+ {1, 1} // dilation (dilation_h, dilation_w)
431
+ ));
432
+
433
+ ////////////////////////////////////////////////////////////////////////////////////
434
+ // Medium input size, filter size (1x1, 3,x3, 5x5, 7x7), stride (2, 2)
435
+ ////////////////////////////////////////////////////////////////////////////////////
436
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
437
+ {1, 13, 16, 288}, // input size (NHWC)
438
+ {160, 5, 5, 288}, // filter size (KRSC)
439
+ {2, 2, 2, 2}, // padding (pad_h, _, pad_w, _)
440
+ {2, 2}, // stride (stride_h, stride_w)
441
+ {1, 1} // dilation (dilation_h, dilation_w)
442
+ ));
443
+
444
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
445
+ {1, 55, 51, 256}, // input size (NHWC)
446
+ {512, 1, 1, 256}, // filter size (KRSC)
447
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
448
+ {2, 2}, // stride (stride_h, stride_w)
449
+ {1, 1} // dilation (dilation_h, dilation_w)
450
+ ));
451
+
452
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
453
+ {1, 71, 80, 32}, // input size (NHWC)
454
+ {64, 5, 5, 32}, // filter size (KRSC)
455
+ {2, 2, 2, 2}, // padding (pad_h, _, pad_w, _)
456
+ {2, 2}, // stride (stride_h, stride_w)
457
+ {1, 1} // dilation (dilation_h, dilation_w)
458
+ ));
459
+
460
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
461
+ {1, 224, 224, 8}, // input size (NHWC)
462
+ {64, 7, 7, 8}, // filter size (KRSC)
463
+ {3, 3, 3, 3}, // padding (pad_h, _, pad_w, _)
464
+ {2, 2}, // stride (stride_h, stride_w)
465
+ {1, 1} // dilation (dilation_h, dilation_w)
466
+ ));
467
+
468
+ ////////////////////////////////////////////////////////////////////////////////////
469
+ // Medium input size stride (3, 3), filter (3, 3), non-default padding
470
+ ////////////////////////////////////////////////////////////////////////////////////
471
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
472
+ {1, 27, 23, 256}, // input size (NHWC)
473
+ {512, 3, 3, 256}, // filter size (KRSC)
474
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
475
+ {3, 3}, // stride (stride_h, stride_w)
476
+ {1, 1} // dilation (dilation_h, dilation_w)
477
+ ));
478
+
479
+ ////////////////////////////////////////////////////////////////////////////////////
480
+ // Medium input size padding > stride, asymmetric filter, padding and striding
481
+ ////////////////////////////////////////////////////////////////////////////////////
482
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
483
+ {1, 27, 31, 256}, // input size (NHWC)
484
+ {512, 3, 3, 256}, // filter size (KRSC)
485
+ {5, 5, 7, 7}, // padding (pad_h, _, pad_w, _)
486
+ {3, 4}, // stride (stride_h, stride_w)
487
+ {1, 1} // dilation (dilation_h, dilation_w)
488
+ ));
489
+
490
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
491
+ {1, 27, 35, 256}, // input size (NHWC)
492
+ {512, 7, 5, 256}, // filter size (KRSC)
493
+ {11, 11, 7, 7}, // padding (pad_h, _, pad_w, _)
494
+ {3, 5}, // stride (stride_h, stride_w)
495
+ {1, 1} // dilation (dilation_h, dilation_w)
496
+ ));
497
+
498
+ ////////////////////////////////////////////////////////////////////////////////////
499
+ // Medium input size *mixed* stride (1, 2) and (2, 1),
500
+ // filter (3, 3), default padding
501
+ ////////////////////////////////////////////////////////////////////////////////////
502
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
503
+ {1, 27, 27, 256}, // input size (NHWC)
504
+ {512, 3, 3, 256}, // filter size (KRSC)
505
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
506
+ {1, 2}, // stride (stride_h, stride_w)
507
+ {1, 1} // dilation (dilation_h, dilation_w)
508
+ ));
509
+
510
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
511
+ {1, 27, 27, 256}, // input size (NHWC)
512
+ {512, 3, 3, 256}, // filter size (KRSC)
513
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
514
+ {2, 1}, // stride (stride_h, stride_w)
515
+ {1, 1} // dilation (dilation_h, dilation_w)
516
+ ));
517
+
518
+ /////////////////////////////////////////////////////////////////////////////
519
+ // Additional input size
520
+ /////////////////////////////////////////////////////////////////////////////
521
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
522
+ {3, 28, 28, 256}, // input size (NHWC)
523
+ {256, 2, 2, 256}, // filter size (KRSC)
524
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
525
+ {2, 2}, // stride (stride_h, stride_w)
526
+ {1, 1} // dilation (dilation_h, dilation_w)
527
+ ));
528
+
529
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
530
+ {1, 32, 32, 16}, // input size (NHWC)
531
+ {32, 3, 3, 16}, // filter size (KRSC)
532
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
533
+ {6, 2}, // stride (stride_h, stride_w)
534
+ {1, 1} // dilation (dilation_h, dilation_w)
535
+ ));
536
+
537
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
538
+ {32, 24, 32, 32}, // input size (NHWC)
539
+ {32, 1, 2, 32}, // filter size (KRSC)
540
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
541
+ {1, 1}, // stride (stride_h, stride_w)
542
+ {1, 1} // dilation (dilation_h, dilation_w)
543
+ ));
544
+
545
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
546
+ {4, 4, 5, 128}, // input size (NHWC)
547
+ {256, 3, 6, 128}, // filter size (KRSC)
548
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
549
+ {1, 1}, // stride (stride_h, stride_w)
550
+ {1, 1}, // dilation (dilation_h, dilation_w)
551
+ {4, 3, 3, 256} // output size (NPQK)
552
+ ));
553
+
554
+ conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize(
555
+ {4, 2, 3, 256}, // input size (NHWC)
556
+ {328, 3, 5, 256}, // filter size (KRSC)
557
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
558
+ {1, 1}, // stride (stride_h, stride_w)
559
+ {1, 1}, // dilation (dilation_h, dilation_w)
560
+ {4, 1, 1, 328} // output size (NPQK)
561
+ ));
562
+ }
563
+
564
+
565
+ // Add a few large and rigorous convolution problem sizes
566
+ void initialize_conv2d_rigorous_sizes() {
567
+
568
+ #if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
569
+ conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize(
570
+ {1, 124, 224, 96}, // input size (NHWC)
571
+ {24, 7, 7, 96}, // filter size (KRSC)
572
+ {1, 229, 129, 32} // output size (NPQK)
573
+ ));
574
+
575
+ conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize(
576
+ {1, 233, 35, 48}, // input size (NHWC)
577
+ {24, 7, 5, 48}, // filter size (KRSC)
578
+ {1, 233, 35, 24} // output size (NPQK)
579
+ ));
580
+
581
+ #endif
582
+
583
+ }
584
+
585
+
586
+ // Add resent50 layers to unit testing sizes
587
+ void initialize_conv2d_resnet50_sizes(Conv2dProblemVector &conv2d_problem_vector, int batch_size = 1){
588
+
589
+ #if 0 // Resnet50 first layer (layer_id = 0) with channel = 3 is not supported in cutlass
590
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
591
+ [1, 224, 224, 3], // input size (NHWC)
592
+ [64, 7, 7, 3], // filter size (KRSC)
593
+ [3, 3, 3, 3], // padding (pad_h, _, pad_w, _)
594
+ [2, 2], // stride (stride_h, stride_w)
595
+ [1, 1], // dilation (dilation_h, dilation_w)
596
+ ));
597
+ #endif
598
+
599
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
600
+ {batch_size, 56, 56, 64}, // input size (NHWC)
601
+ {256, 1, 1, 64}, // filter size (KRSC)
602
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
603
+ {1, 1}, // stride (stride_h, stride_w)
604
+ {1, 1} // dilation (dilation_h, dilation_w)
605
+ ));
606
+
607
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
608
+ {batch_size, 56, 56, 64}, // input size (NHWC)
609
+ {64, 1, 1, 64}, // filter size (KRSC)
610
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
611
+ {1, 1}, // stride (stride_h, stride_w)
612
+ {1, 1} // dilation (dilation_h, dilation_w)
613
+ ));
614
+
615
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
616
+ {batch_size, 56, 56, 64}, // input size (NHWC)
617
+ {64, 3, 3, 64}, // filter size (KRSC)
618
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
619
+ {1, 1}, // stride (stride_h, stride_w)
620
+ {1, 1} // dilation (dilation_h, dilation_w)
621
+ ));
622
+
623
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
624
+ {batch_size, 56, 56, 256}, // input size (NHWC)
625
+ {64, 1, 1, 256}, // filter size (KRSC)
626
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
627
+ {1, 1}, // stride (stride_h, stride_w)
628
+ {1, 1} // dilation (dilation_h, dilation_w)
629
+ ));
630
+
631
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
632
+ {batch_size, 56, 56, 256}, // input size (NHWC)
633
+ {512, 1, 1, 256}, // filter size (KRSC)
634
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
635
+ {2, 2}, // stride (stride_h, stride_w)
636
+ {1, 1} // dilation (dilation_h, dilation_w)
637
+ ));
638
+
639
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
640
+ {batch_size, 56, 56, 256}, // input size (NHWC)
641
+ {128, 1, 1, 256}, // filter size (KRSC)
642
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
643
+ {2, 2}, // stride (stride_h, stride_w)
644
+ {1, 1} // dilation (dilation_h, dilation_w)
645
+ ));
646
+
647
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
648
+ {batch_size, 28, 28, 128}, // input size (NHWC)
649
+ {128, 3, 3, 128}, // filter size (KRSC)
650
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
651
+ {1, 1}, // stride (stride_h, stride_w)
652
+ {1, 1} // dilation (dilation_h, dilation_w)
653
+ ));
654
+
655
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
656
+ {batch_size, 28, 28, 128}, // input size (NHWC)
657
+ {512, 1, 1, 128}, // filter size (KRSC)
658
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
659
+ {1, 1}, // stride (stride_h, stride_w)
660
+ {1, 1} // dilation (dilation_h, dilation_w)
661
+ ));
662
+
663
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
664
+ {batch_size, 28, 28, 512}, // input size (NHWC)
665
+ {128, 1, 1, 512}, // filter size (KRSC)
666
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
667
+ {1, 1}, // stride (stride_h, stride_w)
668
+ {1, 1} // dilation (dilation_h, dilation_w)
669
+ ));
670
+
671
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
672
+ {batch_size, 28, 28, 512}, // input size (NHWC)
673
+ {1024, 1, 1, 512}, // filter size (KRSC)
674
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
675
+ {2, 2}, // stride (stride_h, stride_w)
676
+ {1, 1} // dilation (dilation_h, dilation_w)
677
+ ));
678
+
679
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
680
+ {batch_size, 28, 28, 512}, // input size (NHWC)
681
+ {256, 1, 1, 512}, // filter size (KRSC)
682
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
683
+ {2, 2}, // stride (stride_h, stride_w)
684
+ {1, 1} // dilation (dilation_h, dilation_w)
685
+ ));
686
+
687
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
688
+ {batch_size, 14, 14, 256}, // input size (NHWC)
689
+ {256, 3, 3, 256}, // filter size (KRSC)
690
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
691
+ {1, 1}, // stride (stride_h, stride_w)
692
+ {1, 1} // dilation (dilation_h, dilation_w)
693
+ ));
694
+
695
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
696
+ {batch_size, 14, 14, 256}, // input size (NHWC)
697
+ {1024, 1, 1, 256}, // filter size (KRSC)
698
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
699
+ {1, 1}, // stride (stride_h, stride_w)
700
+ {1, 1} // dilation (dilation_h, dilation_w)
701
+ ));
702
+
703
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
704
+ {batch_size, 14, 14, 1024}, // input size (NHWC)
705
+ {256, 1, 1, 1024}, // filter size (KRSC)
706
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
707
+ {1, 1}, // stride (stride_h, stride_w)
708
+ {1, 1} // dilation (dilation_h, dilation_w)
709
+ ));
710
+
711
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
712
+ {batch_size, 14, 14, 1024}, // input size (NHWC)
713
+ {2048, 1, 1, 1024}, // filter size (KRSC)
714
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
715
+ {2, 2}, // stride (stride_h, stride_w)
716
+ {1, 1} // dilation (dilation_h, dilation_w)
717
+ ));
718
+
719
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
720
+ {batch_size, 14, 14, 1024}, // input size (NHWC)
721
+ {512, 1, 1, 1024}, // filter size (KRSC)
722
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
723
+ {2, 2}, // stride (stride_h, stride_w)
724
+ {1, 1} // dilation (dilation_h, dilation_w)
725
+ ));
726
+
727
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
728
+ {batch_size, 7, 7, 512}, // input size (NHWC)
729
+ {512, 3, 3, 512}, // filter size (KRSC)
730
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
731
+ {1, 1}, // stride (stride_h, stride_w)
732
+ {1, 1} // dilation (dilation_h, dilation_w)
733
+ ));
734
+
735
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
736
+ {batch_size, 7, 7, 512}, // input size (NHWC)
737
+ {2048, 1, 1, 512}, // filter size (KRSC)
738
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
739
+ {1, 1}, // stride (stride_h, stride_w)
740
+ {1, 1} // dilation (dilation_h, dilation_w)
741
+ ));
742
+
743
+ conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize(
744
+ {batch_size, 7, 7, 2048}, // input size (NHWC)
745
+ {512, 1, 1, 2048}, // filter size (KRSC)
746
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
747
+ {1, 1}, // stride (stride_h, stride_w)
748
+ {1, 1} // dilation (dilation_h, dilation_w)
749
+ ));
750
+ }
751
+
752
+ };
753
+
754
+
755
+ ////////////////////////////////////////////////////////////////////////////
756
+ /// Structure TestbedGroupConv2dProblemSizes initializes and holds group conv default and
757
+ /// important network sizes
758
+ ////////////////////////////////////////////////////////////////////////////
759
+ struct TestbedGroupConv2dProblemSizes {
760
+
761
+ //
762
+ // Data members
763
+ //
764
+ int threadblock_n;
765
+ int threadblock_k;
766
+ int minimum_channel_size;
767
+
768
+ Conv2dProblemVector default_single_group_sizes;
769
+ Conv2dProblemVector default_multiple_group_sizes;
770
+
771
+ //
772
+ // Methods
773
+ //
774
+ /// Default ctor
775
+ TestbedGroupConv2dProblemSizes(
776
+ int threadblock_n_,
777
+ int threadblock_k_,
778
+ int minimum_channel_size_ = 64)
779
+ : threadblock_n (threadblock_n_),
780
+ threadblock_k (threadblock_k_),
781
+ minimum_channel_size (minimum_channel_size_) {
782
+ initialize_group_conv2d_default_sizes();
783
+ filter_all();
784
+ }
785
+
786
+ /// Eliminates some illegal cases
787
+ void filter_all() {
788
+
789
+ Conv2dProblemVector *problems_vectors[] = {
790
+ &default_single_group_sizes,
791
+ &default_multiple_group_sizes
792
+ };
793
+
794
+ for (Conv2dProblemVector *problems : problems_vectors) {
795
+ Conv2dProblemVector filtered;
796
+
797
+ for (cutlass::conv::Conv2dProblemSize const & problem : *problems) {
798
+ if (!((problem.C / problem.groups) % minimum_channel_size)) {
799
+ filtered.push_back(problem);
800
+ }
801
+ }
802
+
803
+ *problems = filtered;
804
+ }
805
+ }
806
+
807
+ // Add a few standard convolution problem sizes
808
+ void initialize_group_conv2d_default_sizes() {
809
+
810
+ ////////////////////////////////////////////////////////////////////////////////////
811
+ // One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
812
+ // One CTA calculates a single group
813
+ ////////////////////////////////////////////////////////////////////////////////////
814
+
815
+ for (int cta_per_group_k = 1; cta_per_group_k < 4; ++cta_per_group_k) {
816
+ // groups = 2, 3, 4
817
+ for (int groups = 2; groups < 5; ++groups) {
818
+
819
+ int conv_k = cta_per_group_k * threadblock_n * groups;
820
+ default_single_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
821
+ {1, 8, 8, threadblock_k * 2 * groups}, // input size (NHWC)
822
+ {conv_k, 3, 3, threadblock_k * 2}, // filter size (KRSC)
823
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
824
+ {1, 1}, // stride (stride_h, stride_w)
825
+ {1, 1}, // dilation (dilation_h, dilation_w)
826
+ cutlass::conv::Mode::kCrossCorrelation,
827
+ 1, // split_k_slices
828
+ groups // groups
829
+ ));
830
+
831
+ } // loop groups
832
+ } // loop cta_per_group_k
833
+
834
+ // Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
835
+ default_single_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
836
+ {1, 8, 8, threadblock_k}, // input size (NHWC)
837
+ {threadblock_n * 2, 3, 3, threadblock_k / 2}, // filter size (KRSC)
838
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
839
+ {1, 1}, // stride (stride_h, stride_w)
840
+ {1, 1}, // dilation (dilation_h, dilation_w)
841
+ cutlass::conv::Mode::kCrossCorrelation,
842
+ 1, // split_k_slices
843
+ 2 // groups
844
+ ));
845
+
846
+ // Larger problem sizes
847
+
848
+ default_single_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
849
+ {1, 56, 56, 696}, // input size (NHWC)
850
+ {768, 3, 3, 232}, // filter size (KRSC)
851
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
852
+ {2, 2}, // stride (stride_h, stride_w)
853
+ {1, 1}, // dilation (dilation_h, dilation_w)
854
+ cutlass::conv::Mode::kCrossCorrelation,
855
+ 1, // split_k_slices
856
+ 3 // groups
857
+ ));
858
+ default_single_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
859
+ {1, 14, 14, 1392}, // input size (NHWC)
860
+ {1536, 3, 3, 232}, // filter size (KRSC)
861
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
862
+ {1, 1}, // stride (stride_h, stride_w)
863
+ {1, 1}, // dilation (dilation_h, dilation_w)
864
+ cutlass::conv::Mode::kCrossCorrelation,
865
+ 1, // split_k_slices
866
+ 3 // groups
867
+ ));
868
+
869
+ ////////////////////////////////////////////////////////////////////////////////////
870
+ // One CTA calculate multiple groups: CTA::N % k_per_group = 0
871
+ ////////////////////////////////////////////////////////////////////////////////////
872
+
873
+ // 2 groups per CTA
874
+ default_multiple_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
875
+ {1, 8, 8, threadblock_k * 4}, // input size (NHWC)
876
+ {threadblock_n, 3, 3, threadblock_k * 2}, // filter size (KRSC)
877
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
878
+ {1, 1}, // stride (stride_h, stride_w)
879
+ {1, 1}, // dilation (dilation_h, dilation_w)
880
+ cutlass::conv::Mode::kCrossCorrelation,
881
+ 1, // split_k_slices
882
+ 2 // groups
883
+ ));
884
+
885
+ // 2 groups per CTA and partial gemm_k
886
+ default_multiple_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
887
+ {1, 8, 8, threadblock_k}, // input size (NHWC)
888
+ {threadblock_n, 3, 3, threadblock_k / 2}, // filter size (KRSC)
889
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
890
+ {1, 1}, // stride (stride_h, stride_w)
891
+ {1, 1}, // dilation (dilation_h, dilation_w)
892
+ cutlass::conv::Mode::kCrossCorrelation,
893
+ 1, // split_k_slices
894
+ 2 // groups
895
+ ));
896
+
897
+ // 4 groups per CTA
898
+ default_multiple_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
899
+ {1, 8, 8, threadblock_k * 8}, // input size (NHWC)
900
+ {threadblock_n / 2, 3, 3, threadblock_k * 2}, // filter size (KRSC)
901
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
902
+ {1, 1}, // stride (stride_h, stride_w)
903
+ {1, 1}, // dilation (dilation_h, dilation_w)
904
+ cutlass::conv::Mode::kCrossCorrelation,
905
+ 1, // split_k_slices
906
+ 4 // groups
907
+ ));
908
+
909
+ // 4 groups per CTA and partial gemm_k
910
+ default_multiple_group_sizes.push_back(cutlass::conv::Conv2dProblemSize(
911
+ {1, 8, 8, threadblock_k * 2}, // input size (NHWC)
912
+ {threadblock_n / 2, 3, 3, threadblock_k / 2}, // filter size (KRSC)
913
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
914
+ {1, 1}, // stride (stride_h, stride_w)
915
+ {1, 1}, // dilation (dilation_h, dilation_w)
916
+ cutlass::conv::Mode::kCrossCorrelation,
917
+ 1, // split_k_slices
918
+ 4 // groups
919
+ ));
920
+ }
921
+
922
+ };
923
+
924
+
925
+ } // namespace device
926
+ } // namespace conv
927
+ } // namespace test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_testbed.h ADDED
@@ -0,0 +1,818 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM testbed
33
+ */
34
+ #pragma once
35
+
36
+ #include <fstream>
37
+
38
+ #include "../../common/cutlass_unit_test.h"
39
+ #include "cutlass/cutlass.h"
40
+
41
+ #include "cutlass/conv/device/implicit_gemm_convolution.h"
42
+ #include "cutlass/reduction/device/reduce_split_k.h"
43
+ #include "cutlass/reduction/thread/reduction_operators.h"
44
+
45
+ #include "conv2d_problems.h"
46
+
47
+ #include "cutlass/util/host_tensor.h"
48
+ #include "cutlass/util/reference/host/tensor_fill.h"
49
+ #include "cutlass/util/reference/device/tensor_compare.h"
50
+ #include "cutlass/util/reference/host/tensor_compare.h"
51
+
52
+ #include "cutlass/util/reference/host/convolution.h"
53
+ #include "cutlass/util/reference/device/convolution.h"
54
+
55
+ #include "cutlass/core_io.h"
56
+ #include "cutlass/util/tensor_view_io.h"
57
+
58
+ #include "../cache_testbed_output.h"
59
+
60
+ namespace test {
61
+ namespace conv {
62
+ namespace device {
63
+
64
+ template <typename Conv2d>
65
+ class TestbedConv2d {
66
+ public:
67
+
68
+ using ElementA = typename Conv2d::ElementA;
69
+ using LayoutA = typename Conv2d::LayoutA;
70
+ using ElementB = typename Conv2d::ElementB;
71
+ using LayoutB = typename Conv2d::LayoutB;
72
+ using ElementC = typename Conv2d::ElementC;
73
+ using LayoutC = typename Conv2d::LayoutC;
74
+ using ElementAccumulator = typename Conv2d::ElementAccumulator;
75
+ using ElementCompute = typename Conv2d::ElementCompute;
76
+ using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp;
77
+
78
+ static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
79
+
80
+ /// Reduction kernel
81
+ using ReductionOp = cutlass::reduction::thread::ReduceAdd<
82
+ ElementAccumulator,
83
+ typename EpilogueOutputOp::ElementAccumulator,
84
+ EpilogueOutputOp::kCount
85
+ >;
86
+
87
+ using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK<
88
+ cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>,
89
+ EpilogueOutputOp,
90
+ ReductionOp
91
+ >;
92
+
93
+ using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel>;
94
+ using ReductionStrideIndex = typename ReductionDevice::StrideIndex;
95
+
96
+ public:
97
+
98
+ /// Initialization
99
+ cutlass::Distribution::Kind init_A;
100
+ cutlass::Distribution::Kind init_B;
101
+ cutlass::Distribution::Kind init_C;
102
+ uint64_t seed;
103
+
104
+ cutlass::HostTensor<ElementA, LayoutA> tensor_A;
105
+ cutlass::HostTensor<ElementB, LayoutB> tensor_B;
106
+ cutlass::HostTensor<ElementC, LayoutC> tensor_C;
107
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
108
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
109
+
110
+ int tested_problem_count;
111
+
112
+ public:
113
+
114
+ TestbedConv2d(
115
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
116
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
117
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
118
+ uint64_t seed_ = 2080
119
+ ):
120
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_), tested_problem_count(0) {
121
+
122
+ }
123
+
124
+ /// Helper to initialize a tensor view
125
+ template <typename Element, typename Layout>
126
+ void initialize_tensor(
127
+ cutlass::TensorView<Element, Layout> view,
128
+ cutlass::Distribution::Kind dist_kind,
129
+ uint64_t seed) {
130
+
131
+ if (dist_kind == cutlass::Distribution::Uniform) {
132
+
133
+ int scope;
134
+ int bits = cutlass::sizeof_bits<Element>::value;
135
+
136
+ if (bits <= 8) {
137
+ scope = 2;
138
+ }
139
+ else if (bits == 16) {
140
+ if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
141
+ scope = 3;
142
+ }
143
+ else {
144
+ scope = 5;
145
+ }
146
+ }
147
+ else {
148
+ scope = 8;
149
+ }
150
+ cutlass::reference::host::TensorFillRandomUniform(
151
+ view, seed, scope, -scope, 0);
152
+ }
153
+ else if (dist_kind == cutlass::Distribution::Identity) {
154
+
155
+ cutlass::reference::host::TensorFillIdentity(view);
156
+ }
157
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
158
+
159
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
160
+ }
161
+ else if (dist_kind == cutlass::Distribution::Sequential) {
162
+
163
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
164
+ }
165
+ else {
166
+ }
167
+ }
168
+
169
+ void initialize(
170
+ cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) {
171
+
172
+ tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
173
+ tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
174
+ tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
175
+ tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
176
+ tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
177
+
178
+ initialize_tensor(tensor_A.host_view(), init_A, seed);
179
+ initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
180
+ initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
181
+
182
+ tensor_A.sync_device();
183
+ tensor_B.sync_device();
184
+ tensor_C.sync_device();
185
+ tensor_D_computed.sync_device();
186
+ tensor_D_reference.sync_device();
187
+ }
188
+
189
+ bool sufficient() const {
190
+ //
191
+ // Determine SMEM requirements and waive if not satisfied
192
+ //
193
+
194
+ size_t smem_size = sizeof(typename Conv2d::UnderlyingKernel::SharedStorage);
195
+
196
+ cudaDeviceProp properties;
197
+ int device_idx;
198
+ cudaError_t result = cudaGetDevice(&device_idx);
199
+
200
+ if (result != cudaSuccess) {
201
+ throw std::runtime_error("cudaGetDevice() API call failed.");
202
+ }
203
+
204
+ result = cudaGetDeviceProperties(&properties, device_idx);
205
+
206
+ if (result != cudaSuccess) {
207
+ throw std::runtime_error("cudaGetDeviceProperties() failed");
208
+ }
209
+
210
+ if (properties.sharedMemPerBlockOptin < smem_size) {
211
+ return false;
212
+ }
213
+
214
+ return true;
215
+ }
216
+
217
+ /// Executes one test
218
+ bool run(
219
+ cutlass::conv::Conv2dProblemSize const &problem_size,
220
+ cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
221
+ ElementCompute alpha = ElementCompute(1),
222
+ ElementCompute beta = ElementCompute(0)) {
223
+
224
+ // Waive test if insufficient CUDA device
225
+ if (!sufficient()) {
226
+ if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
227
+ std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
228
+ }
229
+ return true;
230
+ }
231
+
232
+ // increment tested problem count run by the testbed
233
+ tested_problem_count++;
234
+
235
+ #if 0 // display conv2d problem size for debugging
236
+ std::cout << problem_size << std::endl
237
+ << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
238
+ << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
239
+ << std::endl;
240
+ #endif
241
+
242
+ initialize(problem_size);
243
+
244
+ // configure the operator
245
+ Conv2d conv2d_op;
246
+
247
+ typename Conv2d::Arguments conv2d_args(
248
+ problem_size,
249
+ tensor_A.device_ref(),
250
+ tensor_B.device_ref(),
251
+ tensor_C.device_ref(),
252
+ tensor_D_computed.device_ref(),
253
+ {alpha, beta},
254
+ split_k_mode
255
+ );
256
+
257
+ // find workspace requirement for parallel split-k reduction
258
+ size_t workspace_size = Conv2d::get_workspace_size(conv2d_args);
259
+
260
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
261
+
262
+ cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get());
263
+
264
+ if (status != cutlass::Status::kSuccess) {
265
+ cudaError_t error = cudaGetLastError();
266
+ std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
267
+ return true;
268
+ }
269
+
270
+ // conv2d operation with parallel split-k-mode
271
+ if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {
272
+
273
+ // conv2d output is written to workspace in global memory
274
+ conv2d_args.ref_D.reset(reinterpret_cast<ElementC*>(workspace.get()));
275
+ // accumulate mma for each cta in k-dimension (1.0 * A * B)
276
+ conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)};
277
+ // update conv2d operator arguments
278
+ status = conv2d_op.update(conv2d_args, workspace.get());
279
+ }
280
+
281
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
282
+ if (status != cutlass::Status::kSuccess) {
283
+ return false;
284
+ }
285
+
286
+ // run conv2d operator
287
+ status = conv2d_op();
288
+
289
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
290
+ if (status != cutlass::Status::kSuccess) {
291
+ std::cerr << "Failed to run." << std::endl;
292
+ return false;
293
+ }
294
+
295
+
296
+ if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {
297
+
298
+ // configure parallel reduction operator
299
+ ReductionDevice reduction_op;
300
+
301
+ typename ReductionDevice::Arguments reduction_args(
302
+ cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(),
303
+ problem_size.split_k_slices,
304
+ cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
305
+ {
306
+ reinterpret_cast<ElementAccumulator*> (workspace.get()),
307
+ ReductionStrideIndex(tensor_C.stride()[Conv2d::UnderlyingKernel::kTensorCStrideIdx])
308
+ },
309
+ {
310
+ tensor_D_computed.device_data(),
311
+ ReductionStrideIndex(tensor_C.stride()[Conv2d::UnderlyingKernel::kTensorCStrideIdx])
312
+ },
313
+ {
314
+ tensor_C.device_data(),
315
+ ReductionStrideIndex(tensor_C.stride()[Conv2d::UnderlyingKernel::kTensorCStrideIdx])
316
+ },
317
+ // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C
318
+ {alpha, beta}
319
+ );
320
+
321
+ status = reduction_op.initialize(reduction_args, nullptr);
322
+
323
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
324
+ if (status != cutlass::Status::kSuccess) {
325
+ return false;
326
+ }
327
+
328
+ // run prallel reduction kernel
329
+ status = reduction_op();
330
+
331
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
332
+ if (status != cutlass::Status::kSuccess) {
333
+ return false;
334
+ }
335
+ }
336
+ bool passed = false;
337
+
338
+ cudaError_t result = cudaDeviceSynchronize();
339
+ EXPECT_EQ(result, cudaSuccess) << " device reference error: "
340
+ << cudaGetErrorString(result);
341
+
342
+ tensor_D_computed.sync_host();
343
+
344
+ //
345
+ // Reference check - support caching results
346
+ //
347
+
348
+ CachedTestKey cached_test_key = CreateCachedConv2dTestKey<
349
+ ElementA, LayoutA,
350
+ ElementB, LayoutB,
351
+ ElementC, LayoutC,
352
+ ElementAccumulator,
353
+ ElementCompute
354
+ >(
355
+ kConvolutionalOperator,
356
+ problem_size,
357
+ alpha,
358
+ beta,
359
+ tensor_A.host_view(),
360
+ tensor_B.host_view(),
361
+ tensor_C.host_view()
362
+ );
363
+
364
+ //
365
+ // Look for the cached key
366
+ //
367
+
368
+ bool cached_result_loaded = false;
369
+ CachedTestResult cached_test_result;
370
+
371
+ std::string conv2d_result_cache_name =
372
+ std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
373
+
374
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
375
+
376
+ CachedTestResultListing cached_results(conv2d_result_cache_name);
377
+
378
+ auto cached = cached_results.find(cached_test_key);
379
+
380
+ cached_result_loaded = cached.first;
381
+ if (cached_result_loaded) {
382
+ cached_test_result = cached.second;
383
+ }
384
+ }
385
+
386
+ if (!cached_result_loaded) {
387
+
388
+ #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
389
+
390
+ cutlass::reference::device::Conv2d<
391
+ ElementA,
392
+ LayoutA,
393
+ ElementB,
394
+ LayoutB,
395
+ ElementC,
396
+ LayoutC,
397
+ ElementCompute,
398
+ ElementAccumulator
399
+ >(
400
+ kConvolutionalOperator,
401
+ problem_size,
402
+ tensor_A.device_ref(),
403
+ tensor_B.device_ref(),
404
+ tensor_C.device_ref(),
405
+ tensor_D_reference.device_ref(),
406
+ alpha,
407
+ beta);
408
+
409
+ // sync host (copy device data to host) for dumping error output in case of mismatches
410
+ tensor_D_reference.sync_host();
411
+
412
+ #else
413
+
414
+ cutlass::reference::host::Conv2d<
415
+ ElementA,
416
+ LayoutA,
417
+ ElementB,
418
+ LayoutB,
419
+ ElementC,
420
+ LayoutC,
421
+ ElementCompute,
422
+ ElementAccumulator
423
+ >(
424
+ kConvolutionalOperator,
425
+ problem_size,
426
+ tensor_A.host_ref(),
427
+ tensor_B.host_ref(),
428
+ tensor_C.host_ref(),
429
+ tensor_D_reference.host_ref(),
430
+ alpha,
431
+ beta);
432
+
433
+ #endif
434
+
435
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
436
+
437
+ cached_test_result.D = TensorHash(tensor_D_reference.host_view());
438
+
439
+ CachedTestResultListing cached_results(conv2d_result_cache_name);
440
+
441
+ cached_results.append(cached_test_key, cached_test_result);
442
+ cached_results.write(conv2d_result_cache_name);
443
+ }
444
+ } // if (!cached_result_loaded)
445
+
446
+ uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view());
447
+
448
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
449
+ passed = (tensor_D_hash == cached_test_result.D);
450
+
451
+ EXPECT_EQ(tensor_D_hash, cached_test_result.D)
452
+ << "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n";
453
+ }
454
+ else {
455
+
456
+ passed = cutlass::reference::host::TensorEquals(
457
+ tensor_D_computed.host_view(),
458
+ tensor_D_reference.host_view());
459
+ }
460
+
461
+ EXPECT_TRUE(passed);
462
+
463
+ std::stringstream ss_problem_size_text;
464
+ ss_problem_size_text << "nhwc_"
465
+ << problem_size.N << "x"
466
+ << problem_size.H << "x"
467
+ << problem_size.W << "x"
468
+ << problem_size.C
469
+ << "_krsc_"
470
+ << problem_size.K << "x"
471
+ << problem_size.R << "x"
472
+ << problem_size.S << "x"
473
+ << problem_size.C
474
+ << "_padding_"
475
+ << problem_size.pad_h << "x"
476
+ << problem_size.pad_w
477
+ << "_stride_"
478
+ << problem_size.stride_h << "x"
479
+ << problem_size.stride_w
480
+ << "_dilation_"
481
+ << problem_size.dilation_h << "x"
482
+ << problem_size.dilation_w << "_"
483
+ << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_");
484
+
485
+ if (!passed) {
486
+ std::stringstream fname;
487
+
488
+ fname << "error_Conv2d_ImplicitGemm_device_"
489
+ << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
490
+ << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
491
+ (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" :
492
+ (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDeconv ? "deconv_" : "wgrad_")))
493
+ << ss_problem_size_text.str()
494
+ << Conv2d::ThreadblockShape::kM << "x"
495
+ << Conv2d::ThreadblockShape::kN << "x"
496
+ << Conv2d::ThreadblockShape::kK << "_"
497
+ << Conv2d::WarpShape::kM << "x"
498
+ << Conv2d::WarpShape::kN << "x"
499
+ << Conv2d::WarpShape::kK << ".txt";
500
+
501
+ std::cout << fname.str() << std::endl;
502
+
503
+ std::ofstream results(fname.str());
504
+
505
+ results << problem_size << std::endl;
506
+
507
+ results
508
+ << "\nA:\n" << tensor_A.host_view() << "\n"
509
+ << "\nB:\n" << tensor_B.host_view() << "\n"
510
+ << "\nC:\n" << tensor_C.host_view() << "\n";
511
+
512
+ results << "\nD reference (hash: " << cached_test_result.D << ")\n";
513
+
514
+ if (!cached_result_loaded) {
515
+ results
516
+ << tensor_D_reference.host_view() << "\n";
517
+ }
518
+
519
+ results
520
+ << "\nD computed (hash: " << tensor_D_hash << ")\n"
521
+ << tensor_D_computed.host_view() << "\n";
522
+
523
+ }
524
+
525
+ return passed;
526
+ }
527
+
528
+ };
529
+
530
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
531
+
532
+ template <typename ImplicitGemm>
533
+ bool TestSpecificConv2d(
534
+ const Conv2dProblemVector & problem_sizes) {
535
+
536
+ bool passed = true;
537
+
538
+ //
539
+ // Testbed object
540
+ //
541
+
542
+ TestbedConv2d<ImplicitGemm> testbed;
543
+
544
+ // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
545
+ for(auto conv_problem : problem_sizes) {
546
+
547
+ //
548
+ // Test
549
+ //
550
+
551
+ // test mode = xcross
552
+ passed = testbed.run(
553
+ conv_problem,
554
+ cutlass::conv::SplitKMode::kSerial);
555
+
556
+ if (!passed) {
557
+ return false;
558
+ }
559
+
560
+ // test mode = convolution
561
+ passed = testbed.run(
562
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
563
+ cutlass::conv::SplitKMode::kSerial);
564
+
565
+ if (!passed) {
566
+ return false;
567
+ }
568
+ }
569
+
570
+ return true;
571
+ }
572
+
573
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////
574
+ // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
575
+ // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
576
+ // Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
577
+ // (conv_blacklist_sizes)
578
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
579
+ template <typename ImplicitGemm>
580
+ bool TestAllConv2d(
581
+ const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(),
582
+ const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) {
583
+
584
+ bool passed = true;
585
+
586
+ //
587
+ // Testbed object
588
+ //
589
+
590
+ TestbedConv2d<ImplicitGemm> testbed;
591
+
592
+ //
593
+ // Get conv problem sizes to run conv operator
594
+ //
595
+ TestbedConv2dProblemSizes conv_problems(128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);
596
+
597
+ // Vector of conv2d problem sizes to avoid duplicate runs
598
+ Conv2dProblemVector conv_tested_sizes;
599
+
600
+ // Vectors of Conv2dProblemVector (lenient/easiest to rigorous problem sizes)
601
+ std::vector<Conv2dProblemVector> problem_vectors = {
602
+ conv_test_sizes, // run user specified sizes
603
+ conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes
604
+ //conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
605
+ #if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
606
+ conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
607
+ #endif
608
+ };
609
+
610
+ // Flatten 2D problem_vectors into a 1D problem_sizes
611
+ std::vector<cutlass::conv::Conv2dProblemSize> problem_sizes;
612
+ for (auto problem_vector : problem_vectors) {
613
+ for(auto conv_problem : problem_vector) {
614
+ problem_sizes.push_back(conv_problem);
615
+ }
616
+ }
617
+
618
+ // If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reverse the order (rigorous to lenient)
619
+ // run the most rigorous problem size first
620
+ if (CutlassUnitTestProblemCount()) {
621
+ std::reverse(problem_sizes.begin(), problem_sizes.end());
622
+ }
623
+
624
+ // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
625
+ for(auto conv_problem : problem_sizes) {
626
+
627
+ // Skip blacklist and avoid duplicate problem sizes
628
+ if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
629
+ std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
630
+ continue;
631
+ }
632
+
633
+ //
634
+ // Procedurally disable certain cases
635
+ //
636
+
637
+ // CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
638
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
639
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
640
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
641
+ cutlass::conv::StrideSupport::kUnity)) {
642
+ if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
643
+ continue;
644
+ }
645
+ }
646
+
647
+ // Fixed channels algorithm requires channel count to match access size
648
+ if (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kIteratorAlgorithm ==
649
+ cutlass::conv::IteratorAlgorithm::kFixedChannels) {
650
+ if (conv_problem.C != ImplicitGemm::UnderlyingKernel::Mma::IteratorA::AccessType::kElements) {
651
+ continue;
652
+ }
653
+ }
654
+
655
+ // Few channels algorithm requires channel count to match access size
656
+ if (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kIteratorAlgorithm ==
657
+ cutlass::conv::IteratorAlgorithm::kFewChannels) {
658
+ if (conv_problem.C % ImplicitGemm::UnderlyingKernel::Mma::IteratorA::AccessType::kElements) {
659
+ continue;
660
+ }
661
+ }
662
+
663
+ // CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
664
+ // Although strided dgrad works for all stride combinations, we are only going
665
+ // to run strided dgrad for non-unity strides
666
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
667
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
668
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
669
+ cutlass::conv::StrideSupport::kStrided)) {
670
+ if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
671
+ continue;
672
+ }
673
+ }
674
+
675
+ //
676
+ // Test
677
+ //
678
+ // push back tested problem size to avoid re-running duplicates
679
+ conv_tested_sizes.push_back(conv_problem);
680
+
681
+ // test mode = xcross
682
+ passed = testbed.run(
683
+ conv_problem,
684
+ cutlass::conv::SplitKMode::kSerial);
685
+
686
+ if (!passed) {
687
+ return false;
688
+ }
689
+
690
+ // test mode = convolution
691
+ passed = testbed.run(
692
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
693
+ cutlass::conv::SplitKMode::kSerial);
694
+
695
+ if (!passed) {
696
+ return false;
697
+ }
698
+
699
+ // If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts
700
+ if (CutlassUnitTestProblemCount() &&
701
+ testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
702
+ return true;
703
+ }
704
+ }
705
+
706
+ // Small-channels convolution can't run here.
707
+ if (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kIteratorAlgorithm ==
708
+ cutlass::conv::IteratorAlgorithm::kFixedChannels) {
709
+
710
+ return true;
711
+ }
712
+
713
+ // Small-channels convolution can't run here.
714
+ if (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kIteratorAlgorithm ==
715
+ cutlass::conv::IteratorAlgorithm::kFewChannels) {
716
+
717
+ return true;
718
+ }
719
+
720
+ // CUTLASS DGRAD's *strided* specialization does not support split-k mode
721
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
722
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
723
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
724
+ cutlass::conv::StrideSupport::kStrided)) {
725
+
726
+ passed = testbed.run(
727
+ cutlass::conv::Conv2dProblemSize(
728
+ {1, 56, 56, 8}, // input size (NHWC)
729
+ {8, 1, 1, 8}, // filter size (KRSC)
730
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
731
+ {2, 2}, // stride (stride_h, stride_w)
732
+ {1, 1}), // dilation (dilation_h, dilation_w)
733
+ cutlass::conv::SplitKMode::kSerial,
734
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0),
735
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0));
736
+
737
+ passed = testbed.run(
738
+ cutlass::conv::Conv2dProblemSize(
739
+ {1, 56, 56, 8}, // input size (NHWC)
740
+ {8, 1, 1, 8}, // filter size (KRSC)
741
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
742
+ {1, 1}, // stride (stride_h, stride_w)
743
+ {1, 1}) // dilation (dilation_h, dilation_w)
744
+ .reset_split_k_slices(2),
745
+ cutlass::conv::SplitKMode::kSerial,
746
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0),
747
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0));
748
+
749
+ if (!passed) {
750
+ return false;
751
+ }
752
+
753
+ return passed;
754
+ }
755
+ // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
756
+ // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
757
+ // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
758
+ // alpha and beta for local testing, but only runs one value for alpha and beta.
759
+ cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
760
+ {1, 17, 11, 288}, // input size (NHWC)
761
+ {160, 3, 3, 288}, // filter size (KRSC)
762
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
763
+ {1, 1}, // stride (stride_h, stride_w)
764
+ {1, 1} // dilation (dilation_h, dilation_w)
765
+ );
766
+
767
+ cutlass::conv::SplitKMode split_k_modes [] = {
768
+ cutlass::conv::SplitKMode::kSerial,
769
+ cutlass::conv::SplitKMode::kParallel,
770
+ };
771
+
772
+ int split_k_slices[] = {
773
+ 1, 2, 3, 4, 201
774
+ };
775
+
776
+ double problem_alpha[] = {
777
+ 2.0
778
+ };
779
+
780
+ double problem_beta[] = {
781
+ 2.0
782
+ };
783
+
784
+ for (auto split_k_mode : split_k_modes) {
785
+ for (auto split_k_slice : split_k_slices) {
786
+ for (auto alpha : problem_alpha) {
787
+ for (auto beta : problem_beta) {
788
+
789
+ passed = testbed.run(
790
+ conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
791
+ split_k_mode,
792
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha),
793
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta));
794
+
795
+ if (!passed) {
796
+ return false;
797
+ }
798
+
799
+ // If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts
800
+ if (CutlassUnitTestProblemCount() &&
801
+ testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
802
+ return true;
803
+ }
804
+ }
805
+ }
806
+ }
807
+ }
808
+
809
+ return passed;
810
+ }
811
+
812
+ /////////////////////////////////////////////////////////////////////////////////////////////////
813
+
814
+ } // namespace device
815
+ } // namespace conv
816
+ } // namespace test
817
+
818
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_testbed_interleaved.h ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM testbed
33
+ */
34
+ #pragma once
35
+
36
+ #include <fstream>
37
+
38
+ #include "../../common/cutlass_unit_test.h"
39
+ #include "cutlass/cutlass.h"
40
+
41
+ #include "cutlass/conv/device/implicit_gemm_convolution.h"
42
+ #include "cutlass/reduction/device/reduce_split_k.h"
43
+ #include "cutlass/reduction/thread/reduction_operators.h"
44
+
45
+ #include "conv2d_problems.h"
46
+
47
+ #include "cutlass/util/host_tensor.h"
48
+ #include "cutlass/util/reference/host/tensor_fill.h"
49
+ #include "cutlass/util/reference/device/tensor_compare.h"
50
+ #include "cutlass/util/reference/host/tensor_compare.h"
51
+ #include "cutlass/util/host_reorder.h"
52
+
53
+ #include "cutlass/util/reference/host/convolution.h"
54
+ #include "cutlass/util/reference/device/convolution.h"
55
+
56
+ #include "cutlass/core_io.h"
57
+ #include "cutlass/util/tensor_view_io.h"
58
+
59
+ #include "../cache_testbed_output.h"
60
+
61
+ namespace test {
62
+ namespace conv {
63
+ namespace device {
64
+
65
+ template <typename Conv2d, int InterleavedK>
66
+ class InterleavedTestbedConv2d {
67
+ public:
68
+
69
+ using ElementA = typename Conv2d::ElementA;
70
+ using LayoutA = typename Conv2d::LayoutA;
71
+ using ElementB = typename Conv2d::ElementB;
72
+ using LayoutB = typename Conv2d::LayoutB;
73
+ using ElementC = typename Conv2d::ElementC;
74
+ using LayoutC = typename Conv2d::LayoutC;
75
+ using ElementAccumulator = typename Conv2d::ElementAccumulator;
76
+ using ElementCompute = typename Conv2d::ElementCompute;
77
+ using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp;
78
+
79
+ static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
80
+
81
+ /// Reduction kernel
82
+ using ReductionOp = cutlass::reduction::thread::ReduceAdd<
83
+ ElementAccumulator,
84
+ typename EpilogueOutputOp::ElementAccumulator,
85
+ EpilogueOutputOp::kCount
86
+ >;
87
+
88
+ using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK<
89
+ cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>,
90
+ EpilogueOutputOp,
91
+ ReductionOp
92
+ >;
93
+
94
+ using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel>;
95
+ using ReductionStrideIndex = typename ReductionDevice::StrideIndex;
96
+
97
+ public:
98
+
99
+ /// Initialization
100
+ cutlass::Distribution::Kind init_A;
101
+ cutlass::Distribution::Kind init_B;
102
+ cutlass::Distribution::Kind init_C;
103
+ uint64_t seed;
104
+
105
+ cutlass::HostTensor<ElementA, LayoutA> tensor_A;
106
+ cutlass::HostTensor<ElementB, LayoutB> tensor_B;
107
+ cutlass::HostTensor<ElementB, LayoutB> tensor_B_reordered;
108
+ cutlass::HostTensor<ElementC, LayoutC> tensor_C;
109
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
110
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
111
+
112
+ public:
113
+
114
+ InterleavedTestbedConv2d(
115
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
116
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
117
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
118
+ uint64_t seed_ = 2080
119
+ ):
120
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
121
+
122
+ }
123
+
124
+ /// Helper to initialize a tensor view
125
+ template <typename Element, typename Layout>
126
+ void initialize_tensor(
127
+ cutlass::TensorView<Element, Layout> view,
128
+ cutlass::Distribution::Kind dist_kind,
129
+ uint64_t seed) {
130
+
131
+ if (dist_kind == cutlass::Distribution::Uniform) {
132
+
133
+ int scope;
134
+ int bits = cutlass::sizeof_bits<Element>::value;
135
+
136
+ if (bits <= 8) {
137
+ scope = 2;
138
+ }
139
+ else if (bits == 16) {
140
+ scope = 3;
141
+ }
142
+ else {
143
+ scope = 8;
144
+ }
145
+ cutlass::reference::host::TensorFillRandomUniform(
146
+ view, seed, scope, -scope, 0);
147
+ }
148
+ else if (dist_kind == cutlass::Distribution::Identity) {
149
+
150
+ cutlass::reference::host::TensorFillIdentity(view);
151
+ }
152
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
153
+
154
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
155
+ }
156
+ else if (dist_kind == cutlass::Distribution::Sequential) {
157
+
158
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
159
+ }
160
+ else {
161
+ }
162
+ }
163
+
164
+ void initialize(
165
+ cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) {
166
+
167
+ tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
168
+ tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
169
+ tensor_B_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
170
+ tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
171
+ tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
172
+ tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
173
+
174
+ initialize_tensor(tensor_A.host_view(), init_A, seed);
175
+ initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
176
+ initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
177
+
178
+ cutlass::reorder_convK<InterleavedK>(
179
+ tensor_B_reordered.host_ref(), tensor_B.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size));
180
+
181
+ tensor_A.sync_device();
182
+ tensor_B.sync_device();
183
+ tensor_B_reordered.sync_device();
184
+ tensor_C.sync_device();
185
+ tensor_D_computed.sync_device();
186
+ tensor_D_reference.sync_device();
187
+ }
188
+
189
+ bool sufficient() const {
190
+ //
191
+ // Determine SMEM requirements and waive if not satisfied
192
+ //
193
+
194
+ size_t smem_size = sizeof(typename Conv2d::UnderlyingKernel::SharedStorage);
195
+
196
+ cudaDeviceProp properties;
197
+ int device_idx;
198
+ cudaError_t result = cudaGetDevice(&device_idx);
199
+
200
+ if (result != cudaSuccess) {
201
+ throw std::runtime_error("cudaGetDevice() API call failed.");
202
+ }
203
+
204
+ result = cudaGetDeviceProperties(&properties, device_idx);
205
+
206
+ if (result != cudaSuccess) {
207
+ throw std::runtime_error("cudaGetDeviceProperties() failed");
208
+ }
209
+
210
+ if (properties.sharedMemPerMultiprocessor < smem_size) {
211
+ return false;
212
+ }
213
+
214
+ return true;
215
+ }
216
+
217
+ /// Executes one test
218
+ bool run(
219
+ cutlass::conv::Conv2dProblemSize const &problem_size,
220
+ cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
221
+ ElementCompute alpha = ElementCompute(1),
222
+ ElementCompute beta = ElementCompute(0)) {
223
+
224
+ // Waive test if insufficient CUDA device
225
+ if (!sufficient()) {
226
+ if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
227
+ std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
228
+ }
229
+ return true;
230
+ }
231
+
232
+ #if 0 //display conv2d problem size for debugging
233
+ std::cout << problem_size << std::endl
234
+ << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl
235
+ << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
236
+ << std::endl;
237
+ #endif
238
+
239
+ initialize(problem_size);
240
+
241
+ // configure the operator
242
+ Conv2d conv2d_op;
243
+
244
+ typename Conv2d::Arguments conv2d_args(
245
+ problem_size,
246
+ tensor_A.device_ref(),
247
+ tensor_B_reordered.device_ref(),
248
+ tensor_C.device_ref(),
249
+ tensor_D_computed.device_ref(),
250
+ {alpha, beta},
251
+ split_k_mode
252
+ );
253
+
254
+ // find workspace requirement for parallel split-k reduction
255
+ size_t workspace_size = Conv2d::get_workspace_size(conv2d_args);
256
+
257
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
258
+
259
+ cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get());
260
+
261
+ // conv2d operation with parallel split-k-mode
262
+ if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {
263
+
264
+ // conv2d output is written to workspace in global memory
265
+ conv2d_args.ref_D.reset(reinterpret_cast<ElementC*>(workspace.get()));
266
+ // accumulate mma for each cta in k-dimension (1.0 * A * B)
267
+ conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)};
268
+ // update conv2d operator arguments
269
+ status = conv2d_op.update(conv2d_args, workspace.get());
270
+ }
271
+
272
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
273
+ if (status != cutlass::Status::kSuccess) {
274
+ return false;
275
+ }
276
+
277
+ // run conv2d operator
278
+ status = conv2d_op();
279
+
280
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
281
+ if (status != cutlass::Status::kSuccess) {
282
+ return false;
283
+ }
284
+
285
+ if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {
286
+
287
+ // configure parallel reduction operator
288
+ ReductionDevice reduction_op;
289
+
290
+ typename ReductionDevice::Arguments reduction_args(
291
+ cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(),
292
+ problem_size.split_k_slices,
293
+ cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
294
+ {
295
+ reinterpret_cast<ElementAccumulator*> (workspace.get()),
296
+ ReductionStrideIndex(tensor_C.stride()[Conv2d::UnderlyingKernel::kTensorCStrideIdx])
297
+ },
298
+ {
299
+ tensor_D_computed.device_data(),
300
+ ReductionStrideIndex(tensor_C.stride()[Conv2d::UnderlyingKernel::kTensorCStrideIdx])
301
+ },
302
+ {
303
+ tensor_C.device_data(),
304
+ ReductionStrideIndex(tensor_C.stride()[Conv2d::UnderlyingKernel::kTensorCStrideIdx])
305
+ },
306
+ // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C
307
+ {alpha, beta}
308
+ );
309
+
310
+ status = reduction_op.initialize(reduction_args, nullptr);
311
+
312
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
313
+ if (status != cutlass::Status::kSuccess) {
314
+ return false;
315
+ }
316
+
317
+ // run prallel reduction kernel
318
+ status = reduction_op();
319
+
320
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
321
+ if (status != cutlass::Status::kSuccess) {
322
+ return false;
323
+ }
324
+ }
325
+ bool passed = false;
326
+
327
+ tensor_D_computed.sync_host();
328
+
329
+ //
330
+ // Reference check - support caching results
331
+ //
332
+
333
+ CachedTestKey cached_test_key = CreateCachedConv2dTestKey<
334
+ ElementA, LayoutA,
335
+ ElementB, LayoutB,
336
+ ElementC, LayoutC,
337
+ ElementAccumulator,
338
+ ElementCompute
339
+ >(
340
+ kConvolutionalOperator,
341
+ problem_size,
342
+ alpha,
343
+ beta,
344
+ tensor_A.host_view(),
345
+ tensor_B.host_view(),
346
+ tensor_C.host_view()
347
+ );
348
+
349
+ //
350
+ // Look for the cached key
351
+ //
352
+
353
+ bool cached_result_loaded = false;
354
+ CachedTestResult cached_test_result;
355
+
356
+ std::string conv2d_result_cache_name =
357
+ std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
358
+
359
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
360
+
361
+ CachedTestResultListing cached_results(conv2d_result_cache_name);
362
+
363
+ auto cached = cached_results.find(cached_test_key);
364
+
365
+ cached_result_loaded = cached.first;
366
+ if (cached_result_loaded) {
367
+ cached_test_result = cached.second;
368
+ }
369
+ }
370
+
371
+ if (!cached_result_loaded) {
372
+
373
+ #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
374
+
375
+ cutlass::reference::device::Conv2d<
376
+ ElementA,
377
+ LayoutA,
378
+ ElementB,
379
+ LayoutB,
380
+ ElementC,
381
+ LayoutC,
382
+ ElementCompute,
383
+ ElementAccumulator,
384
+ cutlass::NumericConverterClamp<ElementC, ElementCompute>
385
+ >(
386
+ kConvolutionalOperator,
387
+ problem_size,
388
+ tensor_A.device_ref(),
389
+ tensor_B.device_ref(),
390
+ tensor_C.device_ref(),
391
+ tensor_D_reference.device_ref(),
392
+ alpha,
393
+ beta);
394
+
395
+ cudaError_t result = cudaDeviceSynchronize();
396
+ EXPECT_EQ(result, cudaSuccess) << " device reference error: "
397
+ << cudaGetErrorString(result);
398
+
399
+ // sync host (copy device data to host) for dumping error output in case of mismatches
400
+ tensor_D_reference.sync_host();
401
+
402
+ #else
403
+
404
+ cutlass::reference::host::Conv2d<
405
+ ElementA,
406
+ LayoutA,
407
+ ElementB,
408
+ LayoutB,
409
+ ElementC,
410
+ LayoutC,
411
+ ElementCompute,
412
+ ElementAccumulator,
413
+ ElementC,
414
+ cutlass::NumericConverterClamp<ElementC, ElementCompute>
415
+ >(
416
+ kConvolutionalOperator,
417
+ problem_size,
418
+ tensor_A.host_ref(),
419
+ tensor_B.host_ref(),
420
+ tensor_C.host_ref(),
421
+ tensor_D_reference.host_ref(),
422
+ alpha,
423
+ beta);
424
+
425
+ #endif
426
+
427
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
428
+
429
+ cached_test_result.D = TensorHash(tensor_D_reference.host_view());
430
+
431
+ CachedTestResultListing cached_results(conv2d_result_cache_name);
432
+
433
+ cached_results.append(cached_test_key, cached_test_result);
434
+ cached_results.write(conv2d_result_cache_name);
435
+ }
436
+ } // if (!cached_result_loaded)
437
+
438
+ uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view());
439
+
440
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
441
+ passed = (tensor_D_hash == cached_test_result.D);
442
+
443
+ EXPECT_EQ(tensor_D_hash, cached_test_result.D)
444
+ << "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n";
445
+ }
446
+ else {
447
+
448
+ passed = cutlass::reference::host::TensorEquals(
449
+ tensor_D_computed.host_view(),
450
+ tensor_D_reference.host_view());
451
+ }
452
+
453
+ EXPECT_TRUE(passed);
454
+
455
+ if (!passed) {
456
+ std::stringstream fname;
457
+
458
+ fname << "error_Conv2d_ImplicitGemm_device_"
459
+ << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
460
+ << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
461
+ (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_"))
462
+ << "ncxhwx_"
463
+ << problem_size.N << "x"
464
+ << problem_size.H << "x"
465
+ << problem_size.W << "x"
466
+ << problem_size.C
467
+ << "_cxrskx_"
468
+ << problem_size.K << "x"
469
+ << problem_size.R << "x"
470
+ << problem_size.S << "x"
471
+ << problem_size.C
472
+ << "_padding_"
473
+ << problem_size.pad_h << "x"
474
+ << problem_size.pad_w
475
+ << "_stride_"
476
+ << problem_size.stride_h << "x"
477
+ << problem_size.stride_w
478
+ << "_dilation_"
479
+ << problem_size.dilation_h << "x"
480
+ << problem_size.dilation_w << "_"
481
+ << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
482
+ << Conv2d::ThreadblockShape::kM << "x"
483
+ << Conv2d::ThreadblockShape::kN << "x"
484
+ << Conv2d::ThreadblockShape::kK << "_"
485
+ << Conv2d::WarpShape::kM << "x"
486
+ << Conv2d::WarpShape::kN << "x"
487
+ << Conv2d::WarpShape::kK << ".txt";
488
+
489
+ std::cout << fname.str() << std::endl;
490
+
491
+ std::ofstream results(fname.str());
492
+
493
+ results << problem_size << std::endl;
494
+
495
+ results
496
+ << "\nA:\n" << tensor_A.host_view() << "\n"
497
+ << "\nB:\n" << tensor_B.host_view() << "\n"
498
+ << "\nC:\n" << tensor_C.host_view() << "\n";
499
+
500
+ results << "\nD reference (hash: " << cached_test_result.D << ")\n";
501
+
502
+ if (!cached_result_loaded) {
503
+ results
504
+ << tensor_D_reference.host_view() << "\n";
505
+ }
506
+
507
+ results
508
+ << "\nD computed (hash: " << tensor_D_hash << ")\n"
509
+ << tensor_D_computed.host_view() << "\n";
510
+
511
+ }
512
+
513
+ return passed;
514
+ }
515
+
516
+ };
517
+
518
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////
519
+ // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
520
+ // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
521
+ // Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
522
+ // (conv_blacklist_sizes)
523
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
524
+ template <typename ImplicitGemm, int InterleavedK>
525
+ bool TestAllInterleavedConv2d(
526
+ const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(),
527
+ const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) {
528
+
529
+ bool passed = true;
530
+
531
+ //
532
+ // Testbed object
533
+ //
534
+
535
+ InterleavedTestbedConv2d<ImplicitGemm, InterleavedK> testbed;
536
+
537
+ //
538
+ // Get conv problem sizes to run conv operator
539
+ //
540
+ TestbedConv2dProblemSizes conv_problems(InterleavedK); // minimum channel size must be multiple of InterleavedK for interleaved layout
541
+
542
+ // Vector of conv2d problem sizes to avoid duplicate runs
543
+ Conv2dProblemVector conv_tested_sizes;
544
+
545
+ Conv2dProblemVector const *problem_vectors[] = {
546
+ &conv_test_sizes, // run user specified sizes
547
+ &conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes
548
+ &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
549
+ #if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
550
+ &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
551
+ #endif
552
+ };
553
+
554
+ // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
555
+ for (Conv2dProblemVector const * problem_vector : problem_vectors) {
556
+
557
+ ChannelDivisibilitySpecification channel_spec(InterleavedK); //input and output channels must be multiple of InterleavedK
558
+ auto pruned_problem_vector = prune(*problem_vector, channel_spec);
559
+
560
+ // Run conv testbed on default convolution sizes
561
+ for(auto conv_problem : pruned_problem_vector) {
562
+
563
+ // Skip blacklist and avoid duplicate problem sizes
564
+ if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
565
+ std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
566
+ continue;
567
+ }
568
+
569
+ //
570
+ // Procedurally disable certain cases
571
+ //
572
+
573
+ // CUTLASS DGRAD's unity stride specialization only support stride {1, 1}
574
+ if ((ImplicitGemm::kConvolutionalOperator ==
575
+ cutlass::conv::Operator::kDgrad) &&
576
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
577
+ cutlass::conv::StrideSupport::kUnity)) {
578
+ if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
579
+ continue;
580
+ }
581
+ }
582
+
583
+ //
584
+ // Test
585
+ //
586
+ // push back tested problem size to avoid re-running duplicates
587
+ conv_tested_sizes.push_back(conv_problem);
588
+
589
+ // test mode = xcross
590
+ passed = testbed.run(
591
+ conv_problem,
592
+ cutlass::conv::SplitKMode::kSerial);
593
+
594
+ if (!passed) {
595
+ return false;
596
+ }
597
+
598
+ // test mode = convolution
599
+ passed = testbed.run(
600
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
601
+ cutlass::conv::SplitKMode::kSerial);
602
+
603
+ if (!passed) {
604
+ return false;
605
+ }
606
+ }
607
+ }
608
+
609
+ #if 0
610
+ // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
611
+ // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
612
+ // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
613
+ // alpha and beta for local testing, but only runs one value for alpha and beta.
614
+ cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
615
+ {1, 17, 11, 288}, // input size (NHWC)
616
+ {160, 3, 3, 288}, // filter size (KRSC)
617
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
618
+ {1, 1}, // stride (stride_h, stride_w)
619
+ {1, 1} // dilation (dilation_h, dilation_w)
620
+ );
621
+
622
+ cutlass::conv::SplitKMode split_k_modes [] = {
623
+ cutlass::conv::SplitKMode::kSerial,
624
+ cutlass::conv::SplitKMode::kParallel,
625
+ };
626
+
627
+ int split_k_slices[] = {
628
+ 1, 2, 3, 4, 201
629
+ };
630
+
631
+ double problem_alpha[] = {
632
+ 2.0
633
+ };
634
+
635
+ double problem_beta[] = {
636
+ 2.0
637
+ };
638
+
639
+ for (auto split_k_mode : split_k_modes) {
640
+ for (auto split_k_slice : split_k_slices) {
641
+ for (auto alpha : problem_alpha) {
642
+ for (auto beta : problem_beta) {
643
+
644
+ passed = testbed.run(
645
+ conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
646
+ split_k_mode,
647
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha),
648
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta));
649
+
650
+ if (!passed) {
651
+ return false;
652
+ }
653
+ }
654
+ }
655
+ }
656
+ }
657
+ #endif
658
+
659
+ return passed;
660
+ }
661
+
662
+ /////////////////////////////////////////////////////////////////////////////////////////////////
663
+
664
+ } // namespace device
665
+ } // namespace conv
666
+ } // namespace test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_with_absmax_testbed.h ADDED
@@ -0,0 +1,622 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Testbed for running device-level Conv2Ds with absolute maximum calculation and scaling
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include <iostream>
39
+ #include <fstream>
40
+ #include <sstream>
41
+
42
+ #include "conv2d_problems.h"
43
+ #include "../../common/cutlass_unit_test.h"
44
+ #include "../../gemm/device/testbed_utils.h"
45
+
46
+ #include "cutlass/matrix_coord.h"
47
+ #include "cutlass/conv/convolution.h"
48
+ #include "cutlass/layout/matrix.h"
49
+
50
+ #include "cutlass/util/host_tensor.h"
51
+ #include "cutlass/util/tensor_view_io.h"
52
+ #include "cutlass/util/distribution.h"
53
+ #include "cutlass/util/reference/host/convolution.h"
54
+ #include "cutlass/util/reference/host/tensor_copy.h"
55
+ #include "cutlass/util/reference/host/tensor_compare.h"
56
+ #include "cutlass/util/reference/host/tensor_fill.h"
57
+ #include "cutlass/util/reference/host/tensor_reduce.h"
58
+
59
+ namespace test {
60
+ namespace conv {
61
+ namespace device {
62
+
63
+ /////////////////////////////////////////////////////////////////////////////////////////////////
64
+
65
+ template <
66
+ typename Conv,
67
+ template<typename T> class ActivationFunctor
68
+ >
69
+ struct TestbedConv2dWithAbsMax {
70
+
71
+ using ElementAccumulator = typename Conv::ElementAccumulator;
72
+ using ElementCompute = typename Conv::UnderlyingKernel::Epilogue::OutputOp::ElementCompute;
73
+ using ElementScalingFactor = typename Conv::EpilogueOutputOp::ElementScalingFactor;
74
+ using ElementAbsmax = typename Conv::EpilogueOutputOp::ElementAbsmax;
75
+ static cutlass::conv::Operator const kConvolutionalOperator = Conv::kConvolutionalOperator;
76
+
77
+ static bool const kScaleAux = Conv::EpilogueOutputOp::kIsScalingAndAmaxAuxOutputNeeded;
78
+ static bool const kScaleOutput = Conv::EpilogueOutputOp::kIsScalingAndAmaxOutputNeeded;
79
+ bool doScaleA;
80
+ bool doScaleB;
81
+ bool doScaleC;
82
+
83
+ /// Initialization
84
+ cutlass::Distribution::Kind init_A;
85
+ cutlass::Distribution::Kind init_B;
86
+ cutlass::Distribution::Kind init_C;
87
+ uint64_t seed;
88
+
89
+ cutlass::HostTensor<typename Conv::ElementA, typename Conv::LayoutA> tensor_A;
90
+ cutlass::HostTensor<typename Conv::ElementB, typename Conv::LayoutB> tensor_B;
91
+ cutlass::HostTensor<typename Conv::ElementC, typename Conv::LayoutC> tensor_C;
92
+ cutlass::HostTensor<typename Conv::EpilogueOutputOp::ElementAuxOutput, typename Conv::LayoutC> tensor_Aux;
93
+ cutlass::HostTensor<typename Conv::EpilogueOutputOp::ElementOutput, typename Conv::LayoutC> tensor_D;
94
+ cutlass::HostTensor<typename Conv::ElementC, typename Conv::LayoutC> tensor_Vector;
95
+ cutlass::HostTensor<ElementAccumulator, typename Conv::LayoutC> tmp_D;
96
+ cutlass::HostTensor<typename Conv::EpilogueOutputOp::ElementOutput, typename Conv::LayoutC> reference_D;
97
+ cutlass::HostTensor<typename Conv::EpilogueOutputOp::ElementAuxOutput, typename Conv::LayoutC> reference_Aux;
98
+ cutlass::HostTensor<ElementScalingFactor, typename Conv::LayoutC> scale_A;
99
+ cutlass::HostTensor<ElementScalingFactor, typename Conv::LayoutC> scale_B;
100
+ cutlass::HostTensor<ElementScalingFactor, typename Conv::LayoutC> scale_C;
101
+ cutlass::HostTensor<ElementScalingFactor, typename Conv::LayoutC> scale_D;
102
+ cutlass::HostTensor<ElementScalingFactor, typename Conv::LayoutC> scale_Aux;
103
+ cutlass::HostTensor<ElementAbsmax, typename Conv::LayoutC> abs_max_Aux;
104
+ cutlass::HostTensor<ElementAbsmax, typename Conv::LayoutC> abs_max_D;
105
+ cutlass::HostTensor<ElementAbsmax, typename Conv::LayoutC> reference_abs_max_Aux;
106
+ cutlass::HostTensor<ElementAbsmax, typename Conv::LayoutC> reference_abs_max_D;
107
+
108
+ //
109
+ // Methods
110
+ //
111
+
112
+ TestbedConv2dWithAbsMax(
113
+ bool scaleA = true,
114
+ bool scaleB = true,
115
+ bool scaleC = true,
116
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
117
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
118
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
119
+ uint64_t seed_ = 2080
120
+ ):
121
+ doScaleA(scaleA), doScaleB(scaleB), doScaleC(scaleC),
122
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
123
+
124
+ /// Helper to initialize scaling factors
125
+ template <typename Element, typename Layout>
126
+ bool initialize_scale_factor(cutlass::TensorView<Element, Layout> view, uint64_t seed, int bits=0) {
127
+ cutlass::reference::host::TensorFillRandomUniform(view, seed, double(1.), double(0.), bits);
128
+ return true;
129
+ }
130
+
131
+ /// Helper to initialize a tensor view
132
+ template <typename Element, typename Layout>
133
+ bool initialize_tensor(
134
+ cutlass::TensorView<Element, Layout> view,
135
+ cutlass::Distribution::Kind dist_kind,
136
+ uint64_t seed) {
137
+
138
+ if (dist_kind == cutlass::Distribution::Uniform) {
139
+
140
+ double scope_max, scope_min;
141
+ int bits_input = cutlass::sizeof_bits<Element>::value;
142
+ int bits_output = cutlass::sizeof_bits<typename Conv::ElementC>::value;
143
+
144
+ if (bits_input == 1) {
145
+ scope_max = 2;
146
+ scope_min = 0;
147
+ } else if (bits_input <= 8) {
148
+ scope_max = 2;
149
+ scope_min = -2;
150
+ } else if (bits_output == 16) {
151
+ scope_max = 5;
152
+ scope_min = -5;
153
+ } else {
154
+ scope_max = 8;
155
+ scope_min = -8;
156
+ }
157
+
158
+ cutlass::reference::host::TensorFillRandomUniform(
159
+ view, seed, scope_max, scope_min, 0);
160
+ }
161
+ else if (dist_kind == cutlass::Distribution::Identity) {
162
+
163
+ cutlass::reference::host::TensorFillIdentity(view);
164
+ }
165
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
166
+
167
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
168
+ }
169
+ else if (dist_kind == cutlass::Distribution::Sequential) {
170
+
171
+ cutlass::reference::host::BlockFillSequential(
172
+ view.data(), view.capacity());
173
+ }
174
+ else {
175
+ EXPECT_TRUE(false) << "Not implemented";
176
+ return false;
177
+ }
178
+
179
+ return true;
180
+ }
181
+
182
+ /// Initializes data structures
183
+ void initialize(cutlass::conv::Conv2dProblemSize const &problem_size) {
184
+ //
185
+ // Allocate the GEMM workspace
186
+ //
187
+
188
+ tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
189
+ tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
190
+ tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
191
+ tensor_D.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
192
+ tensor_Vector.resize({1, 1, 1, implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size).c()});
193
+ reference_D.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size), false);
194
+ tmp_D.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size), false);
195
+
196
+ EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2019));
197
+ EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2018));
198
+ EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 2017));
199
+ EXPECT_TRUE(initialize_tensor(tensor_Vector.host_view(), init_C, seed + 2020));
200
+
201
+ // It is possible to randomly initialize to all zeros, so override this with non-zeros
202
+ // in the upper left corner of each operand.
203
+ cutlass::Coord<4> origin(0);
204
+ tensor_A.host_view().at(origin) = typename Conv::ElementA(1);
205
+ tensor_B.host_view().at(origin) = typename Conv::ElementB(1);
206
+ tensor_C.host_view().at(origin) = typename Conv::ElementC(1);
207
+ tensor_Vector.host_view().at(origin) = typename Conv::ElementC(1);
208
+
209
+ cutlass::reference::host::TensorFill(tensor_D.host_view());
210
+ cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view());
211
+
212
+ tensor_A.sync_device();
213
+ tensor_B.sync_device();
214
+ tensor_C.sync_device();
215
+ tensor_D.sync_device();
216
+ tensor_Vector.sync_device();
217
+
218
+ int scale_bits = 2;
219
+ if (doScaleA) {
220
+ scale_A.resize({1, 1, 1, 1});
221
+ EXPECT_TRUE(initialize_scale_factor(scale_A.host_view(), seed + 2021, scale_bits));
222
+ scale_A.sync_device();
223
+ }
224
+
225
+ if (doScaleB) {
226
+ scale_B.resize({1, 1, 1, 1});
227
+ EXPECT_TRUE(initialize_scale_factor(scale_B.host_view(), seed + 2022, scale_bits));
228
+ scale_B.sync_device();
229
+ }
230
+
231
+ if (doScaleC) {
232
+ scale_C.resize({1, 1, 1, 1});
233
+ EXPECT_TRUE(initialize_scale_factor(scale_C.host_view(), seed + 2023, scale_bits));
234
+ scale_C.sync_device();
235
+ }
236
+
237
+ if (kScaleOutput) {
238
+ scale_D.resize({1, 1, 1, 1});
239
+ EXPECT_TRUE(initialize_scale_factor(scale_D.host_view(), seed + 2024, scale_bits));
240
+ scale_D.sync_device();
241
+
242
+ abs_max_D.resize({1, 1, 1, 1});
243
+ cutlass::reference::host::TensorFill(abs_max_D.host_view());
244
+ abs_max_D.sync_device();
245
+
246
+ reference_abs_max_D.resize({1, 1, 1, 1});
247
+ }
248
+
249
+ if (kScaleAux) {
250
+ tensor_Aux.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
251
+ cutlass::reference::host::TensorFill(tensor_Aux.host_view());
252
+ tensor_Aux.sync_device();
253
+
254
+ scale_Aux.resize({1, 1, 1, 1});
255
+ EXPECT_TRUE(initialize_scale_factor(scale_Aux.host_view(), seed + 2025, scale_bits));
256
+ scale_Aux.sync_device();
257
+
258
+ abs_max_Aux.resize({1, 1, 1, 1});
259
+ cutlass::reference::host::TensorFill(abs_max_Aux.host_view());
260
+ abs_max_Aux.sync_device();
261
+
262
+ reference_Aux.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size), false);
263
+ reference_abs_max_Aux.resize({1, 1, 1, 1});
264
+ }
265
+ }
266
+
267
+ /// Compares computed reference with device reference and outputs to a file if incorrect
268
+ bool compare_reference(
269
+ cutlass::conv::Conv2dProblemSize const &problem_size,
270
+ ElementCompute alpha,
271
+ ElementCompute beta) {
272
+
273
+ tensor_D.sync_host();
274
+
275
+ EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0);
276
+ EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0);
277
+ EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0);
278
+
279
+ EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0);
280
+ EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0);
281
+ bool passed = cutlass::reference::host::TensorEquals(reference_D.host_view(), tensor_D.host_view());
282
+
283
+ if (kScaleAux) {
284
+ tensor_Aux.sync_host();
285
+ abs_max_Aux.sync_host();
286
+ EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_Aux.host_view()), 0);
287
+ EXPECT_GT(cutlass::reference::host::TensorNorm(abs_max_Aux.host_view()), 0);
288
+ EXPECT_GT(cutlass::reference::host::TensorNorm(reference_Aux.host_view()), 0);
289
+ passed &= cutlass::reference::host::TensorEquals(reference_Aux.host_view(), tensor_Aux.host_view());
290
+ passed &= cutlass::reference::host::TensorEquals(abs_max_Aux.host_view(), reference_abs_max_Aux.host_view());
291
+ }
292
+
293
+ if (kScaleOutput) {
294
+ abs_max_D.sync_host();
295
+ EXPECT_GT(cutlass::reference::host::TensorNorm(abs_max_D.host_view()), 0);
296
+ passed &= cutlass::reference::host::TensorEquals(abs_max_D.host_view(), reference_abs_max_D.host_view());
297
+ }
298
+
299
+ EXPECT_TRUE(passed) << " mismatched reference";
300
+
301
+ if (!passed) {
302
+
303
+ std::ofstream file0("conv_testbed_with_amax_errors_reference.txt");
304
+ std::ofstream file1("conv_testbed_with_amax_errors_computed.txt");
305
+
306
+ std::ofstream file("conv_testbed_with_amax_errors.txt");
307
+
308
+ file
309
+ << "problem: " << problem_size
310
+ << ", alpha: " << alpha << ", beta: " << beta << "\n\n";
311
+
312
+ file
313
+ << "A =\n" << tensor_A.host_view()
314
+ << "\nB =\n" << tensor_B.host_view()
315
+ << "\nC =\n" << tensor_C.host_view()
316
+ << "\nVector =\n" << tensor_Vector.host_view()
317
+ << "\nScaleA = " << scale_A.host_view()
318
+ << "\nScaleB = " << scale_B.host_view()
319
+ << "\nScaleC = " << scale_C.host_view()
320
+ << "\nScaleD = " << scale_D.host_view()
321
+ << "\nScaleAux = " << scale_Aux.host_view()
322
+ << std::endl;
323
+
324
+ file0 << "\n\nReference D =\n" << reference_D.host_view() << std::endl;
325
+ file1 << "\n\nComputed D =\n" << tensor_D.host_view() << std::endl;
326
+ if (kScaleAux) {
327
+ file0 << "\n\nReference Aux =\n" << reference_Aux.host_view() << std::endl;
328
+ file1 << "\n\nComputed Aux =\n" << tensor_Aux.host_view() << std::endl;
329
+ file0 << "\n\nReference Absmax Aux = " << reference_abs_max_Aux.host_view() << std::endl;
330
+ file1 << "\n\nComputed Absmax Aux = " << abs_max_Aux.host_view() << std::endl;
331
+ }
332
+ if (kScaleOutput) {
333
+ file0 << "\n\nReference Absmax D = " << reference_abs_max_D.host_view() << std::endl;
334
+ file1 << "\n\nComputed Absmax D = " << abs_max_D.host_view() << std::endl;
335
+ }
336
+ }
337
+
338
+ return passed;
339
+ }
340
+
341
+ /// Verifies the result is a GEMM
342
+ bool verify(
343
+ cutlass::conv::Conv2dProblemSize const &problem_size,
344
+ ElementCompute alpha,
345
+ ElementCompute beta) {
346
+
347
+ cutlass::Coord<4> origin(0);
348
+ ElementCompute scaled_alpha = alpha;
349
+ if (doScaleA) {
350
+ scaled_alpha *= scale_A.host_view().at(origin);
351
+ }
352
+ if (doScaleB) {
353
+ scaled_alpha *= scale_B.host_view().at(origin);
354
+ }
355
+
356
+ ElementCompute scaled_beta = beta;
357
+ if (doScaleC) {
358
+ scaled_beta *= scale_C.host_view().at(origin);
359
+ }
360
+
361
+ //
362
+ // Verify
363
+ //
364
+
365
+ cutlass::reference::host::Conv2d<
366
+ typename Conv::ElementA, typename Conv::LayoutA,
367
+ typename Conv::ElementB, typename Conv::LayoutB,
368
+ typename Conv::ElementC, typename Conv::LayoutC,
369
+ ElementCompute, ElementAccumulator, ElementAccumulator
370
+ >(
371
+ kConvolutionalOperator,
372
+ problem_size,
373
+ tensor_A.host_ref(),
374
+ tensor_B.host_ref(),
375
+ tensor_C.host_ref(),
376
+ tmp_D.host_ref(),
377
+ scaled_alpha,
378
+ scaled_beta
379
+ );
380
+
381
+ ElementCompute tmp_abs_max_Aux(0.);
382
+ ElementCompute tmp_abs_max_D(0.);
383
+
384
+ cutlass::NumericConverter<ElementCompute, typename Conv::ElementC> cvt_c_to_compute;
385
+ cutlass::NumericConverter<ElementCompute, ElementAccumulator> cvt_accum_to_compute;
386
+ cutlass::NumericConverter<ElementAbsmax, ElementCompute> cvt_compute_to_absmax;
387
+ cutlass::NumericConverter<typename Conv::EpilogueOutputOp::ElementOutput, ElementCompute> cvt_compute_to_d;
388
+ cutlass::NumericConverter<typename Conv::EpilogueOutputOp::ElementAuxOutput, ElementCompute> cvt_compute_to_aux;
389
+
390
+ cutlass::absolute_value_op<ElementCompute> abs;
391
+ cutlass::maximum_with_nan_propogation<ElementCompute> max;
392
+ ActivationFunctor<ElementCompute> act;
393
+
394
+ ElementScalingFactor d_scale = kScaleOutput ? scale_D.host_view().at(origin) : ElementScalingFactor(1.);
395
+
396
+ for (int n = 0; n < problem_size.N; ++n) {
397
+ for (int p = 0; p < problem_size.P; ++p) {
398
+ for (int q = 0; q < problem_size.Q; ++q) {
399
+ for (int k = 0; k < problem_size.K; ++k) {
400
+ ElementCompute intermediate = cvt_accum_to_compute(tmp_D.host_view().at({n, p, q, k}));
401
+ ElementCompute bias = cvt_c_to_compute(tensor_Vector.host_view().at({0, 0, 0, k}));
402
+ ElementCompute aux = intermediate + bias;
403
+ ElementCompute d = act(aux);
404
+ tmp_abs_max_Aux = max(abs(aux), tmp_abs_max_Aux);
405
+ tmp_abs_max_D = max(abs(d), tmp_abs_max_D);
406
+ reference_D.host_view().at({n, p, q, k}) = cvt_compute_to_d(d * d_scale);
407
+
408
+ if (kScaleAux) {
409
+ reference_Aux.host_view().at({n, p, q, k}) = cvt_compute_to_aux(aux * scale_Aux.host_view().at(origin));
410
+ }
411
+ }
412
+ }
413
+ }
414
+ }
415
+ if (kScaleAux) {
416
+ reference_abs_max_Aux.host_view().at(origin) = cvt_compute_to_absmax(tmp_abs_max_Aux);
417
+ }
418
+
419
+ if (kScaleOutput) {
420
+ reference_abs_max_D.host_view().at(origin) = cvt_compute_to_absmax(tmp_abs_max_D);
421
+ }
422
+
423
+ return compare_reference(problem_size, alpha, beta);
424
+ }
425
+
426
+ /// Returns true if the CUDA device is sufficient to execute the kernel.
427
+ bool sufficient() const {
428
+ //
429
+ // Determine SMEM requirements and waive if not satisfied
430
+ //
431
+
432
+ size_t smem_size = sizeof(typename Conv::UnderlyingKernel::SharedStorage);
433
+
434
+ cudaDeviceProp properties;
435
+ int device_idx;
436
+ cudaError_t result = cudaGetDevice(&device_idx);
437
+
438
+ if (result != cudaSuccess) {
439
+ throw std::runtime_error("cudaGetDevice() API call failed.");
440
+ }
441
+
442
+ result = cudaGetDeviceProperties(&properties, device_idx);
443
+
444
+ if (result != cudaSuccess) {
445
+ throw std::runtime_error("cudaGetDeviceProperties() failed");
446
+ }
447
+
448
+ if (properties.sharedMemPerBlockOptin < smem_size) {
449
+ return false;
450
+ }
451
+
452
+ return true;
453
+ }
454
+
455
+ /// Executes one test
456
+ bool run(
457
+ cutlass::conv::Conv2dProblemSize const &problem_size,
458
+ ElementCompute alpha = ElementCompute(1),
459
+ ElementCompute beta = ElementCompute(0))
460
+ {
461
+
462
+ // Waive test if insufficient CUDA device
463
+ if (!sufficient()) {
464
+ if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
465
+ std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
466
+ }
467
+ return true;
468
+ }
469
+
470
+ this->initialize(problem_size);
471
+
472
+ //
473
+ // Initialize the GEMM operator
474
+ //
475
+
476
+ typename Conv::EpilogueOutputOp::Params::ActivationParams activation_params{alpha, beta};
477
+ typename Conv::EpilogueOutputOp::Params epilogue_params{
478
+ activation_params,
479
+ scale_A.device_data(),
480
+ scale_B.device_data(),
481
+ scale_C.device_data(),
482
+ scale_D.device_data(),
483
+ scale_Aux.device_data(),
484
+ abs_max_Aux.device_data(),
485
+ abs_max_D.device_data()
486
+ };
487
+
488
+ typename Conv::Arguments arguments{
489
+ problem_size,
490
+ tensor_A.device_ref(),
491
+ tensor_B.device_ref(),
492
+ tensor_C.device_ref(),
493
+ tensor_D.device_ref(),
494
+ tensor_Aux.device_ref(),
495
+ epilogue_params,
496
+ cutlass::conv::SplitKMode::kSerial,
497
+ tensor_Vector.device_data(),
498
+ 0
499
+ };
500
+
501
+ Conv conv2d_op;
502
+
503
+ cutlass::Status status = conv2d_op.can_implement(arguments);
504
+ EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
505
+
506
+ size_t workspace_size = Conv::get_workspace_size(arguments);
507
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
508
+
509
+ status = conv2d_op.initialize(arguments, workspace.get());
510
+ EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
511
+
512
+ //
513
+ // Run the GEMM
514
+ //
515
+
516
+ status = conv2d_op();
517
+
518
+ EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
519
+
520
+ cudaError_t cuda_error = cudaDeviceSynchronize();
521
+ EXPECT_TRUE(cuda_error == cudaSuccess) << cudaGetErrorString(cuda_error);
522
+
523
+ //
524
+ // Verify
525
+ //
526
+
527
+ bool passed = this->verify(problem_size, alpha, beta);
528
+
529
+ if (!passed) {
530
+ std::cout << "Failed" << std::endl;
531
+ }
532
+
533
+ return passed;
534
+ }
535
+
536
+ };
537
+
538
+ /////////////////////////////////////////////////////////////////////////////////////////////////
539
+
540
+ template <
541
+ typename ImplicitGemm,
542
+ template<typename T> class ActivationFunctor = cutlass::epilogue::thread::Identity
543
+ >
544
+ bool TestAllConv2dWithAbsmax(bool scaleA=true, bool scaleB=true, bool scaleC=true) {
545
+ const Conv2dProblemVector &conv_test_sizes = Conv2dProblemVector();
546
+ const Conv2dProblemVector &conv_blacklist_sizes = Conv2dProblemVector();
547
+
548
+ //
549
+ // Testbed object
550
+ //
551
+
552
+ TestbedConv2dWithAbsMax<ImplicitGemm, ActivationFunctor> testbed(scaleA, scaleB, scaleC);
553
+
554
+ //
555
+ // Get conv problem sizes to run conv operator
556
+ //
557
+ TestbedConv2dProblemSizes conv_problems(128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);
558
+
559
+ // Vector of conv2d problem sizes to avoid duplicate runs
560
+ Conv2dProblemVector conv_tested_sizes;
561
+
562
+ Conv2dProblemVector const *problem_vectors[] = {
563
+ &conv_test_sizes, // run user specified sizes
564
+ &conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes
565
+ &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
566
+ #if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
567
+ &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
568
+ #endif
569
+ };
570
+
571
+ bool passed = true;
572
+
573
+ // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
574
+ for (Conv2dProblemVector const * problem_vector : problem_vectors) {
575
+
576
+ // Prune all problems with channels that aren't divisible by the number of elements accessed per
577
+ // load for operands A and B. This is meant to align with the requirements of iterators used for
578
+ // fprop kernels.
579
+ ChannelDivisibilitySpecification channel_spec(128 / cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);
580
+ auto pruned_problem_vector = prune(*problem_vector, channel_spec);
581
+
582
+ // Run conv testbed on default convolution sizes
583
+ for(auto conv_problem : pruned_problem_vector) {
584
+
585
+ // Skip blacklist and avoid duplicate problem sizes
586
+ if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
587
+ std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
588
+ continue;
589
+ }
590
+
591
+ //
592
+ // Test
593
+ //
594
+ // push back tested problem size to avoid re-running duplicates
595
+ conv_tested_sizes.push_back(conv_problem);
596
+
597
+ // test mode = xcross
598
+ passed &= testbed.run(conv_problem);
599
+
600
+ if (!passed) {
601
+ return false;
602
+ }
603
+
604
+ // test mode = convolution
605
+ passed &= testbed.run(conv_problem.reset_mode(cutlass::conv::Mode::kConvolution));
606
+
607
+ if (!passed) {
608
+ return false;
609
+ }
610
+ }
611
+ }
612
+
613
+ return passed;
614
+ }
615
+
616
+ /////////////////////////////////////////////////////////////////////////////////////////////////
617
+
618
+ } // namespace device
619
+ } // namespace conv
620
+ } // namespace test
621
+
622
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_with_broadcast_testbed.h ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM for fused epilogue broadcast testbed
33
+
34
+ Parallel split-k is not tested because we can just use regular conv kernel
35
+ when we need to use parallel-splitk. Broadcast can happen in the reduction
36
+ kernel.
37
+ */
38
+ #pragma once
39
+
40
+ #include <fstream>
41
+
42
+ #include "../../common/cutlass_unit_test.h"
43
+ #include "cutlass/cutlass.h"
44
+
45
+ #include "cutlass/conv/device/implicit_gemm_convolution.h"
46
+ #include "cutlass/reduction/device/reduce_split_k.h"
47
+ #include "cutlass/reduction/thread/reduction_operators.h"
48
+
49
+ #include "conv2d_problems.h"
50
+
51
+ #include "cutlass/util/host_tensor.h"
52
+ #include "cutlass/util/reference/host/tensor_fill.h"
53
+ #include "cutlass/util/reference/device/tensor_compare.h"
54
+ #include "cutlass/util/reference/host/tensor_compare.h"
55
+
56
+ #include "cutlass/util/reference/host/convolution.h"
57
+ #include "cutlass/util/reference/device/convolution.h"
58
+
59
+ #include "cutlass/core_io.h"
60
+ #include "cutlass/util/tensor_view_io.h"
61
+
62
+ #include "../cache_testbed_output.h"
63
+
64
+ namespace test {
65
+ namespace conv {
66
+ namespace device {
67
+
68
+ /////////////////////////////////////////////////////////////////////////////////////////////////
69
+
70
+ template <typename Conv2d>
71
+ struct Conv2dWithBroadcastReferenceOp {
72
+
73
+ using OutputOp = typename Conv2d::EpilogueOutputOp;
74
+
75
+ using ElementCompute = typename OutputOp::ElementCompute;
76
+ using ElementZ = typename OutputOp::ElementZ;
77
+ using ElementT = typename OutputOp::ElementT;
78
+
79
+ typename OutputOp::BinaryOp binary_op;
80
+ typename OutputOp::ElementwiseOp elementwise_op;
81
+
82
+ Conv2dWithBroadcastReferenceOp() { }
83
+
84
+ void operator()(ElementZ &Z, ElementT &T, ElementCompute conv2d, ElementCompute bias) {
85
+ ElementCompute t_full = binary_op(conv2d, bias);
86
+ T = ElementT(t_full);
87
+
88
+ ElementCompute z_full = elementwise_op(t_full);
89
+ Z = ElementZ(z_full);
90
+ }
91
+ };
92
+
93
+ /////////////////////////////////////////////////////////////////////////////////////////////////
94
+
95
+ // Fused testbed
96
+ //
97
+ // Y = CONV(AB, C)
98
+ //
99
+ // T[n, p, q, k] = ReductionOp(Y[n, p, q, k], Broadcast[k])
100
+ //
101
+ // Z[n, p, q, k] = Elementwise(T[n, p, q, k])
102
+ //
103
+
104
+ template <
105
+ typename Conv2d,
106
+ typename ReferenceOp,
107
+ bool AddBroadcastFirst = false
108
+ >
109
+ class TestbedConv2dWithBroadcast {
110
+ public:
111
+
112
+ using ElementA = typename Conv2d::ElementA;
113
+ using LayoutA = typename Conv2d::LayoutA;
114
+ using ElementB = typename Conv2d::ElementB;
115
+ using LayoutB = typename Conv2d::LayoutB;
116
+ using ElementC = typename Conv2d::ElementC;
117
+ using LayoutC = typename Conv2d::LayoutC;
118
+ using ElementAccumulator = typename Conv2d::ElementAccumulator;
119
+ using ElementCompute = typename Conv2d::ElementCompute;
120
+ using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp;
121
+ using ElementZ = typename EpilogueOutputOp::ElementZ;
122
+ using ElementT = typename EpilogueOutputOp::ElementT;
123
+ using ElementVector = typename EpilogueOutputOp::ElementVector;
124
+
125
+ static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
126
+ static const bool kAddBroadcastFirst = AddBroadcastFirst;
127
+ static const bool kStoreT = EpilogueOutputOp::kStoreT;
128
+
129
+ public:
130
+
131
+ /// Initialization
132
+ cutlass::Distribution::Kind init_A;
133
+ cutlass::Distribution::Kind init_B;
134
+ cutlass::Distribution::Kind init_C;
135
+ uint64_t seed;
136
+
137
+ cutlass::HostTensor<ElementA, LayoutA> tensor_A;
138
+ cutlass::HostTensor<ElementB, LayoutB> tensor_B;
139
+ cutlass::HostTensor<ElementC, LayoutC> tensor_C;
140
+ cutlass::HostTensor<ElementAccumulator, LayoutC> tensor_C_reference;
141
+ cutlass::HostTensor<ElementZ, LayoutC> tensor_Z_computed;
142
+ cutlass::HostTensor<ElementZ, LayoutC> tensor_Z_reference;
143
+ cutlass::HostTensor<ElementT, LayoutC> tensor_T_computed;
144
+ cutlass::HostTensor<ElementT, LayoutC> tensor_T_reference;
145
+ cutlass::HostTensor<ElementAccumulator, LayoutC> tensor_Y_reference;
146
+ cutlass::HostTensor<ElementVector, LayoutC> tensor_Broadcast; // Input Broadcast
147
+
148
+ public:
149
+
150
+ TestbedConv2dWithBroadcast(
151
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
152
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
153
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
154
+ uint64_t seed_ = 2080
155
+ ):
156
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
157
+
158
+ }
159
+
160
+ /// Helper to initialize a tensor view
161
+ template <typename Element, typename Layout>
162
+ void initialize_tensor(
163
+ cutlass::TensorView<Element, Layout> view,
164
+ cutlass::Distribution::Kind dist_kind,
165
+ uint64_t seed) {
166
+
167
+ if (dist_kind == cutlass::Distribution::Uniform) {
168
+
169
+ int scope;
170
+ int bits = cutlass::sizeof_bits<Element>::value;
171
+
172
+ if (bits <= 8) {
173
+ scope = 2;
174
+ }
175
+ else if (bits == 16) {
176
+ if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
177
+ scope = 3;
178
+ }
179
+ else {
180
+ scope = 5;
181
+ }
182
+ }
183
+ else {
184
+ scope = 8;
185
+ }
186
+
187
+ cutlass::reference::host::TensorFillRandomUniform(
188
+ view, seed, scope, -scope, 0);
189
+ }
190
+ else if (dist_kind == cutlass::Distribution::Identity) {
191
+
192
+ cutlass::reference::host::TensorFillIdentity(view);
193
+ }
194
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
195
+
196
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
197
+ }
198
+ else if (dist_kind == cutlass::Distribution::Sequential) {
199
+
200
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
201
+ }
202
+ else {
203
+ }
204
+ }
205
+
206
+ void initialize(
207
+ cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) {
208
+
209
+ tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
210
+ tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
211
+ tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
212
+ tensor_C_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
213
+ tensor_Z_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
214
+ tensor_Z_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
215
+ tensor_T_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
216
+ tensor_T_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
217
+ tensor_Y_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
218
+ tensor_Broadcast.resize({
219
+ 1,
220
+ 1,
221
+ 1,
222
+ implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size).c(),
223
+ });
224
+
225
+ initialize_tensor(tensor_A.host_view(), init_A, seed);
226
+ initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
227
+ initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
228
+ initialize_tensor(tensor_Broadcast.host_view(), init_C, seed * 39);
229
+
230
+ for (int n = 0; n < tensor_C_reference.extent().n(); ++n) {
231
+ for (int p = 0; p < tensor_C_reference.extent().h(); ++p) {
232
+ for (int q = 0; q < tensor_C_reference.extent().w(); ++q) {
233
+ for (int k = 0; k < tensor_C_reference.extent().c(); ++k) {
234
+ tensor_C_reference.at({n, p, q, k}) = ElementAccumulator(tensor_C.at({n, p, q, k}));
235
+ }
236
+ }
237
+ }
238
+ }
239
+
240
+ tensor_A.sync_device();
241
+ tensor_B.sync_device();
242
+ tensor_C.sync_device();
243
+ tensor_Broadcast.sync_device();
244
+ tensor_C_reference.sync_device();
245
+ tensor_Z_computed.sync_device();
246
+ tensor_Z_reference.sync_device();
247
+ tensor_T_computed.sync_device();
248
+ tensor_T_reference.sync_device();
249
+ tensor_Y_reference.sync_device();
250
+ }
251
+
252
+ bool sufficient() const {
253
+ //
254
+ // Determine SMEM requirements and waive if not satisfied
255
+ //
256
+
257
+ size_t smem_size = sizeof(typename Conv2d::UnderlyingKernel::SharedStorage);
258
+
259
+ cudaDeviceProp properties;
260
+ int device_idx;
261
+ cudaError_t result = cudaGetDevice(&device_idx);
262
+
263
+ if (result != cudaSuccess) {
264
+ throw std::runtime_error("cudaGetDevice() API call failed.");
265
+ }
266
+
267
+ result = cudaGetDeviceProperties(&properties, device_idx);
268
+
269
+ if (result != cudaSuccess) {
270
+ throw std::runtime_error("cudaGetDeviceProperties() failed");
271
+ }
272
+
273
+ if (properties.sharedMemPerBlockOptin < smem_size) {
274
+ return false;
275
+ }
276
+
277
+ return true;
278
+ }
279
+
280
+ /// Executes one test
281
+ bool run(
282
+ cutlass::conv::Conv2dProblemSize const &problem_size,
283
+ cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
284
+ ElementCompute alpha = ElementCompute(1),
285
+ ElementCompute beta = ElementCompute(1)) {
286
+
287
+ // Waive test if insufficient CUDA device
288
+ if (!sufficient()) {
289
+ if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
290
+ std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
291
+ }
292
+ return true;
293
+ }
294
+
295
+ #if 0 //display conv2d problem size for debugging
296
+ std::cout << problem_size << std::endl
297
+ << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
298
+ << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
299
+ << std::endl;
300
+ #endif
301
+
302
+ initialize(problem_size);
303
+
304
+ // configure the operator
305
+ Conv2d conv2d_op;
306
+ typename Conv2d::Arguments conv2d_args(
307
+ problem_size,
308
+ tensor_A.device_ref(),
309
+ tensor_B.device_ref(),
310
+ tensor_C.device_ref(),
311
+ tensor_Z_computed.device_ref(),
312
+ {alpha, beta},
313
+ split_k_mode,
314
+ tensor_Broadcast.device_data(),
315
+ kStoreT ? tensor_T_computed.device_data() : nullptr,
316
+ 0, // This must be zero
317
+ implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size).c()
318
+ );
319
+
320
+ // initialize the kernel
321
+ size_t workspace_size = Conv2d::get_workspace_size(conv2d_args);
322
+
323
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
324
+
325
+ cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get());
326
+
327
+ if (status != cutlass::Status::kSuccess) {
328
+ cudaError_t error = cudaGetLastError();
329
+ std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
330
+ return true;
331
+ }
332
+
333
+ // run conv2d operator
334
+ status = conv2d_op();
335
+
336
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
337
+ if (status != cutlass::Status::kSuccess) {
338
+ return false;
339
+ }
340
+
341
+ bool passed = false;
342
+
343
+ cudaError_t result = cudaDeviceSynchronize();
344
+ EXPECT_EQ(result, cudaSuccess) << " device reference error: "
345
+ << cudaGetErrorString(result);
346
+
347
+ tensor_T_computed.sync_host();
348
+ tensor_Z_computed.sync_host();
349
+
350
+ //
351
+ // Reference check
352
+ //
353
+
354
+ // When kAddBroadcastFirst is true, add bias on the host
355
+ ElementCompute beta_ref = kAddBroadcastFirst ? ElementCompute(0) : beta;
356
+
357
+ #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
358
+
359
+ cutlass::reference::device::Conv2d<
360
+ ElementA,
361
+ LayoutA,
362
+ ElementB,
363
+ LayoutB,
364
+ ElementAccumulator,
365
+ LayoutC,
366
+ ElementAccumulator,
367
+ ElementAccumulator
368
+ >(
369
+ kConvolutionalOperator,
370
+ problem_size,
371
+ tensor_A.device_ref(),
372
+ tensor_B.device_ref(),
373
+ tensor_C_reference.device_ref(),
374
+ tensor_Y_reference.device_ref(),
375
+ alpha,
376
+ beta_ref);
377
+
378
+ // sync host (copy device data to host) for dumping error output in case of mismatches
379
+ tensor_Y_reference.sync_host();
380
+
381
+ #else
382
+
383
+ cutlass::reference::host::Conv2d<
384
+ ElementA,
385
+ LayoutA,
386
+ ElementB,
387
+ LayoutB,
388
+ ElementAccumulator,
389
+ LayoutC,
390
+ ElementAccumulator,
391
+ ElementAccumulator
392
+ >(
393
+ kConvolutionalOperator,
394
+ problem_size,
395
+ tensor_A.host_ref(),
396
+ tensor_B.host_ref(),
397
+ tensor_C_reference.host_ref(),
398
+ tensor_Y_reference.host_ref(),
399
+ alpha,
400
+ beta_ref);
401
+
402
+ #endif
403
+ ReferenceOp reference_op;
404
+
405
+ // compute tensor Z and tensor T
406
+ for (int n = 0; n < problem_size.N; ++n) {
407
+ for (int p = 0; p < (kConvolutionalOperator == cutlass::conv::Operator::kFprop ? problem_size.P : problem_size.H); ++p) {
408
+ for (int q = 0; q < (kConvolutionalOperator == cutlass::conv::Operator::kFprop ? problem_size.Q : problem_size.W); ++q) {
409
+ for (int k = 0; k < (kConvolutionalOperator == cutlass::conv::Operator::kFprop ? problem_size.K : problem_size.C); ++k) {
410
+
411
+ ElementZ z{};
412
+ ElementT t{};
413
+
414
+ ElementCompute accum = tensor_Y_reference.at({n, p, q, k});
415
+ ElementCompute bias = ElementCompute(tensor_Broadcast.at({0, 0, 0, k}));
416
+
417
+
418
+ if (kAddBroadcastFirst) {
419
+ reference_op(z, t, accum + bias,
420
+ beta * ElementCompute(tensor_C_reference.at({n, p, q, k})));
421
+ } else {
422
+ reference_op(z, t, accum, bias);
423
+ }
424
+
425
+ tensor_Z_reference.at({n, p, q, k}) = z;
426
+ tensor_T_reference.at({n, p, q, k}) = t;
427
+ }
428
+ }
429
+ }
430
+ }
431
+
432
+ if (kStoreT) {
433
+ passed = cutlass::reference::host::TensorEquals(
434
+ tensor_T_computed.host_view(),
435
+ tensor_T_reference.host_view());
436
+
437
+ EXPECT_TRUE(passed);
438
+ }
439
+
440
+ passed = cutlass::reference::host::TensorEquals(
441
+ tensor_Z_computed.host_view(),
442
+ tensor_Z_reference.host_view());
443
+
444
+ EXPECT_TRUE(passed);
445
+
446
+ if (!passed) {
447
+ std::stringstream fname;
448
+
449
+ fname << "error_Conv2d_ImplicitGemm_device_"
450
+ << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
451
+ << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
452
+ (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" :
453
+ (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDeconv ? "deconv_" : "wgrad_")))
454
+ << "nhwc_"
455
+ << problem_size.N << "x"
456
+ << problem_size.H << "x"
457
+ << problem_size.W << "x"
458
+ << problem_size.C
459
+ << "_krsc_"
460
+ << problem_size.K << "x"
461
+ << problem_size.R << "x"
462
+ << problem_size.S << "x"
463
+ << problem_size.C
464
+ << "_padding_"
465
+ << problem_size.pad_h << "x"
466
+ << problem_size.pad_w
467
+ << "_stride_"
468
+ << problem_size.stride_h << "x"
469
+ << problem_size.stride_w
470
+ << "_dilation_"
471
+ << problem_size.dilation_h << "x"
472
+ << problem_size.dilation_w << "_"
473
+ << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
474
+ << Conv2d::ThreadblockShape::kM << "x"
475
+ << Conv2d::ThreadblockShape::kN << "x"
476
+ << Conv2d::ThreadblockShape::kK << "_"
477
+ << Conv2d::WarpShape::kM << "x"
478
+ << Conv2d::WarpShape::kN << "x"
479
+ << Conv2d::WarpShape::kK << ".txt";
480
+
481
+ std::cout << fname.str() << std::endl;
482
+
483
+ std::ofstream results(fname.str());
484
+
485
+ results << problem_size << std::endl;
486
+
487
+ results
488
+ << "\nA:\n" << tensor_A.host_view() << "\n"
489
+ << "\nB:\n" << tensor_B.host_view() << "\n"
490
+ << "\nC:\n" << tensor_C.host_view() << "\n"
491
+ << "\nBroadcast:\n" << tensor_Broadcast.host_view() << "\n"
492
+ << "\nY reference:\n" << tensor_Y_reference.host_view() << "\n"
493
+ << "\nT reference:\n" << tensor_T_reference.host_view() << "\n"
494
+ << "\nT computed:\n" << tensor_T_computed.host_view() << "\n"
495
+ << "\nZ reference:\n" << tensor_Z_reference.host_view() << "\n"
496
+ << "\nZ computed:\n" << tensor_Z_computed.host_view() << "\n";
497
+ }
498
+
499
+ return passed;
500
+ }
501
+ };
502
+
503
+
504
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
505
+
506
+ template <typename ImplicitGemm,
507
+ typename ReferenceOp = Conv2dWithBroadcastReferenceOp<ImplicitGemm>,
508
+ bool AddBroadcastFirst = false>
509
+ bool TestSpecificConv2dWithBroadcast(
510
+ const Conv2dProblemVector & problem_sizes) {
511
+
512
+ bool passed = true;
513
+
514
+ //
515
+ // Testbed object
516
+ //
517
+
518
+ TestbedConv2dWithBroadcast<ImplicitGemm, ReferenceOp, AddBroadcastFirst> testbed;
519
+
520
+ // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
521
+ for(auto conv_problem : problem_sizes) {
522
+
523
+ //
524
+ // Test
525
+ //
526
+
527
+ // test mode = xcross
528
+ passed = testbed.run(
529
+ conv_problem,
530
+ cutlass::conv::SplitKMode::kSerial);
531
+
532
+ if (!passed) {
533
+ return false;
534
+ }
535
+
536
+ // test mode = convolution
537
+ passed = testbed.run(
538
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
539
+ cutlass::conv::SplitKMode::kSerial);
540
+
541
+ if (!passed) {
542
+ return false;
543
+ }
544
+ }
545
+
546
+ return true;
547
+ }
548
+
549
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////
550
+ // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
551
+ // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
552
+ // Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
553
+ // (conv_blacklist_sizes)
554
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
555
+ template <typename ImplicitGemm,
556
+ typename ReferenceOp = Conv2dWithBroadcastReferenceOp<ImplicitGemm>,
557
+ bool AddBroadcastFirst = false,
558
+ bool TestSplitK = true
559
+ >
560
+ bool TestAllConv2dWithBroadcast(
561
+ const Conv2dProblemVector &conv_test_sizes = Conv2dProblemVector(),
562
+ const Conv2dProblemVector &conv_blacklist_sizes = Conv2dProblemVector()) {
563
+
564
+ bool passed = true;
565
+
566
+ //
567
+ // Testbed object
568
+ //
569
+
570
+ TestbedConv2dWithBroadcast<ImplicitGemm, ReferenceOp, AddBroadcastFirst> testbed;
571
+
572
+ //
573
+ // Get conv problem sizes to run conv operator
574
+ //
575
+ TestbedConv2dProblemSizes conv_problems(128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);
576
+
577
+ // Vector of conv2d problem sizes to avoid duplicate runs
578
+ Conv2dProblemVector conv_tested_sizes;
579
+
580
+ Conv2dProblemVector const *problem_vectors[] = {
581
+ &conv_test_sizes, // run user specified sizes
582
+ &conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes
583
+ &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
584
+ #if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
585
+ &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
586
+ #endif
587
+ };
588
+
589
+ // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
590
+ for (Conv2dProblemVector const * problem_vector : problem_vectors) {
591
+
592
+ // Run conv testbed on default convolution sizes
593
+ for(auto conv_problem : *problem_vector) {
594
+
595
+ // Skip blacklist and avoid duplicate problem sizes
596
+ if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
597
+ std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
598
+ continue;
599
+ }
600
+
601
+ //
602
+ // Procedurally disable certain cases
603
+ //
604
+
605
+ // CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
606
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
607
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
608
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
609
+ cutlass::conv::StrideSupport::kUnity)) {
610
+ if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
611
+ continue;
612
+ }
613
+ }
614
+
615
+ #if 0 // relax restrictions on analytic strided dgrad
616
+ // CUTLASS DGRAD's *strided* specialization only support stride >= {2, 2}
617
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
618
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
619
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
620
+ cutlass::conv::StrideSupport::kStrided)) {
621
+ if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
622
+ continue;
623
+ }
624
+ }
625
+ #endif
626
+
627
+ //
628
+ // Test
629
+ //
630
+ // push back tested problem size to avoid re-running duplicates
631
+ conv_tested_sizes.push_back(conv_problem);
632
+
633
+ // test mode = xcross
634
+ passed = testbed.run(
635
+ conv_problem,
636
+ cutlass::conv::SplitKMode::kSerial);
637
+
638
+ if (!passed) {
639
+ return false;
640
+ }
641
+
642
+ // test mode = convolution
643
+ passed = testbed.run(
644
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
645
+ cutlass::conv::SplitKMode::kSerial);
646
+
647
+ if (!passed) {
648
+ return false;
649
+ }
650
+ }
651
+ }
652
+
653
+ // CUTLASS DGRAD's *strided* specialization does not support split-k mode
654
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
655
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
656
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
657
+ cutlass::conv::StrideSupport::kStrided)) {
658
+
659
+ passed = testbed.run(
660
+ cutlass::conv::Conv2dProblemSize(
661
+ {1, 56, 56, 8}, // input size (NHWC)
662
+ {8, 1, 1, 8}, // filter size (KRSC)
663
+ {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
664
+ {2, 2}, // stride (stride_h, stride_w)
665
+ {1, 1}), // dilation (dilation_h, dilation_w)
666
+ cutlass::conv::SplitKMode::kSerial,
667
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0),
668
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0));
669
+
670
+ if (!passed) {
671
+ return false;
672
+ }
673
+
674
+ return passed;
675
+ }
676
+
677
+ if (!TestSplitK)
678
+ return passed;
679
+
680
+ // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
681
+ // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
682
+ // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
683
+ // alpha and beta for local testing, but only runs one value for alpha and beta.
684
+ cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
685
+ {1, 17, 11, 288}, // input size (NHWC)
686
+ {160, 3, 3, 288}, // filter size (KRSC)
687
+ {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
688
+ {1, 1}, // stride (stride_h, stride_w)
689
+ {1, 1} // dilation (dilation_h, dilation_w)
690
+ );
691
+
692
+ cutlass::conv::SplitKMode split_k_modes [] = {
693
+ cutlass::conv::SplitKMode::kSerial
694
+ };
695
+
696
+ int split_k_slices[] = {
697
+ 1, 2, 3, 4, 201
698
+ };
699
+
700
+ double problem_alpha[] = {
701
+ 2.0
702
+ };
703
+
704
+ double problem_beta[] = {
705
+ 2.0
706
+ };
707
+
708
+ for (auto split_k_mode : split_k_modes) {
709
+ for (auto split_k_slice : split_k_slices) {
710
+ for (auto alpha : problem_alpha) {
711
+ for (auto beta : problem_beta) {
712
+
713
+ passed = testbed.run(
714
+ conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
715
+ split_k_mode,
716
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha),
717
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta));
718
+
719
+ if (!passed) {
720
+ return false;
721
+ }
722
+ }
723
+ }
724
+ }
725
+ }
726
+
727
+ return passed;
728
+ }
729
+
730
+ /////////////////////////////////////////////////////////////////////////////////////////////////
731
+
732
+ } // namespace device
733
+ } // namespace conv
734
+ } // namespace test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv2d_with_reduction_testbed.h ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM testbed
33
+ */
34
+ #pragma once
35
+
36
+ #include <fstream>
37
+
38
+ #include "../../common/cutlass_unit_test.h"
39
+ #include "cutlass/cutlass.h"
40
+
41
+ #include "cutlass/conv/device/implicit_gemm_convolution.h"
42
+ #include "cutlass/reduction/device/tensor_reduce.h"
43
+ #include "cutlass/reduction/device/reduce_split_k.h"
44
+ #include "cutlass/reduction/thread/reduction_operators.h"
45
+
46
+ #include "conv2d_problems.h"
47
+
48
+ #include "cutlass/util/host_tensor.h"
49
+ #include "cutlass/util/reference/host/tensor_fill.h"
50
+ #include "cutlass/util/reference/device/tensor_compare.h"
51
+ #include "cutlass/util/reference/host/tensor_compare.h"
52
+
53
+ #include "cutlass/util/reference/host/convolution.h"
54
+ #include "cutlass/util/reference/device/convolution.h"
55
+
56
+ #include "cutlass/core_io.h"
57
+ #include "cutlass/util/tensor_view_io.h"
58
+
59
+ #include "../cache_testbed_output.h"
60
+
61
+ namespace test {
62
+ namespace conv {
63
+ namespace device {
64
+
65
+ template <typename Conv2d>
66
+ class TestbedConv2dWithReduction {
67
+ public:
68
+
69
+ using ElementA = typename Conv2d::ElementA;
70
+ using LayoutA = typename Conv2d::LayoutA;
71
+ using ElementB = typename Conv2d::ElementB;
72
+ using LayoutB = typename Conv2d::LayoutB;
73
+ using ElementC = typename Conv2d::ElementC;
74
+ using LayoutC = typename Conv2d::LayoutC;
75
+ using ElementAccumulator = typename Conv2d::ElementAccumulator;
76
+ using ElementCompute = typename Conv2d::ElementCompute;
77
+ using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp;
78
+ using ElementT = typename EpilogueOutputOp::ElementTensor;
79
+
80
+ static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
81
+
82
+ public:
83
+
84
+ /// Initialization
85
+ cutlass::Distribution::Kind init_A;
86
+ cutlass::Distribution::Kind init_B;
87
+ cutlass::Distribution::Kind init_C;
88
+ uint64_t seed;
89
+
90
+ cutlass::HostTensor<ElementA, LayoutA> tensor_A;
91
+ cutlass::HostTensor<ElementB, LayoutB> tensor_B;
92
+ cutlass::HostTensor<ElementC, LayoutC> tensor_C;
93
+
94
+ cutlass::HostTensor<ElementAccumulator, LayoutC> tensor_Reduction;
95
+ cutlass::HostTensor<ElementT, cutlass::layout::RowMajor> tensor_Tensor;
96
+ cutlass::HostTensor<ElementAccumulator, LayoutC> tensor_Final_Reduction;
97
+
98
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
99
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
100
+
101
+ public:
102
+
103
+ TestbedConv2dWithReduction(
104
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
105
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
106
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
107
+ uint64_t seed_ = 2080
108
+ ):
109
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
110
+
111
+ }
112
+
113
+ /// Helper to initialize a tensor view
114
+ template <typename Element, typename Layout>
115
+ void initialize_tensor(
116
+ cutlass::TensorView<Element, Layout> view,
117
+ cutlass::Distribution::Kind dist_kind,
118
+ uint64_t seed) {
119
+
120
+ if (dist_kind == cutlass::Distribution::Uniform) {
121
+
122
+ int scope = 2;
123
+
124
+ cutlass::reference::host::TensorFillRandomUniform(
125
+ view, seed, scope, -scope, 0);
126
+ }
127
+ else if (dist_kind == cutlass::Distribution::Identity) {
128
+
129
+ cutlass::reference::host::TensorFillIdentity(view);
130
+ }
131
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
132
+
133
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
134
+ }
135
+ else if (dist_kind == cutlass::Distribution::Sequential) {
136
+
137
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
138
+ }
139
+ else {
140
+ }
141
+ }
142
+
143
+ void initialize(
144
+ cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) {
145
+
146
+ tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
147
+ tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
148
+ tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
149
+
150
+ tensor_Reduction.resize({
151
+ 1,
152
+ 1,
153
+ (problem_size.N * problem_size.P * problem_size.Q - 1 + Conv2d::ThreadblockShape::kM) / Conv2d::ThreadblockShape::kM,
154
+ (problem_size.K)
155
+ });
156
+
157
+ tensor_Final_Reduction.resize({
158
+ 1,
159
+ 1,
160
+ 1,
161
+ (problem_size.K)
162
+ });
163
+
164
+ tensor_Tensor.resize({(problem_size.N * problem_size.P * problem_size.Q), problem_size.K});
165
+
166
+ tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
167
+ tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
168
+
169
+ initialize_tensor(tensor_A.host_view(), init_A, seed);
170
+ initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
171
+ initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
172
+
173
+ tensor_A.sync_device();
174
+ tensor_B.sync_device();
175
+ tensor_C.sync_device();
176
+ tensor_D_computed.sync_device();
177
+ tensor_D_reference.sync_device();
178
+ }
179
+
180
+ bool sufficient() const {
181
+ //
182
+ // Determine SMEM requirements and waive if not satisfied
183
+ //
184
+
185
+ size_t smem_size = sizeof(typename Conv2d::UnderlyingKernel::SharedStorage);
186
+
187
+ cudaDeviceProp properties;
188
+ int device_idx;
189
+ cudaError_t result = cudaGetDevice(&device_idx);
190
+
191
+ if (result != cudaSuccess) {
192
+ throw std::runtime_error("cudaGetDevice() API call failed.");
193
+ }
194
+
195
+ result = cudaGetDeviceProperties(&properties, device_idx);
196
+
197
+ if (result != cudaSuccess) {
198
+ throw std::runtime_error("cudaGetDeviceProperties() failed");
199
+ }
200
+
201
+ if (properties.sharedMemPerBlockOptin < smem_size) {
202
+ return false;
203
+ }
204
+
205
+ return true;
206
+ }
207
+
208
/// Executes one test case: runs the device-side implicit GEMM convolution
/// (whose epilogue additionally writes a partial-reduction tensor and an
/// auxiliary tensor), checks tensor D against a reference convolution, and
/// checks the final per-output-channel (K) reduction against a host loop.
///
/// Returns true when results match — or when the test is waived because the
/// device is insufficient or the configuration is unsupported — and false on
/// any mismatch or launch failure.
bool run(
  cutlass::conv::Conv2dProblemSize const &problem_size,
  cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
  ElementCompute alpha = ElementCompute(1),
  ElementCompute beta = ElementCompute(0)) {

  // Waive test if insufficient CUDA device
  if (!sufficient()) {
    if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
      std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
    }
    return true;
  }

#if 0 //display conv2d problem size for debugging
  std::cout << problem_size << std::endl
    << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
    << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
    << std::endl;
#endif

  // Allocate and fill host/device tensors for this problem size.
  initialize(problem_size);

  // configure the operator
  Conv2d conv2d_op;

  typename Conv2d::Arguments conv2d_args(
    problem_size,
    tensor_A.device_ref(),
    tensor_B.device_ref(),
    tensor_C.device_ref(),
    tensor_D_computed.device_ref(),
    {alpha, beta},
    split_k_mode,
    // Epilogue-with-reduction outputs: partial-reduction buffer, auxiliary
    // tensor, and their leading strides.
    tensor_Reduction.device_data(),
    tensor_Tensor.device_data(),
    static_cast<int>(tensor_Reduction.stride()[0]),
    static_cast<int>(tensor_Tensor.stride()[0])
  );

  // find workspace requirement for parallel split-k reduction
  size_t workspace_size = Conv2d::get_workspace_size(conv2d_args);

  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get());

  // A failed initialize() is treated as "configuration unsupported" and the
  // test is waived rather than failed.
  if (status != cutlass::Status::kSuccess) {
    cudaError_t error = cudaGetLastError();
    std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
    return true;
  }

  // conv2d operation with parallel split-k-mode
  if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {

    // conv2d output is written to workspace in global memory
    conv2d_args.ref_D.reset(reinterpret_cast<ElementC*>(workspace.get()));
    // accumulate mma for each cta in k-dimension (1.0 * A * B)
    conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)};
    // update conv2d operator arguments
    status = conv2d_op.update(conv2d_args, workspace.get());
  }

  EXPECT_TRUE(status == cutlass::Status::kSuccess);
  if (status != cutlass::Status::kSuccess) {
    return false;
  }

  // run conv2d operator
  status = conv2d_op();

  EXPECT_TRUE(status == cutlass::Status::kSuccess);
  if (status != cutlass::Status::kSuccess) {
    return false;
  }

  bool passed = false;

  // Ensure the kernel has finished before launching the follow-up reduction.
  cudaError_t result = cudaDeviceSynchronize();
  EXPECT_EQ(result, cudaSuccess) << " device reference error: "
    << cudaGetErrorString(result);

  // Final reduction over the partial reduction tensor
  using Functor = cutlass::plus<ElementAccumulator>;
  using TensorReduction = cutlass::reduction::device::TensorReduction<
    ElementAccumulator,
    ElementAccumulator,
    LayoutC,
    Functor,
    8,
    ElementAccumulator
  >;

  // Reduce along dimension index 2 of the partial-reduction tensor into
  // tensor_Final_Reduction.
  TensorReduction reduction(tensor_Reduction.extent(), 2);

  cutlass::DeviceAllocation<uint8_t> reduction_device_workspace(reduction.workspace_size());

  status = reduction.reduce(
    tensor_Final_Reduction.device_ref(),
    tensor_Reduction.device_ref(),
    reduction_device_workspace.get(),
    ElementAccumulator());

  EXPECT_EQ(status, cutlass::Status::kSuccess);
  EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);

  //
  // Reference check
  //

  tensor_D_computed.sync_host();

#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED

  // Device-side reference convolution writes into tensor_D_reference.
  cutlass::reference::device::Conv2d<
    ElementA,
    LayoutA,
    ElementB,
    LayoutB,
    ElementC,
    LayoutC,
    ElementCompute,
    ElementAccumulator
  >(
    kConvolutionalOperator,
    problem_size,
    tensor_A.device_ref(),
    tensor_B.device_ref(),
    tensor_C.device_ref(),
    tensor_D_reference.device_ref(),
    alpha,
    beta);

  // sync host (copy device data to host) for dumping error output in case of mismatches
  tensor_D_reference.sync_host();

#else

  // Host-side reference convolution (default path).
  cutlass::reference::host::Conv2d<
    ElementA,
    LayoutA,
    ElementB,
    LayoutB,
    ElementC,
    LayoutC,
    ElementCompute,
    ElementAccumulator
  >(
    kConvolutionalOperator,
    problem_size,
    tensor_A.host_ref(),
    tensor_B.host_ref(),
    tensor_C.host_ref(),
    tensor_D_reference.host_ref(),
    alpha,
    beta);

#endif

  passed = cutlass::reference::host::TensorEquals(
    tensor_D_computed.host_view(),
    tensor_D_reference.host_view());

  EXPECT_TRUE(passed);

  //
  // Reference check on reduction results
  //

  tensor_Reduction.sync_host();
  tensor_Final_Reduction.sync_host();

  // compute backwards for reduction results:
  // for each output channel k, sum the reference D over all (n, p, q)
  // positions; this is what the epilogue reduction + final reduction
  // together are expected to produce.
  cutlass::HostTensor<ElementAccumulator, LayoutC> reference_Reduction;
  reference_Reduction.resize({
    1,
    1,
    1,
    (problem_size.K)
  });

  for (int k = 0; k < problem_size.K; ++k) {
    ElementAccumulator reduced_value = ElementAccumulator();
    for (int n = 0; n < problem_size.N; ++n) {
      for (int p = 0; p < problem_size.P; ++p) {
        for (int q = 0; q < problem_size.Q; ++q) {
          reduced_value += tensor_D_reference.at({n, p, q, k});
        }
      }
    }
    reference_Reduction.at({0, 0, 0, k}) = reduced_value;
  }

  passed = cutlass::reference::host::TensorEquals(
    tensor_Final_Reduction.host_view(),
    reference_Reduction.host_view()
  );

  EXPECT_TRUE(passed);

  // On any mismatch, dump all operands and results to a file whose name
  // encodes the full problem configuration for offline debugging.
  if (!passed) {
    std::stringstream fname;

    fname << "error_Conv2d_ImplicitGemm_device_"
      << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
      << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
        (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_"))
      << "nhwc_"
      << problem_size.N << "x"
      << problem_size.H << "x"
      << problem_size.W << "x"
      << problem_size.C
      << "_krsc_"
      << problem_size.K << "x"
      << problem_size.R << "x"
      << problem_size.S << "x"
      << problem_size.C
      << "_padding_"
      << problem_size.pad_h << "x"
      << problem_size.pad_w
      << "_stride_"
      << problem_size.stride_h << "x"
      << problem_size.stride_w
      << "_dilation_"
      << problem_size.dilation_h << "x"
      << problem_size.dilation_w << "_"
      << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
      << Conv2d::ThreadblockShape::kM << "x"
      << Conv2d::ThreadblockShape::kN << "x"
      << Conv2d::ThreadblockShape::kK << "_"
      << Conv2d::WarpShape::kM << "x"
      << Conv2d::WarpShape::kN << "x"
      << Conv2d::WarpShape::kK << ".txt";

    std::cout << fname.str() << std::endl;

    std::ofstream results(fname.str());

    results << problem_size << std::endl;

    results
      << "\nA:\n" << tensor_A.host_view() << "\n"
      << "\nB:\n" << tensor_B.host_view() << "\n"
      << "\nC:\n" << tensor_C.host_view() << "\n"
      << "\nD reference:\n" << tensor_D_reference.host_view() << "\n"
      << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"
      << "\nreduction reference:\n" << reference_Reduction.host_view() << "\n"
      << "\nreduction computed:\n" << tensor_Reduction.host_view() << "\n";
  }

  return passed;
}
462
+ };
463
+
464
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv2dWithReduction: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares
// it with a reference implementation.
// TestAllConv2dWithReduction runs the conv operator on default conv problem sizes from
// test::conv::device::TestbedConv2dProblemSizes. Additionally, each conv2d test can provide conv problem
// sizes (conv_test_sizes) and a blacklist of sizes (conv_blacklist_sizes).
// Returns true if every executed problem size passes; stops at the first failure.
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm>
bool TestAllConv2dWithReduction(
  const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(),
  const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) {

  bool passed = true;

  //
  // Testbed object
  //

  TestbedConv2dWithReduction<ImplicitGemm> testbed;

  //
  // Get conv problem sizes to run conv operator
  //
  // The constructor argument is the number of ElementA values per 128 bits —
  // presumably the minimum channel size for generated problems; confirm
  // against TestbedConv2dProblemSizes.
  TestbedConv2dProblemSizes conv_problems(128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);

  // Vector of conv2d problem sizes to avoid duplicate runs
  Conv2dProblemVector conv_tested_sizes;

  Conv2dProblemVector const *problem_vectors[] = {
    &conv_test_sizes,                     // run user specified sizes
    &conv_problems.conv2d_default_sizes,  // run default and cudnn bug sizes
    &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes
#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED
    &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled
#endif
  };

  // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
  for (Conv2dProblemVector const * problem_vector : problem_vectors) {

    // Run conv testbed on default convolution sizes
    for(auto conv_problem : *problem_vector) {

      // Skip blacklist and avoid duplicate problem sizes
      if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
          std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
        continue;
      }

      //
      // Procedurally disable certain cases
      //

      // CUTLASS DGRAD's *unity* stride specialization only supports stride {1, 1}
      if ((ImplicitGemm::kConvolutionalOperator ==
            cutlass::conv::Operator::kDgrad) &&
          (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
            cutlass::conv::StrideSupport::kUnity)) {
        if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
          continue;
        }
      }

#if 0 // relax restrictions on analytic strided dgrad
      // CUTLASS DGRAD's *strided* specialization only supports stride >= {2, 2}
      if ((ImplicitGemm::kConvolutionalOperator ==
            cutlass::conv::Operator::kDgrad) &&
          (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
            cutlass::conv::StrideSupport::kStrided)) {
        if (((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
          continue;
        }
      }
#endif

      //
      // Test
      //
      // push back tested problem size to avoid re-running duplicates
      conv_tested_sizes.push_back(conv_problem);

      // test mode = xcross (cross-correlation)
      passed = testbed.run(
        conv_problem,
        cutlass::conv::SplitKMode::kSerial);

      if (!passed) {
        return false;
      }

      // test mode = convolution
      passed = testbed.run(
        conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
        cutlass::conv::SplitKMode::kSerial);

      if (!passed) {
        return false;
      }
    }
  }

  // CUTLASS DGRAD's *strided* specialization does not support split-k mode:
  // run one fixed strided problem with non-trivial alpha/beta and return
  // early, skipping the split-k sweep below.
  if ((ImplicitGemm::kConvolutionalOperator ==
        cutlass::conv::Operator::kDgrad) &&
      (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
        cutlass::conv::StrideSupport::kStrided)) {

    passed = testbed.run(
      cutlass::conv::Conv2dProblemSize(
        {1, 56, 56, 8},   // input size (NHWC)
        {8, 1, 1, 8},     // filter size (KRSC)
        {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
        {2, 2},           // stride (stride_h, stride_w)
        {1, 1}),          // dilation (dilation_h, dilation_w)
      cutlass::conv::SplitKMode::kSerial,
      cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0),
      cutlass::from_real<typename ImplicitGemm::ElementCompute>(2.0));

    if (!passed) {
      return false;
    }

    return passed;
  }

  // Sweep split-k-slice using serial and parallel reduction with non-unity alpha and non-zero beta for
  // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
  // which are absolutely necessary to catch functional bugs. The below code does provide the option to
  // sweep alpha and beta for local testing, but only runs one value for alpha and beta.
  cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
    {1, 17, 11, 288},   // input size (NHWC)
    {160, 3, 3, 288},   // filter size (KRSC)
    {1, 1, 1, 1},       // padding (pad_h, _, pad_w, _)
    {1, 1},             // stride (stride_h, stride_w)
    {1, 1}              // dilation (dilation_h, dilation_w)
  );

  // Parallel SplitK is not tested.
  cutlass::conv::SplitKMode split_k_modes [] = {
    cutlass::conv::SplitKMode::kSerial,
  };

  // 201 deliberately exceeds typical k-iteration counts to exercise the
  // degenerate many-slice case.
  int split_k_slices[] = {
    1, 2, 3, 4, 201
  };

  double problem_alpha[] = {
    2.0
  };

  double problem_beta[] = {
    2.0
  };

  for (auto split_k_mode : split_k_modes) {
    for (auto split_k_slice : split_k_slices) {
      for (auto alpha : problem_alpha) {
        for (auto beta : problem_beta) {

          passed = testbed.run(
            conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
            split_k_mode,
            cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha),
            cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta));

          if (!passed) {
            return false;
          }
        }
      }
    }
  }

  return passed;
}
638
+
639
+ /////////////////////////////////////////////////////////////////////////////////////////////////
640
+
641
+ } // namespace device
642
+ } // namespace conv
643
+ } // namespace test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv3d_problems.h ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM testbed sizes for Conv2d problem
33
+ */
34
+ #pragma once
35
+
36
+ #include "../../common/cutlass_unit_test.h"
37
+
38
+ #include "cutlass/cutlass.h"
39
+
40
+ #include "cutlass/aligned_buffer.h"
41
+ #include "cutlass/numeric_types.h"
42
+ #include "cutlass/layout/matrix.h"
43
+ #include "cutlass/layout/tensor.h"
44
+ #include "cutlass/layout/pitch_linear.h"
45
+ #include "cutlass/core_io.h"
46
+ #include "cutlass/util/host_tensor.h"
47
+ #include "cutlass/util/tensor_view_io.h"
48
+ #include "cutlass/conv/convolution.h"
49
+ #include "cutlass/conv/conv2d_problem_size.h"
50
+ #include "cutlass/conv/conv3d_problem_size.h"
51
+
52
+ namespace test {
53
+ namespace conv {
54
+ namespace device {
55
+
56
+ using Conv3dProblemVector = std::vector<cutlass::conv::Conv3dProblemSize>;
57
+
58
////////////////////////////////////////////////////////////////////////////
/// Structure TestbedConv3dProblemSizes initializes and holds conv default and
/// important network sizes for Conv3d unit tests. Construction populates the
/// problem-size vectors, then filter_all() removes sizes whose channel count
/// is not a multiple of minimum_channel_size.
////////////////////////////////////////////////////////////////////////////
struct TestbedConv3dProblemSizes {

  //
  // Data members
  //
  // Channel-count granularity: problems whose C is not a multiple of this
  // value are filtered out (typically driven by vectorized-access alignment).
  int minimum_channel_size;
  Conv3dProblemVector conv3d_default_sizes;      // small hand-picked coverage sizes
  Conv3dProblemVector conv3d_vnet_medical_sizes; // layer sizes from the V-Net medical-imaging model

  //
  // Methods
  //
  /// Default ctor: populates both problem vectors and filters illegal sizes.
  TestbedConv3dProblemSizes(int minimum_channel_size_ = 64): minimum_channel_size (minimum_channel_size_) {

    initialize_conv3d_default_sizes();
    initialize_conv3d_vnet_medical_sizes(conv3d_vnet_medical_sizes, 1 /*batch-size*/);

    filter_all();
  }

  /// Eliminates some illegal cases: rebuilds each vector keeping only
  /// problems whose channel count C is a multiple of minimum_channel_size.
  void filter_all() {

    Conv3dProblemVector *problems_vectors[] = {
      &conv3d_default_sizes,
      &conv3d_vnet_medical_sizes
    };

    for (Conv3dProblemVector *problems : problems_vectors) {
      Conv3dProblemVector filtered;

      for (cutlass::conv::Conv3dProblemSize const & problem : *problems) {
        // (problem.C % minimum_channel_size) == 0  =>  keep
        if (!(problem.C % minimum_channel_size)) {
          filtered.push_back(problem);
        }
      }

      *problems = filtered;
    }
  }

  // Add a few standard convolution problem sizes.
  // Note: two entries use the near/far padding tuple overload to exercise
  // asymmetric padding; the rest use symmetric padding.
  void initialize_conv3d_default_sizes() {

    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 1, 3, 3, minimum_channel_size},   // input size  (NDHWC)
      {8, 1, 1, 1, minimum_channel_size},   // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 1, 1, 8, minimum_channel_size},   // input size  (NDHWC)
      {8, 1, 1, 3, minimum_channel_size},   // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

    // Asymmetric padding variant of the previous size.
    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 1, 1, 8, minimum_channel_size},   // input size  (NDHWC)
      {8, 1, 1, 3, minimum_channel_size},   // filter size (KTRSC)
      CUTLASS_STL_NAMESPACE::make_tuple(
        cutlass::Coord<3>({1, 1, 1}),       // near padding (pad_d, pad_h, pad_w)
        cutlass::Coord<3>({0, 0, 0})        // far padding  (pad_d, pad_h, pad_w)
      ),
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 8, 8, 8, minimum_channel_size},   // input size  (NDHWC)
      {8, 3, 3, 3, minimum_channel_size},   // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

    // Asymmetric padding variant of the previous size.
    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 8, 8, 8, minimum_channel_size},   // input size  (NDHWC)
      {8, 3, 3, 3, minimum_channel_size},   // filter size (KTRSC)
      CUTLASS_STL_NAMESPACE::make_tuple(
        cutlass::Coord<3>({1, 1, 1}),       // near padding (pad_d, pad_h, pad_w)
        cutlass::Coord<3>({0, 0, 0})        // far padding  (pad_d, pad_h, pad_w)
      ),
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 16, 16, 16, minimum_channel_size}, // input size  (NDHWC)
      {8, 3, 3, 3, minimum_channel_size},    // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),          // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),          // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})           // dilation (dilation_d, dilation_h, dilation_w)
    ));

    // Fixed-channel size (C=160) — subject to filter_all() divisibility.
    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 1, 15, 19, 160},                  // input size  (NDHWC)
      {224, 1, 3, 6, 160},                  // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 2, 1, 1, minimum_channel_size},   // input size  (NDHWC)
      {8, 2, 1, 1, minimum_channel_size},   // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 1, 7, 7, minimum_channel_size},   // input size  (NDHWC)
      {16, 1, 3, 3, minimum_channel_size},  // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize(
      {1, 11, 15, 19, 64},                  // input size  (NDHWC)
      {32, 4, 3, 6, 64},                    // filter size (KTRSC)
      cutlass::Coord<3>({2, 1, 3}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));
  }

  // Add vnet layers to unit testing sizes: appends V-Net-style layer shapes
  // to conv3d_problem_vector, parameterized by batch size.
  void initialize_conv3d_vnet_medical_sizes(Conv3dProblemVector &conv3d_problem_vector, int batch_size = 1) {

    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 32, 32, 32, 16},         // input size  (NDHWC)
      {32, 2, 2, 2, 16},                    // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({2, 2, 2}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 16, 16, 16, 32},         // input size  (NDHWC)
      {32, 3, 3, 3, 32},                    // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 16, 16, 16, 32},         // input size  (NDHWC)
      {64, 2, 2, 2, 32},                    // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({2, 2, 2}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 8, 8, 8, 64},            // input size  (NDHWC)
      {64, 3, 3, 3, 64},                    // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 8, 8, 8, 64},            // input size  (NDHWC)
      {128, 2, 2, 2, 64},                   // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({2, 2, 2}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 4, 4, 4, 128},           // input size  (NDHWC)
      {128, 3, 3, 3, 128},                  // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 8, 8, 8, 128},           // input size  (NDHWC)
      {128, 3, 3, 3, 128},                  // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 16, 16, 16, 64},         // input size  (NDHWC)
      {64, 3, 3, 3, 64},                    // filter size (KTRSC)
      cutlass::Coord<3>({1, 1, 1}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({1, 1, 1}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 32, 32, 32, 16},         // input size  (NDHWC)
      {64, 2, 2, 2, 16},                    // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({2, 2, 2}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));


    conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize(
      {batch_size, 16, 16, 16, 32},         // input size  (NDHWC)
      {128, 2, 2, 2, 32},                   // filter size (KTRSC)
      cutlass::Coord<3>({0, 0, 0}),         // padding (pad_d, pad_h, pad_w)
      cutlass::Coord<3>({2, 2, 2}),         // stride (stride_d, stride_h, stride_w)
      cutlass::Coord<3>({1, 1, 1})          // dilation (dilation_d, dilation_h, dilation_w)
    ));

  }

};
290
+
291
+ } // namespace device
292
+ } // namespace conv
293
+ } // namespace test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv3d_testbed.h ADDED
@@ -0,0 +1,716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM testbed
33
+ */
34
+ #pragma once
35
+
36
+ #include <fstream>
37
+
38
+ #include "../../common/cutlass_unit_test.h"
39
+ #include "cutlass/cutlass.h"
40
+
41
+
42
+ #include "cutlass/conv/device/implicit_gemm_convolution.h"
43
+ #include "cutlass/reduction/device/reduce_split_k.h"
44
+ #include "cutlass/reduction/thread/reduction_operators.h"
45
+
46
+ #include "cutlass/util/reference/host/tensor_fill.h"
47
+
48
+ #include "cutlass/util/reference/host/convolution.h"
49
+
50
+ #include "cutlass/util/reference/host/tensor_compare.h"
51
+
52
+ #include "cutlass/util/reference/device/convolution.h"
53
+ #include "cutlass/util/reference/device/tensor_compare.h"
54
+
55
+ #include "conv3d_problems.h"
56
+ #include "cutlass/core_io.h"
57
+
58
+ #include "../cache_testbed_output.h"
59
+
60
+ namespace test {
61
+ namespace conv {
62
+ namespace device {
63
+
64
/// Device-side testbed for a CUTLASS 3-D implicit-GEMM convolution operator.
///
/// Allocates and randomly initializes host/device tensors, runs the Conv3d
/// device operator (optionally with serial or parallel split-k reduction),
/// and verifies the computed output against a host or device reference
/// implementation. Reference results may be cached on disk as hashes to
/// speed up repeated test runs.
template <typename Conv3d>
class TestbedConv3d {
public:

  using ElementA = typename Conv3d::ElementA;
  using LayoutA = typename Conv3d::LayoutA;
  using ElementB = typename Conv3d::ElementB;
  using LayoutB = typename Conv3d::LayoutB;
  using ElementC = typename Conv3d::ElementC;
  using LayoutC = typename Conv3d::LayoutC;
  using ElementAccumulator = typename Conv3d::ElementAccumulator;
  using ElementCompute = typename Conv3d::ElementCompute;
  using EpilogueOutputOp = typename Conv3d::EpilogueOutputOp;

  static cutlass::conv::Operator const kConvolutionalOperator = Conv3d::kConvolutionalOperator;

  /// Reduction kernel used for the parallel split-k path: sums partial
  /// accumulators produced by each k-slice of the implicit GEMM.
  using ReductionOp = cutlass::reduction::thread::ReduceAdd<
    ElementAccumulator,
    typename EpilogueOutputOp::ElementAccumulator,
    EpilogueOutputOp::kCount
  >;

  using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK<
    cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>,
    EpilogueOutputOp,
    ReductionOp
  >;

  using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel>;
  using ReductionStrideIndex = typename ReductionDevice::StrideIndex;

public:

  /// Initialization distributions for each operand and the RNG seed.
  cutlass::Distribution::Kind init_A;
  cutlass::Distribution::Kind init_B;
  cutlass::Distribution::Kind init_C;
  uint64_t seed;

  cutlass::HostTensor<ElementA, LayoutA> tensor_A;
  cutlass::HostTensor<ElementB, LayoutB> tensor_B;
  cutlass::HostTensor<ElementC, LayoutC> tensor_C;
  cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;   // output of the device operator
  cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;  // output of the reference implementation

public:

  /// Constructs the testbed; tensors are sized lazily in initialize().
  TestbedConv3d(
    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
    uint64_t seed_ = 2080
  ):
    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {

  }

  /// Helper to initialize a tensor view according to the requested
  /// distribution kind. For Uniform fills, the value range is narrowed for
  /// low-precision element types to reduce accumulation error.
  template <typename Element, typename Layout>
  void initialize_tensor(
    cutlass::TensorView<Element, Layout> view,
    cutlass::Distribution::Kind dist_kind,
    uint64_t seed) {

    if (dist_kind == cutlass::Distribution::Uniform) {

      int scope;
      int bits = cutlass::sizeof_bits<Element>::value;

      // Smaller element types get a tighter uniform range.
      if (bits <= 8) {
        scope = 2;
      }
      else if (bits == 16) {
        scope = 4;
      }
      else {
        scope = 8;
      }
      cutlass::reference::host::TensorFillRandomUniform(
        view, seed, scope, -scope, 0);
    }
    else if (dist_kind == cutlass::Distribution::Identity) {

      cutlass::reference::host::TensorFillIdentity(view);
    }
    else if (dist_kind == cutlass::Distribution::Gaussian) {

      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
    }
    else if (dist_kind == cutlass::Distribution::Sequential) {

      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
    }
    else {
      // Unrecognized distribution kinds leave the tensor untouched.
    }
  }

  /// Resizes all tensors for the given problem size, fills A/B/C per the
  /// configured distributions (with distinct derived seeds), and copies
  /// host data to the device.
  void initialize(
    cutlass::conv::Conv3dProblemSize const &problem_size, uint64_t seed = 2019) {

    tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
    tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
    tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
    tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
    tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));

    initialize_tensor(tensor_A.host_view(), init_A, seed);
    initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
    initialize_tensor(tensor_C.host_view(), init_C, seed * 39);

    tensor_A.sync_device();
    tensor_B.sync_device();
    tensor_C.sync_device();
    tensor_D_computed.sync_device();
    tensor_D_reference.sync_device();
  }

  /// Returns true if the active CUDA device has enough opt-in shared memory
  /// for the kernel's SharedStorage; used to waive tests on small devices.
  bool sufficient() const {
    //
    // Determine SMEM requirements and waive if not satisfied
    //

    size_t smem_size = sizeof(typename Conv3d::UnderlyingKernel::SharedStorage);

    cudaDeviceProp properties;
    int device_idx;
    cudaError_t result = cudaGetDevice(&device_idx);

    if (result != cudaSuccess) {
      throw std::runtime_error("cudaGetDevice() API call failed.");
    }

    result = cudaGetDeviceProperties(&properties, device_idx);

    if (result != cudaSuccess) {
      throw std::runtime_error("cudaGetDeviceProperties() failed");
    }

    if (properties.sharedMemPerBlockOptin < smem_size) {
      return false;
    }

    return true;
  }


  /// Executes one test: runs the device operator for the given problem size
  /// and split-k configuration, then compares against the reference (or a
  /// cached hash of the reference). Returns true on pass or waive; on
  /// failure, dumps tensors to an error file and returns false.
  bool run(
    cutlass::conv::Conv3dProblemSize const &problem_size,
    cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
    ElementCompute alpha = ElementCompute(1),
    ElementCompute beta = ElementCompute()) {


    // Waive test if insufficient CUDA device
    if (!sufficient()) {
      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
      }
      return true;
    }

#if 0 // display conv3d problem size for debugging
    std::cout << problem_size << std::endl
              << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl
              << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
              << std::endl;
#endif

    initialize(problem_size);

    // configure the operator
    Conv3d conv3d_op;

    typename Conv3d::Arguments conv3d_args(
      problem_size,
      tensor_A.device_ref(),
      tensor_B.device_ref(),
      tensor_C.device_ref(),
      tensor_D_computed.device_ref(),
      {alpha, beta},
      split_k_mode
    );

    cutlass::Status status = conv3d_op.can_implement(conv3d_args);
    if (status != cutlass::Status::kSuccess) {
      std::cerr << "can_implement failed for the given problem_size: \n";
      return false;
    }

    // find workspace requirement for parallel split-k reduction
    size_t workspace_size = Conv3d::get_workspace_size(conv3d_args);

    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

    status = conv3d_op.initialize(conv3d_args, workspace.get());

    if (status != cutlass::Status::kSuccess) {
      cudaError_t error = cudaGetLastError();
      std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
      return true;
    }

    // conv3d operation with parallel split-k-mode
    if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {

      // conv3d output is written to workspace in global memory
      conv3d_args.ref_D.reset(reinterpret_cast<ElementAccumulator*>(workspace.get()));
      // accumulate mma for each cta in k-dimension (1.0 * A * B)
      conv3d_args.output_op = {1.0, 0.0};
      // update conv3d operator arguments
      status = conv3d_op.update(conv3d_args, workspace.get());
    }

    EXPECT_TRUE(status == cutlass::Status::kSuccess);
    if (status != cutlass::Status::kSuccess) {
      return false;
    }

    // run conv3d operator
    status = conv3d_op();

    EXPECT_TRUE(status == cutlass::Status::kSuccess);
    if (status != cutlass::Status::kSuccess) {
      return false;
    }

    if (split_k_mode == cutlass::conv::SplitKMode::kParallel) {

      // configure parallel reduction operator
      ReductionDevice reduction_op;

      typename ReductionDevice::Arguments reduction_args(
        cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(),
        problem_size.split_k_slices,
        cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
        {
          reinterpret_cast<ElementAccumulator*> (workspace.get()),
          ReductionStrideIndex(tensor_C.stride()[Conv3d::UnderlyingKernel::kTensorCStrideIdx])
        },
        {
          tensor_D_computed.device_data(),
          ReductionStrideIndex(tensor_C.stride()[Conv3d::UnderlyingKernel::kTensorCStrideIdx])
        },
        {
          tensor_C.device_data(),
          ReductionStrideIndex(tensor_C.stride()[Conv3d::UnderlyingKernel::kTensorCStrideIdx])
        },
        // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C
        {alpha, beta}
      );

      status = reduction_op.initialize(reduction_args, nullptr);

      EXPECT_TRUE(status == cutlass::Status::kSuccess);
      if (status != cutlass::Status::kSuccess) {
        return false;
      }

      // run parallel reduction kernel
      status = reduction_op();

      EXPECT_TRUE(status == cutlass::Status::kSuccess);
      if (status != cutlass::Status::kSuccess) {
        return false;
      }
    }
    bool passed = false;

    cudaError_t result = cudaDeviceSynchronize();
    EXPECT_EQ(result, cudaSuccess) << " device reference error: "
                                   << cudaGetErrorString(result);

    tensor_D_computed.sync_host();

    //
    // Reference check - support caching results
    //

    CachedTestKey cached_test_key = CreateCachedConv3dTestKey<
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      ElementAccumulator,
      ElementCompute
    >(
      kConvolutionalOperator,
      problem_size,
      alpha,
      beta,
      tensor_A.host_view(),
      tensor_B.host_view(),
      tensor_C.host_view()
    );

    //
    // Look for the cached key
    //

    bool cached_result_loaded = false;
    CachedTestResult cached_test_result;

    std::string conv3d_result_cache_name =
      std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";

    if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {

      CachedTestResultListing cached_results(conv3d_result_cache_name);

      auto cached = cached_results.find(cached_test_key);

      cached_result_loaded = cached.first;
      if (cached_result_loaded) {
        cached_test_result = cached.second;
      }
    }

    // Compute the reference result only when no cached hash is available.
    if (!cached_result_loaded) {

#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED

      cutlass::reference::device::Conv3d<
        ElementA,
        LayoutA,
        ElementB,
        LayoutB,
        ElementC,
        LayoutC,
        ElementAccumulator,
        ElementCompute
      >(
        kConvolutionalOperator,
        problem_size,
        tensor_A.device_ref(),
        tensor_B.device_ref(),
        tensor_C.device_ref(),
        tensor_D_reference.device_ref(),
        alpha,
        beta
      );

      // sync host (copy device data to host) for dumping error output in case of mismatches
      tensor_D_reference.sync_host();

#else
      cutlass::reference::host::Conv3d<
        ElementA,
        LayoutA,
        ElementB,
        LayoutB,
        ElementC,
        LayoutC,
        ElementAccumulator,
        ElementCompute
      >(
        kConvolutionalOperator,
        problem_size,
        tensor_A.host_ref(),
        tensor_B.host_ref(),
        tensor_C.host_ref(),
        tensor_D_reference.host_ref(),
        alpha,
        beta
      );
#endif

      if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {

        cached_test_result.D = TensorHash(tensor_D_reference.host_view());

        CachedTestResultListing cached_results(conv3d_result_cache_name);

        cached_results.append(cached_test_key, cached_test_result);
        cached_results.write(conv3d_result_cache_name);
      }
    } // if (!cached_result_loaded)

    // Compare either by hash (cached path) or elementwise (direct path).
    uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view());
    if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
      passed = (tensor_D_hash == cached_test_result.D);

      EXPECT_EQ(tensor_D_hash, cached_test_result.D)
        << "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n";
    }
    else {

      passed = cutlass::reference::host::TensorEquals(
        tensor_D_computed.host_view(),
        tensor_D_reference.host_view());
    }

    EXPECT_TRUE(passed);

    // On mismatch, dump the full problem configuration and tensor contents
    // to a descriptively named error file for offline debugging.
    if (!passed) {
      std::stringstream fname;

      fname << "error_Conv3d_ImplicitGemm_device_"
            << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
            << (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
                (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" :
                 (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kDeconv ? "deconv_" : "wgrad_")))
            << "ndhwc_"
            << problem_size.N << "x"
            << problem_size.D << "x"
            << problem_size.H << "x"
            << problem_size.W << "x"
            << problem_size.C
            << "_ktrsc_"
            << problem_size.K << "x"
            << problem_size.T << "x"
            << problem_size.R << "x"
            << problem_size.S << "x"
            << problem_size.C
            << "_padding_"
            << problem_size.pad_d << "x"
            << problem_size.pad_h << "x"
            << problem_size.pad_w
            << "_stride_"
            << problem_size.stride_d << "x"
            << problem_size.stride_h << "x"
            << problem_size.stride_w
            << "_dilation_"
            << problem_size.dilation_d << "x"
            << problem_size.dilation_h << "x"
            << problem_size.dilation_w << "_"
            << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
            << Conv3d::ThreadblockShape::kM << "x"
            << Conv3d::ThreadblockShape::kN << "x"
            << Conv3d::ThreadblockShape::kK << "_"
            << Conv3d::WarpShape::kM << "x"
            << Conv3d::WarpShape::kN << "x"
            << Conv3d::WarpShape::kK << ".txt";

      std::cout << fname.str() << std::endl;

      std::ofstream results(fname.str());

      results << problem_size << std::endl;

      results
        << "\nA:\n" << tensor_A.host_view() << "\n"
        << "\nB:\n" << tensor_B.host_view() << "\n"
        << "\nC:\n" << tensor_C.host_view() << "\n";


      results << "\nD reference (hash: " << cached_test_result.D << ")\n";

      if (!cached_result_loaded) {
        results
          << tensor_D_reference.host_view() << "\n";
      }

      results
        << "\nD computed (hash: " << tensor_D_hash << ")\n"
        << tensor_D_computed.host_view() << "\n";

    }

    return passed;
  }

};
527
+
528
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////
529
+ // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
530
+ // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
531
+ // Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
532
+ // (conv_blacklist_sizes)
533
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
534
+
535
/// Runs the ImplicitGemm conv3d operator across the default problem-size
/// sweep plus any caller-provided sizes, in both cross-correlation and
/// convolution modes, then sweeps split-k configurations with non-trivial
/// alpha/beta on a single fixed problem size. Returns false on the first
/// failing configuration.
template <typename ImplicitGemm>
bool TestAllConv3d(
  const Conv3dProblemVector & conv_test_sizes = Conv3dProblemVector(),
  const Conv3dProblemVector & conv_blacklist_sizes = Conv3dProblemVector()) {

  bool passed = true;

  //
  // Testbed object
  //

  //TestbedConv3d<ImplicitGemm> testbed(cutlass::Distribution::Sequential, cutlass::Distribution::Sequential, cutlass::Distribution::Sequential);
  TestbedConv3d<ImplicitGemm> testbed;

  //
  // Get conv problem sizes to run conv operator
  //
  // The minimum channel count is scaled by element width (128 bits' worth).
  TestbedConv3dProblemSizes conv3d_problems(128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);

  // Vector of conv3d problem sizes to avoid duplicate runs
  Conv3dProblemVector conv_tested_sizes;

  Conv3dProblemVector const *problem_vectors[] = {
    &conv3d_problems.conv3d_default_sizes,
    &conv3d_problems.conv3d_vnet_medical_sizes,
    &conv_test_sizes
  };

  // Sweep conv3d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
  for (Conv3dProblemVector const * problem_vector : problem_vectors) {

    // Run conv testbed on default convolution sizes
    for(auto conv_problem : *problem_vector) {

      // Skip blacklist and avoid duplicate problem sizes
      if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
          std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
        continue;
      }

      //
      // Procedurally disable certain cases
      //

      // CUTLASS DGRAD's unity stride specialization only support stride {1, 1, 1}
      if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
           ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
          ((ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
            cutlass::conv::StrideSupport::kUnity) ||
           (ImplicitGemm::UnderlyingKernel::Mma::IteratorB::kStrideSupport ==
            cutlass::conv::StrideSupport::kUnity))) {
        if (!((conv_problem.stride_d == 1) &&
              (conv_problem.stride_h == 1) &&
              (conv_problem.stride_w == 1))
            ) {
          continue;
        }
      }

      //
      // Test
      //
      // push back tested problem size to avoid re-running duplicates
      conv_tested_sizes.push_back(conv_problem);

      // test mode = xcross
      passed = testbed.run(
        conv_problem,
        cutlass::conv::SplitKMode::kSerial);

      if (!passed) {
        return false;
      }

      // test mode = convolution
      passed = testbed.run(
        conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
        cutlass::conv::SplitKMode::kSerial);

      if (!passed) {
        return false;
      }
    }
  }

  // Sweep split-k-slice using serial reduction with non-unity alpha and non-zero beta for
  // a single conv3d problem size. Convolution unit tests take a long time to run so only sweep parameters
  // which are absolutely necessary to catch functional bugs. The below code does provide option to sweep
  // alpha and beta for local testing, but only runs one value for alpha and beta.
  cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size (
    {1, 8, 8, 8, 32},               // input size  (NDHWC)
    {32, 3, 3, 3, 32},              // filter size (KTRSC)
    cutlass::Coord<3>({0, 0, 0}),   // padding (pad_d, pad_h, pad_w)
    cutlass::Coord<3>({1, 1, 1}),   // stride (stride_d, stride_h, stride_w)
    cutlass::Coord<3>({1, 1, 1})    // dilation (dilation_d, dilation_h, dilation_w)
  );

  cutlass::conv::SplitKMode split_k_modes [] = {
    cutlass::conv::SplitKMode::kSerial,
    cutlass::conv::SplitKMode::kParallel
  };

  // 201 slices exercises an extreme (more slices than k-iterations) case.
  int split_k_slices[] = {
    1, 2, 3, 4, 201
  };

  double problem_alpha[] = {
    2.0
  };

  double problem_beta[] = {
    2.0
  };

  for (auto split_k_mode : split_k_modes) {
    for (auto split_k_slice : split_k_slices) {
      for (auto alpha : problem_alpha) {
        for (auto beta : problem_beta) {

          passed = testbed.run(
            conv3d_split_k_test_size.reset_split_k_slices(split_k_slice),
            split_k_mode,
            cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha),
            cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta));

          if (!passed) {
            return false;
          }
        }
      }
    }
  }

  return passed;
}
670
+
671
+ template <typename ImplicitGemm>
672
+ bool TestSpecificConv3d(
673
+ const Conv3dProblemVector & problem_sizes) {
674
+
675
+ bool passed = true;
676
+
677
+ //
678
+ // Testbed object
679
+ //
680
+
681
+ TestbedConv3d<ImplicitGemm> testbed;
682
+
683
+ // Sweep conv3d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
684
+ for(auto conv_problem : problem_sizes) {
685
+
686
+ //
687
+ // Test
688
+ //
689
+
690
+ // test mode = xcross
691
+ passed = testbed.run(
692
+ conv_problem,
693
+ cutlass::conv::SplitKMode::kSerial);
694
+
695
+ if (!passed) {
696
+ return false;
697
+ }
698
+
699
+ // test mode = convolution
700
+ passed = testbed.run(
701
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
702
+ cutlass::conv::SplitKMode::kSerial);
703
+
704
+ if (!passed) {
705
+ return false;
706
+ }
707
+ }
708
+
709
+ return true;
710
+ }
711
+
712
+ /////////////////////////////////////////////////////////////////////////////////////////////////
713
+
714
+ } // namespace device
715
+ } // namespace conv
716
+ } // namespace test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/conv3d_with_broadcast_testbed.h ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM for fused epilogue broadcast testbed
33
+
34
+ Parallel split-k is not tested because we can just use regular conv kernel
35
+ when we need to use parallel-splitk. Broadcast can happen in the reduction
36
+ kernel.
37
+ */
38
+ #pragma once
39
+
40
+ #include <fstream>
41
+
42
+ #include "../../common/cutlass_unit_test.h"
43
+ #include "cutlass/cutlass.h"
44
+
45
+ #include "cutlass/conv/device/implicit_gemm_convolution.h"
46
+ #include "cutlass/reduction/device/reduce_split_k.h"
47
+ #include "cutlass/reduction/thread/reduction_operators.h"
48
+
49
+ #include "conv3d_problems.h"
50
+
51
+ #include "cutlass/util/host_tensor.h"
52
+ #include "cutlass/util/reference/host/tensor_fill.h"
53
+ #include "cutlass/util/reference/device/tensor_compare.h"
54
+ #include "cutlass/util/reference/host/tensor_compare.h"
55
+
56
+ #include "cutlass/util/reference/host/convolution.h"
57
+ #include "cutlass/util/reference/device/convolution.h"
58
+
59
+ #include "cutlass/core_io.h"
60
+ #include "cutlass/util/tensor_view_io.h"
61
+
62
+ #include "../cache_testbed_output.h"
63
+
64
+ namespace test {
65
+ namespace conv {
66
+ namespace device {
67
+
68
+ /////////////////////////////////////////////////////////////////////////////////////////////////
69
+
70
/// Host reference for the fused broadcast epilogue: applies the epilogue's
/// binary op to the convolution result and a per-channel bias, then the
/// elementwise op, producing T and Z in the epilogue's output types.
template <typename Conv3d>
struct Conv3dWithBroadcastReferenceOp {

  using OutputOp = typename Conv3d::EpilogueOutputOp;

  using ElementCompute = typename OutputOp::ElementCompute;
  using ElementZ = typename OutputOp::ElementZ;
  using ElementT = typename OutputOp::ElementT;

  // Functors mirroring the two stages of the fused epilogue.
  typename OutputOp::BinaryOp binary_op;
  typename OutputOp::ElementwiseOp elementwise_op;

  Conv3dWithBroadcastReferenceOp() { }

  /// Computes T = BinaryOp(conv3d, bias) and Z = ElementwiseOp(T), carrying
  /// out both stages in ElementCompute precision before conversion.
  void operator()(ElementZ &Z, ElementT &T, ElementCompute conv3d, ElementCompute bias) {
    ElementCompute broadcast_result = binary_op(conv3d, bias);
    ElementCompute activated = elementwise_op(broadcast_result);

    T = ElementT(broadcast_result);
    Z = ElementZ(activated);
  }
};
92
+
93
+ /////////////////////////////////////////////////////////////////////////////////////////////////
94
+
95
+ // Fused testbed
96
+ //
97
+ // Y = CONV(AB, C)
98
+ //
99
+ // T[n, o, p, q, k] = ReductionOp(Y[n, o, p, q, k], Broadcast[k])
100
+ //
101
+ // Z[n, o, p, q, k] = Elementwise(T[n, o, p, q, k])
102
+ //
103
+
104
+ template <
105
+ typename Conv3d,
106
+ typename ReferenceOp,
107
+ bool AddBroadcastFirst = false
108
+ >
109
+ class TestbedConv3dWithBroadcast {
110
+ public:
111
+
112
+ using ElementA = typename Conv3d::ElementA;
113
+ using LayoutA = typename Conv3d::LayoutA;
114
+ using ElementB = typename Conv3d::ElementB;
115
+ using LayoutB = typename Conv3d::LayoutB;
116
+ using ElementC = typename Conv3d::ElementC;
117
+ using LayoutC = typename Conv3d::LayoutC;
118
+ using ElementAccumulator = typename Conv3d::ElementAccumulator;
119
+ using ElementCompute = typename Conv3d::ElementCompute;
120
+ using EpilogueOutputOp = typename Conv3d::EpilogueOutputOp;
121
+ using ElementZ = typename EpilogueOutputOp::ElementZ;
122
+ using ElementT = typename EpilogueOutputOp::ElementT;
123
+ using ElementVector = typename EpilogueOutputOp::ElementVector;
124
+
125
+ static cutlass::conv::Operator const kConvolutionalOperator = Conv3d::kConvolutionalOperator;
126
+ static const bool kAddBroadcastFirst = AddBroadcastFirst;
127
+ static const bool kStoreT = EpilogueOutputOp::kStoreT;
128
+
129
+ public:
130
+
131
+ /// Initialization
132
+ cutlass::Distribution::Kind init_A;
133
+ cutlass::Distribution::Kind init_B;
134
+ cutlass::Distribution::Kind init_C;
135
+ uint64_t seed;
136
+
137
+ cutlass::HostTensor<ElementA, LayoutA> tensor_A;
138
+ cutlass::HostTensor<ElementB, LayoutB> tensor_B;
139
+ cutlass::HostTensor<ElementC, LayoutC> tensor_C;
140
+ cutlass::HostTensor<ElementAccumulator, LayoutC> tensor_C_reference;
141
+ cutlass::HostTensor<ElementZ, LayoutC> tensor_Z_computed;
142
+ cutlass::HostTensor<ElementZ, LayoutC> tensor_Z_reference;
143
+ cutlass::HostTensor<ElementT, LayoutC> tensor_T_computed;
144
+ cutlass::HostTensor<ElementT, LayoutC> tensor_T_reference;
145
+ cutlass::HostTensor<ElementAccumulator, LayoutC> tensor_Y_reference;
146
+ cutlass::HostTensor<ElementVector, LayoutC> tensor_Broadcast; // Input Broadcast
147
+
148
+ public:
149
+
150
+ TestbedConv3dWithBroadcast(
151
+ cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
152
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
153
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
154
+ uint64_t seed_ = 2080
155
+ ):
156
+ init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
157
+
158
+ }
159
+
160
+ /// Helper to initialize a tensor view
161
+ template <typename Element, typename Layout>
162
+ void initialize_tensor(
163
+ cutlass::TensorView<Element, Layout> view,
164
+ cutlass::Distribution::Kind dist_kind,
165
+ uint64_t seed) {
166
+
167
+ if (dist_kind == cutlass::Distribution::Uniform) {
168
+
169
+ int scope;
170
+ int bits = cutlass::sizeof_bits<Element>::value;
171
+
172
+ if (bits <= 8) {
173
+ scope = 2;
174
+ }
175
+ else if (bits == 16) {
176
+ if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
177
+ scope = 3;
178
+ }
179
+ else {
180
+ scope = 5;
181
+ }
182
+ }
183
+ else {
184
+ scope = 8;
185
+ }
186
+
187
+ cutlass::reference::host::TensorFillRandomUniform(
188
+ view, seed, scope, -scope, 0);
189
+ }
190
+ else if (dist_kind == cutlass::Distribution::Identity) {
191
+
192
+ cutlass::reference::host::TensorFillIdentity(view);
193
+ }
194
+ else if (dist_kind == cutlass::Distribution::Gaussian) {
195
+
196
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
197
+ }
198
+ else if (dist_kind == cutlass::Distribution::Sequential) {
199
+
200
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
201
+ }
202
+ else {
203
+ }
204
+ }
205
+
206
+ void initialize(
207
+ cutlass::conv::Conv3dProblemSize const &problem_size, bool non_packed_test = false, uint64_t seed = 2019) {
208
+
209
+ // to make the layout of tensors a little bit bigger than the problem size
210
+ cutlass::Tensor5DCoord stride_increment = cutlass::Tensor5DCoord(8, 16, 32, 32, 64);
211
+
212
+ cutlass::Tensor5DCoord tensor_A_extent = implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size);
213
+ cutlass::Tensor5DCoord tensor_B_extent = implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size);
214
+ cutlass::Tensor5DCoord tensor_C_extent = implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size);
215
+
216
+ if (non_packed_test) {
217
+ tensor_A_extent += stride_increment;
218
+ tensor_C_extent += stride_increment;
219
+ }
220
+
221
+ tensor_A.resize(tensor_A_extent);
222
+ tensor_B.resize(tensor_B_extent);
223
+ tensor_C.resize(tensor_C_extent);
224
+ tensor_C_reference.resize(tensor_C_extent);
225
+ tensor_Z_computed.resize(tensor_C_extent);
226
+ tensor_Z_reference.resize(tensor_C_extent);
227
+ tensor_T_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
228
+ tensor_T_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
229
+ tensor_Y_reference.resize(tensor_C_extent);
230
+ tensor_Broadcast.resize({
231
+ 1,
232
+ 1,
233
+ 1,
234
+ 1,
235
+ implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size).c(),
236
+ });
237
+
238
+ initialize_tensor(tensor_A.host_view(), init_A, seed);
239
+ initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
240
+ initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
241
+ initialize_tensor(tensor_Broadcast.host_view(), init_C, seed * 39);
242
+ for (int n = 0; n < tensor_C_reference.extent().n(); ++n) {
243
+ for (int o = 0; o < tensor_C_reference.extent().d(); ++o) {
244
+ for (int p = 0; p < tensor_C_reference.extent().h(); ++p) {
245
+ for (int q = 0; q < tensor_C_reference.extent().w(); ++q) {
246
+ for (int k = 0; k < tensor_C_reference.extent().c(); ++k) {
247
+ tensor_C_reference.at({n, o, p, q, k}) = ElementAccumulator(tensor_C.at({n, o, p, q, k}));
248
+ }
249
+ }
250
+ }
251
+ }
252
+ }
253
+ tensor_A.sync_device();
254
+ tensor_B.sync_device();
255
+ tensor_C.sync_device();
256
+ tensor_Broadcast.sync_device();
257
+ tensor_C_reference.sync_device();
258
+ tensor_Z_computed.sync_device();
259
+ tensor_Z_reference.sync_device();
260
+ tensor_T_computed.sync_device();
261
+ tensor_T_reference.sync_device();
262
+ tensor_Y_reference.sync_device();
263
+ }
264
+
265
+ bool sufficient() const {
266
+ //
267
+ // Determine SMEM requirements and waive if not satisfied
268
+ //
269
+
270
+ size_t smem_size = sizeof(typename Conv3d::UnderlyingKernel::SharedStorage);
271
+
272
+ cudaDeviceProp properties;
273
+ int device_idx;
274
+ cudaError_t result = cudaGetDevice(&device_idx);
275
+
276
+ if (result != cudaSuccess) {
277
+ throw std::runtime_error("cudaGetDevice() API call failed.");
278
+ }
279
+
280
+ result = cudaGetDeviceProperties(&properties, device_idx);
281
+
282
+ if (result != cudaSuccess) {
283
+ throw std::runtime_error("cudaGetDeviceProperties() failed");
284
+ }
285
+
286
+ if (properties.sharedMemPerBlockOptin < smem_size) {
287
+ return false;
288
+ }
289
+
290
+ return true;
291
+ }
292
+
293
+ /// Executes one test
294
+ bool run(
295
+ cutlass::conv::Conv3dProblemSize const &problem_size,
296
+ cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
297
+ bool non_packed_test = false,
298
+ ElementCompute alpha = ElementCompute(1),
299
+ ElementCompute beta = ElementCompute(1)) {
300
+
301
+ // Waive test if insufficient CUDA device
302
+ if (!sufficient()) {
303
+ if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
304
+ std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
305
+ }
306
+ return true;
307
+ }
308
+
309
+ #if 0 //display conv3d problem size for debugging
310
+ std::cout << problem_size << std::endl
311
+ << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
312
+ << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)") << std::endl
313
+ << std::endl;
314
+ #endif
315
+
316
+ initialize(problem_size, non_packed_test);
317
+
318
+ // configure the operator
319
+ Conv3d conv3d_op;
320
+ typename Conv3d::Arguments conv3d_args(
321
+ problem_size,
322
+ tensor_A.device_ref(),
323
+ tensor_B.device_ref(),
324
+ tensor_C.device_ref(),
325
+ tensor_Z_computed.device_ref(),
326
+ {alpha, beta},
327
+ split_k_mode,
328
+ tensor_Broadcast.device_data(),
329
+ kStoreT ? tensor_T_computed.device_data() : nullptr,
330
+ 0, // This must be zero
331
+ implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size).c()
332
+ );
333
+
334
+ // initialize the kernel
335
+ size_t workspace_size = Conv3d::get_workspace_size(conv3d_args);
336
+
337
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
338
+
339
+ cutlass::Status status = conv3d_op.initialize(conv3d_args, workspace.get());
340
+
341
+ if (status != cutlass::Status::kSuccess) {
342
+ cudaError_t error = cudaGetLastError();
343
+ std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
344
+ return true;
345
+ }
346
+
347
+ // run conv3d operator
348
+ status = conv3d_op();
349
+
350
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
351
+ if (status != cutlass::Status::kSuccess) {
352
+ return false;
353
+ }
354
+
355
+ bool passed = false;
356
+
357
+ cudaError_t result = cudaDeviceSynchronize();
358
+ EXPECT_EQ(result, cudaSuccess) << " device reference error: "
359
+ << cudaGetErrorString(result);
360
+
361
+ tensor_T_computed.sync_host();
362
+ tensor_Z_computed.sync_host();
363
+
364
+ //
365
+ // Reference check
366
+ //
367
+
368
+ // When kAddBroadcastFirst is true, add bias on the host
369
+ ElementCompute beta_ref = kAddBroadcastFirst ? ElementCompute(0) : beta;
370
+
371
+ #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
372
+
373
+ cutlass::reference::device::Conv3d<
374
+ ElementA,
375
+ LayoutA,
376
+ ElementB,
377
+ LayoutB,
378
+ ElementAccumulator,
379
+ LayoutC,
380
+ ElementAccumulator,
381
+ ElementAccumulator
382
+ >(
383
+ kConvolutionalOperator,
384
+ problem_size,
385
+ tensor_A.device_ref(),
386
+ tensor_B.device_ref(),
387
+ tensor_C_reference.device_ref(),
388
+ tensor_Y_reference.device_ref(),
389
+ alpha,
390
+ beta_ref);
391
+
392
+ // sync host (copy device data to host) for dumping error output in case of mismatches
393
+ tensor_Y_reference.sync_host();
394
+
395
+ #else
396
+
397
+ cutlass::reference::host::Conv3d<
398
+ ElementA,
399
+ LayoutA,
400
+ ElementB,
401
+ LayoutB,
402
+ ElementAccumulator,
403
+ LayoutC,
404
+ ElementAccumulator,
405
+ ElementAccumulator
406
+ >(
407
+ kConvolutionalOperator,
408
+ problem_size,
409
+ tensor_A.host_ref(),
410
+ tensor_B.host_ref(),
411
+ tensor_C_reference.host_ref(),
412
+ tensor_Y_reference.host_ref(),
413
+ alpha,
414
+ beta_ref);
415
+
416
+ #endif
417
+ ReferenceOp reference_op;
418
+
419
+ // compute tensor Z and tensor T
420
+ for (int n = 0; n < problem_size.N; ++n) {
421
+ for (int o = 0; o < (kConvolutionalOperator == cutlass::conv::Operator::kFprop ? problem_size.Z : problem_size.D); ++o) {
422
+ for (int p = 0; p < (kConvolutionalOperator == cutlass::conv::Operator::kFprop ? problem_size.P : problem_size.H); ++p) {
423
+ for (int q = 0; q < (kConvolutionalOperator == cutlass::conv::Operator::kFprop ? problem_size.Q : problem_size.W); ++q) {
424
+ for (int k = 0; k < (kConvolutionalOperator == cutlass::conv::Operator::kFprop ? problem_size.K : problem_size.C); ++k) {
425
+
426
+ ElementZ z{};
427
+ ElementT t{};
428
+
429
+ ElementCompute accum = tensor_Y_reference.at({n, o, p, q, k});
430
+ ElementCompute bias = ElementCompute(tensor_Broadcast.at({0, 0, 0, 0, k}));
431
+
432
+
433
+ if (kAddBroadcastFirst) {
434
+ reference_op(z, t, accum + bias,
435
+ beta * ElementCompute(tensor_C_reference.at({n, o, p, q, k})));
436
+ } else {
437
+ reference_op(z, t, accum, bias);
438
+ }
439
+
440
+ tensor_Z_reference.at({n, o, p, q, k}) = z;
441
+ tensor_T_reference.at({n, o, p, q, k}) = t;
442
+ }
443
+ }
444
+ }
445
+ }
446
+ }
447
+
448
+ if (kStoreT) {
449
+ passed = cutlass::reference::host::TensorEquals(
450
+ tensor_T_computed.host_view(),
451
+ tensor_T_reference.host_view());
452
+
453
+ EXPECT_TRUE(passed);
454
+ }
455
+
456
+ passed = cutlass::reference::host::TensorEquals(
457
+ tensor_Z_computed.host_view(),
458
+ tensor_Z_reference.host_view());
459
+
460
+ EXPECT_TRUE(passed);
461
+
462
+ if (!passed) {
463
+ std::stringstream fname;
464
+
465
+ fname << "error_Conv3d_ImplicitGemm_device_"
466
+ << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
467
+ << (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
468
+ (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" :
469
+ (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kDeconv ? "deconv_" : "wgrad_")))
470
+ << "nnhwc_"
471
+ << problem_size.N << "x"
472
+ << problem_size.D << "x"
473
+ << problem_size.H << "x"
474
+ << problem_size.W << "x"
475
+ << problem_size.C
476
+ << "_krsc_"
477
+ << problem_size.K << "x"
478
+ << problem_size.T << "x"
479
+ << problem_size.R << "x"
480
+ << problem_size.S << "x"
481
+ << problem_size.C
482
+ << "_padding_"
483
+ << problem_size.pad_d << "x"
484
+ << problem_size.pad_h << "x"
485
+ << problem_size.pad_w
486
+ << "_stride_"
487
+ << problem_size.stride_d << "x"
488
+ << problem_size.stride_h << "x"
489
+ << problem_size.stride_w
490
+ << "_dilation_"
491
+ << problem_size.dilation_d << "x"
492
+ << problem_size.dilation_h << "x"
493
+ << problem_size.dilation_w << "_"
494
+ << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
495
+ << (non_packed_test ? "non_packed_tensor_test_" : "packed_tensor_test_")
496
+ << Conv3d::ThreadblockShape::kM << "x"
497
+ << Conv3d::ThreadblockShape::kN << "x"
498
+ << Conv3d::ThreadblockShape::kK << "_"
499
+ << Conv3d::WarpShape::kM << "x"
500
+ << Conv3d::WarpShape::kN << "x"
501
+ << Conv3d::WarpShape::kK << ".txt";
502
+
503
+ std::cout << fname.str() << std::endl;
504
+
505
+ std::ofstream results(fname.str());
506
+
507
+ results << problem_size << std::endl;
508
+
509
+ results
510
+ << "\nA:\n" << tensor_A.host_view() << "\n"
511
+ << "\nB:\n" << tensor_B.host_view() << "\n"
512
+ << "\nC:\n" << tensor_C.host_view() << "\n"
513
+ << "\nBroadcast:\n" << tensor_Broadcast.host_view() << "\n"
514
+ << "\nY reference:\n" << tensor_Y_reference.host_view() << "\n"
515
+ << "\nT reference:\n" << tensor_T_reference.host_view() << "\n"
516
+ << "\nT computed:\n" << tensor_T_computed.host_view() << "\n"
517
+ << "\nZ reference:\n" << tensor_Z_reference.host_view() << "\n"
518
+ << "\nZ computed:\n" << tensor_Z_computed.host_view() << "\n";
519
+ }
520
+
521
+ return passed;
522
+ }
523
+ };
524
+
525
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////
526
+ // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
527
+ // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv3dProblemSizes
528
+ // Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
529
+ // (conv_blacklist_sizes)
530
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
531
+ template <typename ImplicitGemm,
532
+ typename ReferenceOp = Conv3dWithBroadcastReferenceOp<ImplicitGemm>,
533
+ bool AddBroadcastFirst = false,
534
+ bool TestSplitK = true
535
+ >
536
+ bool TestAllConv3dWithBroadcast(
537
+ const Conv3dProblemVector &conv_test_sizes = Conv3dProblemVector(),
538
+ const Conv3dProblemVector &conv_blacklist_sizes = Conv3dProblemVector(),
539
+ bool non_packed_test = false) {
540
+
541
+ bool passed = true;
542
+
543
+ //
544
+ // Testbed object
545
+ //
546
+
547
+ TestbedConv3dWithBroadcast<ImplicitGemm, ReferenceOp, AddBroadcastFirst> testbed;
548
+
549
+ //
550
+ // Get conv problem sizes to run conv operator
551
+ //
552
+ TestbedConv3dProblemSizes conv3d_problems(128/cutlass::sizeof_bits<typename ImplicitGemm::ElementA>::value);
553
+
554
+ // Vector of conv3d problem sizes to avoid duplicate runs
555
+ Conv3dProblemVector conv_tested_sizes;
556
+
557
+ Conv3dProblemVector const *problem_vectors[] = {
558
+ &conv3d_problems.conv3d_default_sizes,
559
+ &conv3d_problems.conv3d_vnet_medical_sizes,
560
+ &conv_test_sizes
561
+ };
562
+
563
+ // Sweep conv3d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
564
+ for (Conv3dProblemVector const * problem_vector : problem_vectors) {
565
+
566
+ // Run conv testbed on default convolution sizes
567
+ for(auto conv_problem : *problem_vector) {
568
+
569
+ // Skip blacklist and avoid duplicate problem sizes
570
+ if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() ||
571
+ std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) {
572
+ continue;
573
+ }
574
+
575
+ //
576
+ // Procedurally disable certain cases
577
+ //
578
+
579
+ // CUTLASS DGRAD's *unity* stride specialization only support stride {1, 1}
580
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
581
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
582
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
583
+ cutlass::conv::StrideSupport::kUnity)) {
584
+ if (!((conv_problem.stride_d == 1) &&
585
+ (conv_problem.stride_h == 1) &&
586
+ (conv_problem.stride_w == 1))
587
+ ) {
588
+ continue;
589
+ }
590
+ }
591
+
592
+ #if 0 // relax restrictions on analytic strided dgrad
593
+ // CUTLASS DGRAD's *strided* specialization only support stride >= {2, 2}
594
+ if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ||
595
+ ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDeconv) &&
596
+ (ImplicitGemm::UnderlyingKernel::Mma::IteratorA::kStrideSupport ==
597
+ cutlass::conv::StrideSupport::kStrided)) {
598
+ if (((conv_problem.stride_d == 1) && (conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) {
599
+ continue;
600
+ }
601
+ }
602
+ #endif
603
+
604
+ //
605
+ // Test
606
+ //
607
+ // push back tested problem size to avoid re-running duplicates
608
+ conv_tested_sizes.push_back(conv_problem);
609
+
610
+ // test mode = xcross
611
+ passed = testbed.run(
612
+ conv_problem,
613
+ cutlass::conv::SplitKMode::kSerial, non_packed_test);
614
+
615
+ if (!passed) {
616
+ return false;
617
+ }
618
+
619
+ // test mode = convolution
620
+ passed = testbed.run(
621
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
622
+ cutlass::conv::SplitKMode::kSerial, non_packed_test);
623
+
624
+ if (!passed) {
625
+ return false;
626
+ }
627
+ }
628
+ }
629
+
630
+ if (!TestSplitK)
631
+ return passed;
632
+
633
+ // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
634
+ // a single conv3d problem size. Convolution unit tests take a long time to run so only sweep parameters
635
+ // which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
636
+ // alpha and beta for local testing, but only runs one value for alpha and beta.
637
+ cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size (
638
+ {1, 8, 8, 8, 32}, // input size (NDHWC)
639
+ {32, 3, 3, 3, 32}, // filter size (KTRSC)
640
+ cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w)
641
+ cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w)
642
+ cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w)
643
+ );
644
+
645
+ cutlass::conv::SplitKMode split_k_modes [] = {
646
+ cutlass::conv::SplitKMode::kSerial
647
+ };
648
+
649
+ int split_k_slices[] = {
650
+ 1, 2, 3, 4, 201
651
+ };
652
+
653
+ double problem_alpha[] = {
654
+ 2.0
655
+ };
656
+
657
+ double problem_beta[] = {
658
+ 2.0
659
+ };
660
+
661
+ for (auto split_k_mode : split_k_modes) {
662
+ for (auto split_k_slice : split_k_slices) {
663
+ for (auto alpha : problem_alpha) {
664
+ for (auto beta : problem_beta) {
665
+
666
+ passed = testbed.run(
667
+ conv3d_split_k_test_size.reset_split_k_slices(split_k_slice),
668
+ split_k_mode,
669
+ false,/*non_packed_test*/
670
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha),
671
+ cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta));
672
+
673
+ if (!passed) {
674
+ return false;
675
+ }
676
+ }
677
+ }
678
+ }
679
+ }
680
+
681
+ return passed;
682
+ }
683
+
684
+ template <typename ImplicitGemm,
685
+ typename ReferenceOp = Conv3dWithBroadcastReferenceOp<ImplicitGemm>,
686
+ bool AddBroadcastFirst = false>
687
+ bool TestSpecificConv3dWithBroadcast(
688
+ const Conv3dProblemVector & problem_sizes,
689
+ bool non_packed_test = false) {
690
+
691
+ bool passed = true;
692
+
693
+ //
694
+ // Testbed object
695
+ //
696
+
697
+ TestbedConv3dWithBroadcast<ImplicitGemm, ReferenceOp, AddBroadcastFirst> testbed;
698
+
699
+ // Sweep conv3d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
700
+ for(auto conv_problem : problem_sizes) {
701
+
702
+ //
703
+ // Test
704
+ //
705
+
706
+ // test mode = xcross, non_packed_test = false
707
+ passed = testbed.run(
708
+ conv_problem,
709
+ cutlass::conv::SplitKMode::kSerial, non_packed_test);
710
+
711
+ if (!passed) {
712
+ return false;
713
+ }
714
+
715
+ // test mode = convolution, non_packed_test = false
716
+ passed = testbed.run(
717
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
718
+ cutlass::conv::SplitKMode::kSerial, non_packed_test);
719
+
720
+ if (!passed) {
721
+ return false;
722
+ }
723
+ }
724
+
725
+ return true;
726
+ }
727
+
728
+ /////////////////////////////////////////////////////////////////////////////////////////////////
729
+
730
+ } // namespace device
731
+ } // namespace conv
732
+ } // namespace test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device/depthwise_conv2d_direct_conv_testbed.h ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Depthwise Direct Conv testbed
33
+ */
34
+ #pragma once
35
+
36
+ #include <fstream>
37
+
38
+ #include "../../common/cutlass_unit_test.h"
39
+ #include "../cache_testbed_output.h"
40
+ #include "conv2d_problems.h"
41
+ #include "cutlass/conv/device/direct_convolution.h"
42
+
43
+ #include "cutlass/core_io.h"
44
+ #include "cutlass/cutlass.h"
45
+ #include "cutlass/util/host_tensor.h"
46
+ #include "cutlass/util/reference/device/convolution.h"
47
+ #include "cutlass/util/reference/device/tensor_compare.h"
48
+ #include "cutlass/util/reference/host/convolution.h"
49
+ #include "cutlass/util/reference/host/tensor_compare.h"
50
+ #include "cutlass/util/reference/host/tensor_fill.h"
51
+ #include "cutlass/util/tensor_view_io.h"
52
+
53
+ namespace test {
54
+ namespace conv {
55
+ namespace device {
56
+
57
+ template <typename Conv2d>
58
+ class TestbedDepthwiseDirectConv2d {
59
+ public:
60
+
61
+ using ElementA = typename Conv2d::ElementA;
62
+ using LayoutA = typename Conv2d::LayoutA;
63
+ using ElementB = typename Conv2d::ElementB;
64
+ using LayoutB = typename Conv2d::LayoutB;
65
+ using ElementC = typename Conv2d::ElementC;
66
+ using LayoutC = typename Conv2d::LayoutC;
67
+ using ElementAccumulator = typename Conv2d::ElementAccumulator;
68
+ using ElementCompute = typename Conv2d::ElementCompute;
69
+ using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp;
70
+
71
+ static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
72
+
73
+ public:
74
+ /// Initialization
75
+ cutlass::Distribution::Kind init_A;
76
+ cutlass::Distribution::Kind init_B;
77
+ cutlass::Distribution::Kind init_C;
78
+ uint64_t seed;
79
+
80
+ cutlass::HostTensor<ElementA, LayoutA> tensor_A;
81
+ cutlass::HostTensor<ElementB, LayoutB> tensor_B;
82
+ cutlass::HostTensor<ElementB, LayoutB> tensor_reordered_B;
83
+ cutlass::HostTensor<ElementC, LayoutC> tensor_C;
84
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
85
+ cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
86
+
87
+ int tested_problem_count;
88
+
89
+ public:
90
+ TestbedDepthwiseDirectConv2d(cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
91
+ cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
92
+ cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
93
+ uint64_t seed_ = 2080)
94
+ : init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_), tested_problem_count(0) {}
95
+
96
+ /// Helper to initialize a tensor view
97
+ template <typename Element, typename Layout>
98
+ void initialize_tensor(cutlass::TensorView<Element, Layout> view,
99
+ cutlass::Distribution::Kind dist_kind,
100
+ uint64_t seed) {
101
+ if (dist_kind == cutlass::Distribution::Uniform) {
102
+ int scope;
103
+ int bits = cutlass::sizeof_bits<Element>::value;
104
+
105
+ if (bits <= 8) {
106
+ scope = 2;
107
+ } else if (bits == 16) {
108
+ if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
109
+ scope = 3;
110
+ } else {
111
+ scope = 5;
112
+ }
113
+ } else {
114
+ scope = 8;
115
+ }
116
+ cutlass::reference::host::TensorFillRandomUniform(view, seed, scope, -scope, 0);
117
+ } else if (dist_kind == cutlass::Distribution::Identity) {
118
+ cutlass::reference::host::TensorFillIdentity(view);
119
+
120
+ } else if (dist_kind == cutlass::Distribution::Gaussian) {
121
+ cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
122
+ } else if (dist_kind == cutlass::Distribution::Sequential) {
123
+ cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
124
+ } else {
125
+ }
126
+ }
127
+
128
+ void initialize(cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) {
129
+ tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size));
130
+ tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
131
+ tensor_reordered_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size));
132
+ tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
133
+ tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
134
+ tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size));
135
+
136
+ initialize_tensor(tensor_A.host_view(), init_A, seed);
137
+ initialize_tensor(tensor_B.host_view(), init_B, seed * 17);
138
+ initialize_tensor(tensor_reordered_B.host_view(), init_B, seed * 17);
139
+ initialize_tensor(tensor_C.host_view(), init_C, seed * 39);
140
+
141
+ tensor_A.sync_device();
142
+ tensor_B.sync_device();
143
+ tensor_reordered_B.sync_device();
144
+ tensor_C.sync_device();
145
+ tensor_D_computed.sync_device();
146
+ tensor_D_reference.sync_device();
147
+ }
148
+
149
+ bool sufficient(int smem_size) const {
150
+ //
151
+ // Determine SMEM requirements and waive if not satisfied
152
+ //
153
+
154
+ cudaDeviceProp properties;
155
+ int device_idx;
156
+ cudaError_t result = cudaGetDevice(&device_idx);
157
+
158
+ if (result != cudaSuccess) {
159
+ throw std::runtime_error("cudaGetDevice() API call failed.");
160
+ }
161
+
162
+ result = cudaGetDeviceProperties(&properties, device_idx);
163
+
164
+ if (result != cudaSuccess) {
165
+ throw std::runtime_error("cudaGetDeviceProperties() failed");
166
+ }
167
+
168
+ if (properties.sharedMemPerBlockOptin < static_cast<size_t>(smem_size)) {
169
+ return false;
170
+ }
171
+
172
+ return true;
173
+ }
174
+
175
+ /// Executes one test
176
+ bool run(cutlass::conv::Conv2dProblemSize const &problem_size,
177
+ cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
178
+ ElementCompute alpha = ElementCompute(1.5),
179
+ ElementCompute beta = ElementCompute(1)) {
180
+ // increment tested problem count run by the testbed
181
+ tested_problem_count++;
182
+
183
+ #if 0 // display conv2d problem size for debugging
184
+ std::cout << problem_size << std::endl
185
+ << "alpha, beta: (" << alpha << ", " << beta << ")" << std::endl
186
+ << "split_k_mode: "
187
+ << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? "(serial)" : "(parallel)")
188
+ << std::endl
189
+ << std::endl;
190
+ #endif
191
+
192
+ initialize(problem_size);
193
+
194
+ // configure the operator
195
+ Conv2d conv2d_op;
196
+
197
+ typename Conv2d::Arguments conv2d_args(problem_size,
198
+ tensor_A.device_ref(),
199
+ tensor_B.device_ref(),
200
+ tensor_C.device_ref(),
201
+ tensor_D_computed.device_ref(),
202
+ {alpha, beta},
203
+ tensor_reordered_B.device_ref(),
204
+ split_k_mode);
205
+
206
+ // find workspace requirement for parallel split-k reduction
207
+ size_t workspace_size = Conv2d::get_workspace_size(conv2d_args);
208
+
209
+ cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
210
+
211
+ cutlass::Status status = conv2d_op.can_implement(problem_size);
212
+
213
+ if (status != cutlass::Status::kSuccess) {
214
+ cudaError_t error = cudaGetLastError();
215
+ std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
216
+ return true;
217
+ }
218
+
219
+ status = conv2d_op.initialize(conv2d_args, workspace.get());
220
+
221
+ if (status != cutlass::Status::kSuccess) {
222
+ cudaError_t error = cudaGetLastError();
223
+ std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
224
+ return true;
225
+ }
226
+
227
+ if (!sufficient(conv2d_op.get_smem_size())) {
228
+ if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
229
+ std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
230
+ }
231
+ return true;
232
+ }
233
+
234
+ // run conv2d operator
235
+ status = conv2d_op();
236
+
237
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
238
+ if (status != cutlass::Status::kSuccess) {
239
+ std::cerr << "Failed to run." << std::endl;
240
+ return false;
241
+ }
242
+
243
+ bool passed = false;
244
+
245
+ cudaError_t result = cudaDeviceSynchronize();
246
+ EXPECT_EQ(result, cudaSuccess) << " device reference error: " << cudaGetErrorString(result);
247
+
248
+ tensor_D_computed.sync_host();
249
+
250
+ //
251
+ // Reference check - support caching results
252
+ //
253
+
254
+ CachedTestKey cached_test_key =
255
+ CreateCachedConv2dTestKey<ElementA,
256
+ LayoutA,
257
+ ElementB,
258
+ LayoutB,
259
+ ElementC,
260
+ LayoutC,
261
+ ElementAccumulator,
262
+ ElementCompute>(kConvolutionalOperator,
263
+ problem_size,
264
+ alpha,
265
+ beta,
266
+ tensor_A.host_view(),
267
+ tensor_B.host_view(),
268
+ tensor_C.host_view());
269
+
270
+ //
271
+ // Look for the cached key
272
+ //
273
+
274
+ bool cached_result_loaded = false;
275
+ CachedTestResult cached_test_result;
276
+
277
+ std::string conv2d_result_cache_name =
278
+ std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
279
+
280
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
281
+
282
+ CachedTestResultListing cached_results(conv2d_result_cache_name);
283
+
284
+ auto cached = cached_results.find(cached_test_key);
285
+
286
+ cached_result_loaded = cached.first;
287
+ if (cached_result_loaded) {
288
+ cached_test_result = cached.second;
289
+ }
290
+ }
291
+
292
+ if (!cached_result_loaded) {
293
+ #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
294
+
295
+ cutlass::reference::device::Conv2d<ElementA,
296
+ LayoutA,
297
+ ElementB,
298
+ LayoutB,
299
+ ElementC,
300
+ LayoutC,
301
+ ElementCompute,
302
+ ElementAccumulator>(kConvolutionalOperator,
303
+ problem_size,
304
+ tensor_A.device_ref(),
305
+ tensor_B.device_ref(),
306
+ tensor_C.device_ref(),
307
+ tensor_D_reference.device_ref(),
308
+ alpha,
309
+ beta);
310
+
311
+ // sync host (copy device data to host) for dumping error output in case of mismatches
312
+ tensor_D_reference.sync_host();
313
+
314
+ #else
315
+
316
+ cutlass::reference::host::Conv2d<ElementA,
317
+ LayoutA,
318
+ ElementB,
319
+ LayoutB,
320
+ ElementC,
321
+ LayoutC,
322
+ ElementCompute,
323
+ ElementAccumulator>(kConvolutionalOperator,
324
+ problem_size,
325
+ tensor_A.host_ref(),
326
+ tensor_B.host_ref(),
327
+ tensor_C.host_ref(),
328
+ tensor_D_reference.host_ref(),
329
+ alpha,
330
+ beta);
331
+
332
+ #endif
333
+
334
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
335
+
336
+ cached_test_result.D = TensorHash(tensor_D_reference.host_view());
337
+
338
+ CachedTestResultListing cached_results(conv2d_result_cache_name);
339
+
340
+ cached_results.append(cached_test_key, cached_test_result);
341
+ cached_results.write(conv2d_result_cache_name);
342
+ }
343
+ } // if (!cached_result_loaded)
344
+
345
+ uint32_t tensor_D_hash = TensorHash(tensor_D_computed.host_view());
346
+
347
+ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS) {
348
+ passed = (tensor_D_hash == cached_test_result.D);
349
+
350
+ EXPECT_EQ(tensor_D_hash, cached_test_result.D)
351
+ << "Hash-based comparison failed for key:" << "\n" << cached_test_key << "\n";
352
+ }
353
+ else {
354
+
355
+ passed = cutlass::reference::host::TensorEquals(
356
+ tensor_D_computed.host_view(),
357
+ tensor_D_reference.host_view());
358
+ }
359
+
360
+ EXPECT_TRUE(passed);
361
+
362
+ std::stringstream ss_problem_size_text;
363
+ ss_problem_size_text << "nhwc_"
364
+ << problem_size.N << "x"
365
+ << problem_size.H << "x"
366
+ << problem_size.W << "x"
367
+ << problem_size.C
368
+ << "_krsc_"
369
+ << problem_size.K << "x"
370
+ << problem_size.R << "x"
371
+ << problem_size.S << "x"
372
+ << problem_size.C
373
+ << "_padding_"
374
+ << problem_size.pad_h << "x"
375
+ << problem_size.pad_w
376
+ << "_stride_"
377
+ << problem_size.stride_h << "x"
378
+ << problem_size.stride_w
379
+ << "_dilation_"
380
+ << problem_size.dilation_h << "x"
381
+ << problem_size.dilation_w << "_"
382
+ << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_");
383
+
384
+ if (!passed) {
385
+ std::stringstream fname;
386
+
387
+ fname << "error_Conv2d_DirectConv_device_"
388
+ << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
389
+ << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
390
+ (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_"))
391
+ << ss_problem_size_text.str()
392
+ << Conv2d::ThreadblockShape::kM << "x"
393
+ << Conv2d::ThreadblockShape::kN << "x"
394
+ << Conv2d::ThreadblockShape::kK << "_"
395
+ << Conv2d::WarpShape::kM << "x"
396
+ << Conv2d::WarpShape::kN << "x"
397
+ << Conv2d::WarpShape::kK << ".txt";
398
+
399
+ std::cout << fname.str() << std::endl;
400
+
401
+ std::ofstream results(fname.str());
402
+
403
+ results << problem_size << std::endl;
404
+
405
+ results
406
+ << "\nA:\n" << tensor_A.host_view() << "\n"
407
+ << "\nB:\n" << tensor_B.host_view() << "\n"
408
+ << "\nC:\n" << tensor_C.host_view() << "\n";
409
+
410
+ results << "\nD reference (hash: " << cached_test_result.D << ")\n";
411
+
412
+ if (!cached_result_loaded) {
413
+ results
414
+ << tensor_D_reference.host_view() << "\n";
415
+ }
416
+
417
+ results
418
+ << "\nD computed (hash: " << tensor_D_hash << ")\n"
419
+ << tensor_D_computed.host_view() << "\n";
420
+
421
+ }
422
+
423
+ return passed;
424
+ }
425
+
426
+ };
427
+
428
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////////
429
+
430
+ template <typename DirectConv>
431
+ bool TestSpecificDepthwiseDirectConv2d(const Conv2dProblemVector &problem_sizes) {
432
+ bool passed = true;
433
+
434
+ //
435
+ // Testbed object
436
+ //
437
+ TestbedDepthwiseDirectConv2d<DirectConv> testbed;
438
+
439
+ // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
440
+ for (auto conv_problem : problem_sizes) {
441
+ //
442
+ // Test
443
+ //
444
+
445
+ // test mode = xcross
446
+ passed = testbed.run(
447
+ conv_problem,
448
+ cutlass::conv::SplitKMode::kSerial);
449
+
450
+ if (!passed) {
451
+ return false;
452
+ }
453
+
454
+ // test mode = convolution
455
+ passed = testbed.run(
456
+ conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
457
+ cutlass::conv::SplitKMode::kSerial);
458
+
459
+ if (!passed) {
460
+ return false;
461
+ }
462
+ }
463
+
464
+ return true;
465
+ }
466
+
467
+ /////////////////////////////////////////////////////////////////////////////////////////////////
468
+
469
+ } // namespace device
470
+ } // namespace conv
471
+ } // namespace test
472
+
473
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device_3x/conv_problem_sizes.hpp ADDED
@@ -0,0 +1,1385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief CUTLASS 3.x Implicit GEMM testbed sizes for ConvNd problem
33
+ */
34
+ #pragma once
35
+
36
+ #include "cutlass/conv/convnd_problem_shape.hpp"
37
+ #include <vector>
38
+
39
+ /////////////////////////////////////////////////////////////////////////////////////////////////
40
+
41
+ namespace test::conv::device {
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ template<int SpatialDim, cutlass::conv::Operator ConvOp, bool SupportStrides = (ConvOp != cutlass::conv::Operator::kDgrad)>
46
+ std::vector<cutlass::conv::ConvProblemShape<ConvOp, SpatialDim>>
47
+ inline
48
+ get_conv_problem_vector();
49
+
50
+ /////////////////////////////////////////////////////////////////////////////////////////////////
51
+ // Fprop
52
+ /////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ // Specialization for 1D fprop problems
55
+ template<>
56
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kFprop, 1>> inline
57
+ get_conv_problem_vector<1, cutlass::conv::Operator::kFprop>() {
58
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kFprop, 1>;
59
+ std::vector<ProblemShape> problem_shapes;
60
+ problem_shapes.push_back({
61
+ cutlass::conv::Mode::kCrossCorrelation,
62
+ {1, 8, 64}, // nwc
63
+ {64, 1, 64}, // ksc
64
+ {0}, // padding lower (pad_w)
65
+ {0}, // padding upper (pad_w)
66
+ {1}, // stride (stride_w)
67
+ {1}, // dilation (dilation_w)
68
+ 1 // group
69
+ });
70
+ // non-packed input strides.
71
+ problem_shapes.push_back({
72
+ cutlass::conv::Mode::kCrossCorrelation,
73
+ {1, 8, 64}, // nwc
74
+ {800, 80, 1}, // stride (nwc)
75
+ {64, 1, 64}, // ksc
76
+ {64, 64, 1}, // stride (ksc)
77
+ {0}, // padding lower (pad_w)
78
+ {0}, // padding upper (pad_w)
79
+ {1}, // stride (stride_w)
80
+ {1}, // dilation (dilation_w)
81
+ 1 // group
82
+ });
83
+ // non-packed output strides.
84
+ problem_shapes.push_back({
85
+ cutlass::conv::Mode::kCrossCorrelation,
86
+ {1, 8, 64}, // nwc
87
+ {512, 64, 1}, // stride (nwc)
88
+ {64, 1, 64}, // ksc
89
+ {64, 64, 1}, // stride (ksc)
90
+ {800, 80, 1}, // stride (nqk)
91
+ {0}, // padding lower (pad_w)
92
+ {0}, // padding upper (pad_w)
93
+ {1}, // stride (stride_w)
94
+ {1}, // dilation (dilation_w)
95
+ 1 // group
96
+ });
97
+ // Filter-K = 16 for predication
98
+ problem_shapes.push_back({
99
+ cutlass::conv::Mode::kCrossCorrelation,
100
+ {1, 8, 64},
101
+ {16,1, 64},
102
+ {0},
103
+ {0},
104
+ {1},
105
+ {1},
106
+ 1
107
+ });
108
+ // N = 2 and K = 128 for a larger grid
109
+ problem_shapes.push_back({
110
+ cutlass::conv::Mode::kCrossCorrelation,
111
+ {2, 8, 64},
112
+ {96, 1, 64},
113
+ {0},
114
+ {0},
115
+ {1},
116
+ {1},
117
+ 1
118
+ });
119
+ // N = 7 and K = 256 for a even larger grid
120
+ problem_shapes.push_back({
121
+ cutlass::conv::Mode::kCrossCorrelation,
122
+ {7, 8, 64},
123
+ {256, 1, 64},
124
+ {0},
125
+ {0},
126
+ {1},
127
+ {1},
128
+ 1
129
+ });
130
+ // 3 filter, no padding
131
+ problem_shapes.push_back({
132
+ cutlass::conv::Mode::kCrossCorrelation,
133
+ {2, 8, 64},
134
+ {256, 3, 64},
135
+ {0},
136
+ {0},
137
+ {1},
138
+ {1},
139
+ 1
140
+ });
141
+ // 3 filter, symmetric padding with c % cta_k !=0
142
+ problem_shapes.push_back({
143
+ cutlass::conv::Mode::kCrossCorrelation,
144
+ {2, 8, 32},
145
+ {256, 3, 32},
146
+ {1},
147
+ {1},
148
+ {1},
149
+ {1},
150
+ 1
151
+ });
152
+ // 4 filter, asymmetric padding
153
+ problem_shapes.push_back({
154
+ cutlass::conv::Mode::kCrossCorrelation,
155
+ {2, 8, 64},
156
+ {256, 4, 64},
157
+ {0},
158
+ {1},
159
+ {1},
160
+ {1},
161
+ 1
162
+ });
163
+ // 3 filter, asymmetric padding and tstride of 2
164
+ problem_shapes.push_back({
165
+ cutlass::conv::Mode::kCrossCorrelation,
166
+ {2, 8, 64},
167
+ {256, 3, 64},
168
+ {0},
169
+ {1},
170
+ {2},
171
+ {1},
172
+ 1
173
+ });
174
+ // 3 filter, asymmetric padding and dilation of 2
175
+ problem_shapes.push_back({
176
+ cutlass::conv::Mode::kCrossCorrelation,
177
+ {2, 8, 64},
178
+ {256, 3, 64},
179
+ {0},
180
+ {1},
181
+ {1},
182
+ {2},
183
+ 1
184
+ });
185
+ return problem_shapes;
186
+ }
187
+
188
+ // Specialization for 2D fprop problems
189
+ template<>
190
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kFprop, 2>> inline
191
+ get_conv_problem_vector<2, cutlass::conv::Operator::kFprop>() {
192
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kFprop, 2>;
193
+ std::vector<ProblemShape> problem_shapes;
194
+ problem_shapes.push_back({
195
+ cutlass::conv::Mode::kCrossCorrelation,
196
+ {1, 8, 8, 64}, // nhwc
197
+ {64, 1, 1, 64}, // krsc
198
+ {0, 0}, // padding lower (pad_h, pad_w)
199
+ {0, 0}, // padding upper (pad_h, pad_w)
200
+ {1, 1}, // stride (stride_h, stride_w)
201
+ {1, 1}, // dilation (dilation_h, dilation_w)
202
+ 1 // group
203
+ });
204
+ // non-packed input strides.
205
+ problem_shapes.push_back({
206
+ cutlass::conv::Mode::kCrossCorrelation,
207
+ {1, 8, 8, 64}, // nhwc
208
+ {8000, 800, 80, 1}, // stride (nhwc)
209
+ {64, 1, 1, 64}, // krsc
210
+ {64, 64, 64, 1}, // stride (krsc)
211
+ {0, 0}, // padding lower (pad_h, pad_w)
212
+ {0, 0}, // padding upper (pad_h, pad_w)
213
+ {1, 1}, // stride (stride_h, stride_w)
214
+ {1, 1}, // dilation (dilation_h, dilation_w)
215
+ 1 // group
216
+ });
217
+ // non-packed output strides.
218
+ problem_shapes.push_back({
219
+ cutlass::conv::Mode::kCrossCorrelation,
220
+ {1, 8, 8, 64}, // nhwc
221
+ {4096, 512, 64, 1}, // stride (nhwc)
222
+ {64, 1, 1, 64}, // krsc
223
+ {64, 64, 64, 1}, // stride (krsc)
224
+ {8000, 800, 80, 1}, // stride (npqk)
225
+ {0, 0}, // padding lower (pad_h, pad_w)
226
+ {0, 0}, // padding upper (pad_h, pad_w)
227
+ {1, 1}, // stride (stride_h, stride_w)
228
+ {1, 1}, // dilation (dilation_h, dilation_w)
229
+ 1 // group
230
+ });
231
+ // Filter-K = 16 for predication
232
+ problem_shapes.push_back({
233
+ cutlass::conv::Mode::kCrossCorrelation,
234
+ {1, 8, 8, 64},
235
+ {16, 1, 1, 64},
236
+ {0, 0},
237
+ {0, 0},
238
+ {1, 1},
239
+ {1, 1},
240
+ 1
241
+ });
242
+ // N = 2 and K = 128 for a larger grid
243
+ problem_shapes.push_back({
244
+ cutlass::conv::Mode::kCrossCorrelation,
245
+ {2, 8, 8, 64},
246
+ {96, 1, 1, 64},
247
+ {0, 0},
248
+ {0, 0},
249
+ {1, 1},
250
+ {1, 1},
251
+ 1
252
+ });
253
+ // N = 7 and K = 256 for a even larger grid
254
+ problem_shapes.push_back({
255
+ cutlass::conv::Mode::kCrossCorrelation,
256
+ {7, 8, 8, 64},
257
+ {256, 1, 1, 64},
258
+ {0, 0},
259
+ {0, 0},
260
+ {1, 1},
261
+ {1, 1},
262
+ 1
263
+ });
264
+ // 3x3 filter, no padding
265
+ problem_shapes.push_back({
266
+ cutlass::conv::Mode::kCrossCorrelation,
267
+ {2, 8, 8, 64},
268
+ {256, 3, 3, 64},
269
+ {0, 0},
270
+ {0, 0},
271
+ {1, 1},
272
+ {1, 1},
273
+ 1
274
+ });
275
+ // 3x3 filter, symmetric padding with c % cta_k !=0
276
+ problem_shapes.push_back({
277
+ cutlass::conv::Mode::kCrossCorrelation,
278
+ {2, 8, 8, 32},
279
+ {256, 3, 3, 32},
280
+ {1, 1},
281
+ {1, 1},
282
+ {1, 1},
283
+ {1, 1},
284
+ 1
285
+ });
286
+ // 2x5 filter, asymmetric padding 1,2/1,2
287
+ problem_shapes.push_back({
288
+ cutlass::conv::Mode::kCrossCorrelation,
289
+ {2, 8, 8, 64},
290
+ {256, 2, 5, 64},
291
+ {1, 1},
292
+ {2, 2},
293
+ {1, 1},
294
+ {1, 1},
295
+ 1
296
+ });
297
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ stride
298
+ problem_shapes.push_back({
299
+ cutlass::conv::Mode::kCrossCorrelation,
300
+ {2, 7, 7, 64},
301
+ {256, 2, 5, 64},
302
+ {1, 1},
303
+ {0, 0},
304
+ {2, 3},
305
+ {1, 1},
306
+ 1
307
+ });
308
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ dilation
309
+ problem_shapes.push_back({
310
+ cutlass::conv::Mode::kCrossCorrelation,
311
+ {2, 16, 16, 64},
312
+ {256, 2, 5, 64},
313
+ {1, 1},
314
+ {0, 0},
315
+ {1, 1},
316
+ {2, 3},
317
+ 1
318
+ });
319
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ stride, w/ dilation
320
+ problem_shapes.push_back({
321
+ cutlass::conv::Mode::kCrossCorrelation,
322
+ {2, 16, 15, 64},
323
+ {256, 2, 5, 64},
324
+ {1, 1},
325
+ {0, 0},
326
+ {2, 3},
327
+ {2, 3},
328
+ 1
329
+ });
330
+ return problem_shapes;
331
+ }
332
+
333
+ // Specialization for 3D fprop problems
334
+ template<>
335
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kFprop, 3>> inline
336
+ get_conv_problem_vector<3, cutlass::conv::Operator::kFprop>() {
337
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kFprop, 3>;
338
+ std::vector<ProblemShape> problem_shapes;
339
+ problem_shapes.push_back({
340
+ cutlass::conv::Mode::kCrossCorrelation,
341
+ {1, 1, 8, 8, 64}, // ndhwc
342
+ {64, 1, 1, 1, 64}, // ktrsc
343
+ {0, 0, 0}, // padding lower (pad_d, pad_h, pad_w)
344
+ {0, 0, 0}, // padding upper (pad_d, pad_h, pad_w)
345
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
346
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
347
+ 1 // group
348
+ });
349
+ // non-packed input output strides.
350
+ problem_shapes.push_back({
351
+ cutlass::conv::Mode::kCrossCorrelation,
352
+ {1, 1, 8, 8, 64}, // ndhwc
353
+ {8000, 8000, 800, 80, 1}, // stride (ndhwc)
354
+ {64, 1, 1, 1, 64}, // ktrsc
355
+ {64, 64, 64, 64, 1}, // stride (ktrsc)
356
+ {8000, 8000, 800, 80, 1}, // stride (nzpqk)
357
+ {0, 0, 0}, // padding lower (pad_d, pad_h, pad_w)
358
+ {0, 0, 0}, // padding upper (pad_d, pad_h, pad_w)
359
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
360
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
361
+ 1 // group
362
+ });
363
+ // Filter-K = 16 for predication
364
+ problem_shapes.push_back({
365
+ cutlass::conv::Mode::kCrossCorrelation,
366
+ {1, 1, 8, 8, 64},
367
+ {16, 1, 1, 1, 64},
368
+ {0, 0, 0},
369
+ {0, 0, 0},
370
+ {1, 1, 1},
371
+ {1, 1, 1},
372
+ 1
373
+ });
374
+ // N = 7 and K = 256 for a larger grid
375
+ problem_shapes.push_back({
376
+ cutlass::conv::Mode::kCrossCorrelation,
377
+ {2, 1, 8, 8, 64},
378
+ {96, 1, 1, 1, 64},
379
+ {0, 0, 0},
380
+ {0, 0, 0},
381
+ {1, 1, 1},
382
+ {1, 1, 1},
383
+ 1
384
+ });
385
+ // Filter 3x3x3 + no padding
386
+ problem_shapes.push_back({
387
+ cutlass::conv::Mode::kCrossCorrelation,
388
+ {2, 3, 5, 8, 64},
389
+ {96, 3, 3, 3, 64},
390
+ {0, 0, 0},
391
+ {0, 0, 0},
392
+ {1, 1, 1},
393
+ {1, 1, 1},
394
+ 1
395
+ });
396
+ // Filter 3x3x3 + symmetric padding with c % cta_k !=0
397
+ problem_shapes.push_back({
398
+ cutlass::conv::Mode::kCrossCorrelation,
399
+ {2, 3, 5, 8, 32},
400
+ {96, 3, 3, 3, 32},
401
+ {1, 1, 1},
402
+ {1, 1, 1},
403
+ {1, 1, 1},
404
+ {1, 1, 1},
405
+ 1
406
+ });
407
+ // Filter 3x4x5 + symmetric padding 111
408
+ problem_shapes.push_back({
409
+ cutlass::conv::Mode::kCrossCorrelation,
410
+ {2, 3, 5, 8, 64},
411
+ {96, 3, 4, 5, 64},
412
+ {1, 1, 1},
413
+ {1, 1, 1},
414
+ {1, 1, 1},
415
+ {1, 1, 1},
416
+ 1
417
+ });
418
+ // Filter 3x4x5 + asymmetric padding 102/010
419
+ problem_shapes.push_back({
420
+ cutlass::conv::Mode::kCrossCorrelation,
421
+ {2, 3, 5, 8, 64},
422
+ {96, 3, 4, 5, 64},
423
+ {1, 0, 1},
424
+ {0, 2, 0},
425
+ {1, 1, 1},
426
+ {1, 1, 1},
427
+ 1
428
+ });
429
+ // Filter 3x4x5 + asymmetric padding 102/010, w/ stride
430
+ problem_shapes.push_back({
431
+ cutlass::conv::Mode::kCrossCorrelation,
432
+ {2, 16, 10, 16, 64},
433
+ {96, 3, 4, 5, 64},
434
+ {1, 0, 1},
435
+ {0, 2, 0},
436
+ {2, 2, 3},
437
+ {1, 1, 1},
438
+ 1
439
+ });
440
+ // Filter 3x4x5 + asymmetric padding 102/010, w/ dilation
441
+ problem_shapes.push_back({
442
+ cutlass::conv::Mode::kCrossCorrelation,
443
+ {2, 16, 10, 16, 64},
444
+ {96, 3, 4, 5, 64},
445
+ {1, 0, 1},
446
+ {0, 2, 0},
447
+ {1, 1, 1},
448
+ {2, 2, 3},
449
+ 1
450
+ });
451
+ // Filter 3x4x5 + asymmetric padding 102/010, w/ stride, w/ dilation
452
+ problem_shapes.push_back({
453
+ cutlass::conv::Mode::kCrossCorrelation,
454
+ {2, 16, 10, 16, 64},
455
+ {96, 3, 4, 5, 64},
456
+ {1, 0, 1},
457
+ {0, 2, 0},
458
+ {2, 2, 3},
459
+ {2, 2, 3},
460
+ 1
461
+ });
462
+ return problem_shapes;
463
+ }
464
+
465
+
466
+ /////////////////////////////////////////////////////////////////////////////////////////////////
467
+ // Wgrad
468
+ /////////////////////////////////////////////////////////////////////////////////////////////////
469
+
470
+ // Specialization for 1D wgrad problems
471
+ template<>
472
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 1>> inline
473
+ get_conv_problem_vector<1, cutlass::conv::Operator::kWgrad>() {
474
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 1>;
475
+ std::vector<ProblemShape> problem_shapes;
476
+ problem_shapes.push_back({
477
+ cutlass::conv::Mode::kCrossCorrelation,
478
+ {1, 8, 64}, // nwc
479
+ {64, 1, 64}, // ksc
480
+ {0}, // padding lower (pad_w)
481
+ {0}, // padding upper (pad_w)
482
+ {1}, // stride (stride_w)
483
+ {1}, // dilation (dilation_w)
484
+ 1 // group
485
+ });
486
+ // Filter-K = 16 for predication
487
+ problem_shapes.push_back({
488
+ cutlass::conv::Mode::kCrossCorrelation,
489
+ {1, 8, 64},
490
+ {16,1, 64},
491
+ {0},
492
+ {0},
493
+ {1},
494
+ {1},
495
+ 1
496
+ });
497
+ // N = 2 and K = 128 for a larger grid
498
+ problem_shapes.push_back({
499
+ cutlass::conv::Mode::kCrossCorrelation,
500
+ {2, 8, 64},
501
+ {96, 1, 64},
502
+ {0},
503
+ {0},
504
+ {1},
505
+ {1},
506
+ 1
507
+ });
508
+ // N = 7 and K = 256 for a even larger grid
509
+ problem_shapes.push_back({
510
+ cutlass::conv::Mode::kCrossCorrelation,
511
+ {7, 8, 64},
512
+ {256, 1, 64},
513
+ {0},
514
+ {0},
515
+ {1},
516
+ {1},
517
+ 1
518
+ });
519
+ // 3 filter, no padding
520
+ problem_shapes.push_back({
521
+ cutlass::conv::Mode::kCrossCorrelation,
522
+ {2, 8, 32},
523
+ {256, 3, 32},
524
+ {0},
525
+ {0},
526
+ {1},
527
+ {1},
528
+ 1
529
+ });
530
+ // 3 filter, symmetric padding
531
+ problem_shapes.push_back({
532
+ cutlass::conv::Mode::kCrossCorrelation,
533
+ {2, 8, 32},
534
+ {256, 3, 32},
535
+ {1},
536
+ {1},
537
+ {1},
538
+ {1},
539
+ 1
540
+ });
541
+ // 4 filter, asymmetric padding
542
+ problem_shapes.push_back({
543
+ cutlass::conv::Mode::kCrossCorrelation,
544
+ {2, 8, 32},
545
+ {256, 4, 32},
546
+ {0},
547
+ {1},
548
+ {1},
549
+ {1},
550
+ 1
551
+ });
552
+ // 3 filter, asymmetric padding and tstride of 2
553
+ problem_shapes.push_back({
554
+ cutlass::conv::Mode::kCrossCorrelation,
555
+ {2, 8, 32},
556
+ {256, 3, 32},
557
+ {0},
558
+ {1},
559
+ {2},
560
+ {1},
561
+ 1
562
+ });
563
+ // 3 filter, asymmetric padding and dilation of 2
564
+ problem_shapes.push_back({
565
+ cutlass::conv::Mode::kCrossCorrelation,
566
+ {2, 8, 32},
567
+ {256, 3, 32},
568
+ {0},
569
+ {1},
570
+ {1},
571
+ {2},
572
+ 1
573
+ });
574
+ // To test streamk, equals to gemm-MxNxK size 128x640x2048
575
+ problem_shapes.push_back({
576
+ cutlass::conv::Mode::kCrossCorrelation,
577
+ {2, 1024, 128},
578
+ {640, 1, 128},
579
+ {0},
580
+ {0},
581
+ {1},
582
+ {1},
583
+ 1
584
+ });
585
+ // To test streamk, equals to gemm-MxNxK size 128x640x2080
586
+ problem_shapes.push_back({
587
+ cutlass::conv::Mode::kCrossCorrelation,
588
+ {2, 1040, 128},
589
+ {640, 1, 128},
590
+ {0},
591
+ {0},
592
+ {1},
593
+ {1},
594
+ 1
595
+ });
596
+ return problem_shapes;
597
+ }
598
+
599
+ // Specialization for 2D wgrad problems
600
+ template<>
601
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 2>> inline
602
+ get_conv_problem_vector<2, cutlass::conv::Operator::kWgrad>() {
603
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 2>;
604
+ std::vector<ProblemShape> problem_shapes;
605
+ problem_shapes.push_back({
606
+ cutlass::conv::Mode::kCrossCorrelation,
607
+ {1, 8, 8, 64}, // nhwc
608
+ {64, 1, 1, 64}, // krsc
609
+ {0, 0}, // padding lower (pad_h, pad_w)
610
+ {0, 0}, // padding upper (pad_h, pad_w)
611
+ {1, 1}, // stride (stride_h, stride_w)
612
+ {1, 1}, // dilation (dilation_h, dilation_w)
613
+ 1 // group
614
+ });
615
+ // Filter-K = 16 for predication
616
+ problem_shapes.push_back({
617
+ cutlass::conv::Mode::kCrossCorrelation,
618
+ {1, 8, 8, 64},
619
+ {16, 1, 1, 64},
620
+ {0, 0},
621
+ {0, 0},
622
+ {1, 1},
623
+ {1, 1},
624
+ 1
625
+ });
626
+ // N = 2 and K = 128 for a larger grid
627
+ problem_shapes.push_back({
628
+ cutlass::conv::Mode::kCrossCorrelation,
629
+ {2, 8, 8, 64},
630
+ {96, 1, 1, 64},
631
+ {0, 0},
632
+ {0, 0},
633
+ {1, 1},
634
+ {1, 1},
635
+ 1
636
+ });
637
+ // N = 7 and K = 256 for a even larger grid
638
+ problem_shapes.push_back({
639
+ cutlass::conv::Mode::kCrossCorrelation,
640
+ {7, 8, 8, 64},
641
+ {256, 1, 1, 64},
642
+ {0, 0},
643
+ {0, 0},
644
+ {1, 1},
645
+ {1, 1},
646
+ 1
647
+ });
648
+ // 3x3 filter, no padding
649
+ problem_shapes.push_back({
650
+ cutlass::conv::Mode::kCrossCorrelation,
651
+ {2, 8, 8, 32},
652
+ {256, 3, 3, 32},
653
+ {0, 0},
654
+ {0, 0},
655
+ {1, 1},
656
+ {1, 1},
657
+ 1
658
+ });
659
+ // 3x3 filter, symmetric padding
660
+ problem_shapes.push_back({
661
+ cutlass::conv::Mode::kCrossCorrelation,
662
+ {2, 8, 8, 32},
663
+ {256, 3, 3, 32},
664
+ {1, 1},
665
+ {1, 1},
666
+ {1, 1},
667
+ {1, 1},
668
+ 1
669
+ });
670
+ // 2x5 filter, asymmetric padding 1,0/1,0
671
+ problem_shapes.push_back({
672
+ cutlass::conv::Mode::kCrossCorrelation,
673
+ {2, 8, 8, 32},
674
+ {256, 2, 5, 32},
675
+ {1, 1},
676
+ {0, 0},
677
+ {1, 1},
678
+ {1, 1},
679
+ 1
680
+ });
681
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ stride
682
+ problem_shapes.push_back({
683
+ cutlass::conv::Mode::kCrossCorrelation,
684
+ {2, 15, 16, 32},
685
+ {256, 2, 5, 32},
686
+ {1, 1},
687
+ {0, 0},
688
+ {2, 3},
689
+ {1, 1},
690
+ 1
691
+ });
692
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ dilation
693
+ problem_shapes.push_back({
694
+ cutlass::conv::Mode::kCrossCorrelation,
695
+ {2, 16, 16, 32},
696
+ {256, 2, 5, 32},
697
+ {1, 1},
698
+ {0, 0},
699
+ {1, 1},
700
+ {2, 3},
701
+ 1
702
+ });
703
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ stride, w/ dilation
704
+ problem_shapes.push_back({
705
+ cutlass::conv::Mode::kCrossCorrelation,
706
+ {2, 16, 15, 32},
707
+ {256, 2, 5, 32},
708
+ {1, 1},
709
+ {0, 0},
710
+ {2, 3},
711
+ {2, 3},
712
+ 1
713
+ });
714
+ // To test streamk, equals to gemm-MxNxK size 128x640x2048
715
+ problem_shapes.push_back({
716
+ cutlass::conv::Mode::kCrossCorrelation,
717
+ {2, 64, 16, 128},
718
+ {640, 1, 1, 128},
719
+ {0, 0},
720
+ {0, 0},
721
+ {1, 1},
722
+ {1, 1},
723
+ 1
724
+ });
725
+ // To test streamk, equals to gemm-MxNxK size 128x640x2080
726
+ problem_shapes.push_back({
727
+ cutlass::conv::Mode::kCrossCorrelation,
728
+ {2, 65, 16, 128},
729
+ {640, 1, 1, 128},
730
+ {0, 0},
731
+ {0, 0},
732
+ {1, 1},
733
+ {1, 1},
734
+ 1
735
+ });
736
+ return problem_shapes;
737
+ }
738
+
739
+ // Specialization for 3D wgrad problems
740
+ template<>
741
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 3>> inline
742
+ get_conv_problem_vector<3, cutlass::conv::Operator::kWgrad>() {
743
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 3>;
744
+ std::vector<ProblemShape> problem_shapes;
745
+ problem_shapes.push_back({
746
+ cutlass::conv::Mode::kCrossCorrelation,
747
+ {2, 1, 8, 8, 64}, // ndhwc
748
+ {64, 1, 1, 1, 64}, // ktrsc
749
+ {0, 0, 0}, // padding lower (pad_d, pad_h, pad_w)
750
+ {0, 0, 0}, // padding upper (pad_d, pad_h, pad_w)
751
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
752
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
753
+ 1 // group
754
+ });
755
+ // Filter 3x3x3 + no padding
756
+ problem_shapes.push_back({
757
+ cutlass::conv::Mode::kCrossCorrelation,
758
+ {2, 3, 5, 8, 32},
759
+ {96, 3, 3, 3, 32},
760
+ {0, 0, 0},
761
+ {0, 0, 0},
762
+ {1, 1, 1},
763
+ {1, 1, 1},
764
+ 1
765
+ });
766
+ // Filter 3x4x5 + asymmetric padding 102/010
767
+ problem_shapes.push_back({
768
+ cutlass::conv::Mode::kCrossCorrelation,
769
+ {2, 3, 5, 8, 32},
770
+ {96, 3, 4, 5, 32},
771
+ {1, 0, 1},
772
+ {0, 2, 0},
773
+ {1, 1, 1},
774
+ {1, 1, 1},
775
+ 1
776
+ });
777
+ // Filter 3x4x5 + asymmetric padding 102/010, w/ stride
778
+ problem_shapes.push_back({
779
+ cutlass::conv::Mode::kCrossCorrelation,
780
+ {2, 16, 10, 16, 32},
781
+ {96, 3, 4, 5, 32},
782
+ {1, 0, 1},
783
+ {0, 2, 0},
784
+ {2, 2, 3},
785
+ {1, 1, 1},
786
+ 1
787
+ });
788
+ // Filter 3x4x5 + asymmetric padding 102/010, w/ dilation
789
+ problem_shapes.push_back({
790
+ cutlass::conv::Mode::kCrossCorrelation,
791
+ {2, 16, 10, 16, 32},
792
+ {96, 3, 4, 5, 32},
793
+ {1, 0, 1},
794
+ {0, 2, 0},
795
+ {1, 1, 1},
796
+ {2, 2, 3},
797
+ 1
798
+ });
799
+ // To test streamk, equals to gemm-MxNxK size 128x640x2048
800
+ problem_shapes.push_back({
801
+ cutlass::conv::Mode::kCrossCorrelation,
802
+ {2, 1, 64, 16, 128},
803
+ {640, 1, 1, 1, 128},
804
+ {0, 0, 0},
805
+ {0, 0, 0},
806
+ {1, 1, 1},
807
+ {1, 1, 1},
808
+ 1
809
+ });
810
+ // To test streamk, equals to gemm-MxNxK size 128x640x2080
811
+ problem_shapes.push_back({
812
+ cutlass::conv::Mode::kCrossCorrelation,
813
+ {2, 1, 65, 16, 128},
814
+ {640, 1, 1, 1, 128},
815
+ {0, 0, 0},
816
+ {0, 0, 0},
817
+ {1, 1, 1},
818
+ {1, 1, 1},
819
+ 1
820
+ });
821
+ return problem_shapes;
822
+ }
823
+
824
+ /////////////////////////////////////////////////////////////////////////////////////////////////
825
+ // Grouped Wgrad
826
+ /////////////////////////////////////////////////////////////////////////////////////////////////
827
+
828
+ // Get problem size vectors for group conv problems
829
+ template<int SpatialDim, cutlass::conv::Operator ConvOp>
830
+ std::vector<cutlass::conv::ConvProblemShape<ConvOp, SpatialDim>>
831
+ inline
832
+ get_grouped_conv_problem_vector(int GroupsPerTile);
833
+
834
+ // Specialization for 3D wgrad problems
835
+ template<>
836
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 3>> inline
837
+ get_grouped_conv_problem_vector<3, cutlass::conv::Operator::kWgrad>(int GroupsPerTile) {
838
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 3>;
839
+ std::vector<ProblemShape> problem_shapes;
840
+
841
+ if (GroupsPerTile == 1) {
842
+ // channel_per_group == 64
843
+ problem_shapes.push_back({
844
+ cutlass::conv::Mode::kCrossCorrelation,
845
+ {1, 1, 16, 16, 2048}, // ndhwc
846
+ {2048, 1, 3, 3, 64}, // ktrsc
847
+ {0, 1, 1}, // padding lower (pad_d, pad_h, pad_w)
848
+ {0, 1, 1}, // padding upper (pad_d, pad_h, pad_w)
849
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
850
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
851
+ 32 // groups
852
+ });
853
+ }
854
+ else if (GroupsPerTile == 2) {
855
+ // channel_per_group == 32
856
+ problem_shapes.push_back({
857
+ cutlass::conv::Mode::kCrossCorrelation,
858
+ {1, 1, 16, 16, 1024}, // ndhwc
859
+ {1024, 1, 3, 3, 32}, // ktrsc
860
+ {0, 1, 1}, // padding lower (pad_d, pad_h, pad_w)
861
+ {0, 1, 1}, // padding upper (pad_d, pad_h, pad_w)
862
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
863
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
864
+ 32 // groups
865
+ });
866
+ }
867
+ else if (GroupsPerTile == 4) {
868
+ // channel_per_group == 16
869
+ problem_shapes.push_back({
870
+ cutlass::conv::Mode::kCrossCorrelation,
871
+ {1, 1, 16, 16, 512}, // ndhwc
872
+ {512, 1, 3, 3, 16}, // ktrsc
873
+ {0, 1, 1}, // padding lower (pad_d, pad_h, pad_w)
874
+ {0, 1, 1}, // padding upper (pad_d, pad_h, pad_w)
875
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
876
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
877
+ 32 // groups
878
+ });
879
+ }
880
+ else if (GroupsPerTile == 8) {
881
+ // channel_per_group == 8
882
+ problem_shapes.push_back({
883
+ cutlass::conv::Mode::kCrossCorrelation,
884
+ {1, 1, 16, 16, 256}, // ndhwc
885
+ {256, 1, 3, 3, 8}, // ktrsc
886
+ {0, 1, 1}, // padding lower (pad_d, pad_h, pad_w)
887
+ {0, 1, 1}, // padding upper (pad_d, pad_h, pad_w)
888
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
889
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
890
+ 32 // groups
891
+ });
892
+ }
893
+ return problem_shapes;
894
+ }
895
+
896
+ /////////////////////////////////////////////////////////////////////////////////////////////////
897
+ // Unit Stride Dgrad
898
+ /////////////////////////////////////////////////////////////////////////////////////////////////
899
+
900
+ // Specialization for 1D dgrad problems
901
+ template<>
902
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 1>> inline
903
+ get_conv_problem_vector<1, cutlass::conv::Operator::kDgrad, false>() {
904
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 1>;
905
+ std::vector<ProblemShape> problem_shapes;
906
+ problem_shapes.push_back({
907
+ cutlass::conv::Mode::kCrossCorrelation,
908
+ {1, 8, 64}, // nqk
909
+ {64, 1, 64}, // ksc
910
+ {0}, // padding lower (pad_w)
911
+ {0}, // padding upper (pad_w)
912
+ {1}, // stride (stride_w)
913
+ {1}, // dilation (dilation_w)
914
+ 1 // group
915
+ });
916
+ // non-packed input strides.
917
+ problem_shapes.push_back({
918
+ cutlass::conv::Mode::kCrossCorrelation,
919
+ {1, 8, 64}, // nqk
920
+ {800, 80, 1}, // stride (nqk)
921
+ {64, 1, 64}, // ksc
922
+ {64, 64, 1}, // stride (ksc)
923
+ {0}, // padding lower (pad_w)
924
+ {0}, // padding upper (pad_w)
925
+ {1}, // stride (stride_w)
926
+ {1}, // dilation (dilation_w)
927
+ 1 // group
928
+ });
929
+ // non-packed output strides.
930
+ problem_shapes.push_back({
931
+ cutlass::conv::Mode::kCrossCorrelation,
932
+ {1, 8, 64}, // nqk
933
+ {512, 64, 1}, // stride (nqk)
934
+ {64, 1, 64}, // ksc
935
+ {64, 64, 1}, // stride (ksc)
936
+ {800, 80, 1}, // stride (nwc)
937
+ {0}, // padding lower (pad_w)
938
+ {0}, // padding upper (pad_w)
939
+ {1}, // stride (stride_w)
940
+ {1}, // dilation (dilation_w)
941
+ 1 // group
942
+ });
943
+ // Filter-K = 16 for predication
944
+ problem_shapes.push_back({
945
+ cutlass::conv::Mode::kCrossCorrelation,
946
+ {1, 8, 16},
947
+ {64, 1, 16},
948
+ {0},
949
+ {0},
950
+ {1},
951
+ {1},
952
+ 1
953
+ });
954
+ // N = 2 and K = 128 for a larger grid
955
+ problem_shapes.push_back({
956
+ cutlass::conv::Mode::kCrossCorrelation,
957
+ {2, 8, 96},
958
+ {64, 1, 96},
959
+ {0},
960
+ {0},
961
+ {1},
962
+ {1},
963
+ 1
964
+ });
965
+ // N = 7 and K = 256 for a even larger grid
966
+ problem_shapes.push_back({
967
+ cutlass::conv::Mode::kCrossCorrelation,
968
+ {7, 8, 256},
969
+ {64, 1, 256},
970
+ {0},
971
+ {0},
972
+ {1},
973
+ {1},
974
+ 1
975
+ });
976
+ // 3 filter, no padding
977
+ problem_shapes.push_back({
978
+ cutlass::conv::Mode::kCrossCorrelation,
979
+ {2, 8, 256},
980
+ {64, 3, 256},
981
+ {0},
982
+ {0},
983
+ {1},
984
+ {1},
985
+ 1
986
+ });
987
+ // 3 filter, symmetric padding with k % cta_k !=0
988
+ problem_shapes.push_back({
989
+ cutlass::conv::Mode::kCrossCorrelation,
990
+ {2, 8, 256},
991
+ {32, 3, 256},
992
+ {1},
993
+ {1},
994
+ {1},
995
+ {1},
996
+ 1
997
+ });
998
+ // 4 filter, asymmetric padding
999
+ problem_shapes.push_back({
1000
+ cutlass::conv::Mode::kCrossCorrelation,
1001
+ {2, 8, 256},
1002
+ {64, 4, 256},
1003
+ {0},
1004
+ {1},
1005
+ {1},
1006
+ {1},
1007
+ 1
1008
+ });
1009
+ // 3 filter, asymmetric padding and dilation of 2
1010
+ problem_shapes.push_back({
1011
+ cutlass::conv::Mode::kCrossCorrelation,
1012
+ {2, 16, 64},
1013
+ {256, 3, 64},
1014
+ {0},
1015
+ {1},
1016
+ {1},
1017
+ {2},
1018
+ 1
1019
+ });
1020
+ return problem_shapes;
1021
+ }
1022
+
1023
+ // Specialization for 2D dgrad problems
1024
+ template<>
1025
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 2>> inline
1026
+ get_conv_problem_vector<2, cutlass::conv::Operator::kDgrad, false>() {
1027
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 2>;
1028
+ std::vector<ProblemShape> problem_shapes;
1029
+ problem_shapes.push_back({
1030
+ cutlass::conv::Mode::kCrossCorrelation,
1031
+ {1, 8, 8, 64}, // npqk
1032
+ {64, 1, 1, 64}, // krsc
1033
+ {0, 0}, // padding lower (pad_h, pad_w)
1034
+ {0, 0}, // padding upper (pad_h, pad_w)
1035
+ {1, 1}, // stride (stride_h, stride_w)
1036
+ {1, 1}, // dilation (dilation_h, dilation_w)
1037
+ 1 // group
1038
+ });
1039
+ // non-packed input strides.
1040
+ problem_shapes.push_back({
1041
+ cutlass::conv::Mode::kCrossCorrelation,
1042
+ {1, 8, 8, 64}, // npqk
1043
+ {8000, 800, 80, 1}, // stride (npqk)
1044
+ {64, 1, 1, 64}, // krsc
1045
+ {64, 64, 64, 1}, // stride (krsc)
1046
+ {0, 0}, // padding lower (pad_h, pad_w)
1047
+ {0, 0}, // padding upper (pad_h, pad_w)
1048
+ {1, 1}, // stride (stride_h, stride_w)
1049
+ {1, 1}, // dilation (dilation_h, dilation_w)
1050
+ 1 // group
1051
+ });
1052
+ // non-packed output strides.
1053
+ problem_shapes.push_back({
1054
+ cutlass::conv::Mode::kCrossCorrelation,
1055
+ {1, 8, 8, 64}, // npqk
1056
+ {4096, 512, 64, 1}, // stride (npqk)
1057
+ {64, 1, 1, 64}, // krsc
1058
+ {64, 64, 64, 1}, // stride (krsc)
1059
+ {8000, 800, 80, 1}, // stride (nhwc)
1060
+ {0, 0}, // padding lower (pad_h, pad_w)
1061
+ {0, 0}, // padding upper (pad_h, pad_w)
1062
+ {1, 1}, // stride (stride_h, stride_w)
1063
+ {1, 1}, // dilation (dilation_h, dilation_w)
1064
+ 1 // group
1065
+ });
1066
+ // Filter-K = 16 for predication
1067
+ problem_shapes.push_back({
1068
+ cutlass::conv::Mode::kCrossCorrelation,
1069
+ {1, 8, 8, 16},
1070
+ {64, 1, 1, 16},
1071
+ {0, 0},
1072
+ {0, 0},
1073
+ {1, 1},
1074
+ {1, 1},
1075
+ 1
1076
+ });
1077
+ // N = 2 and K = 128 for a larger grid
1078
+ problem_shapes.push_back({
1079
+ cutlass::conv::Mode::kCrossCorrelation,
1080
+ {2, 8, 8, 96},
1081
+ {64, 1, 1, 96},
1082
+ {0, 0},
1083
+ {0, 0},
1084
+ {1, 1},
1085
+ {1, 1},
1086
+ 1
1087
+ });
1088
+ // N = 7 and K = 256 for a even larger grid
1089
+ problem_shapes.push_back({
1090
+ cutlass::conv::Mode::kCrossCorrelation,
1091
+ {7, 8, 8, 256},
1092
+ {64, 1, 1, 256},
1093
+ {0, 0},
1094
+ {0, 0},
1095
+ {1, 1},
1096
+ {1, 1},
1097
+ 1
1098
+ });
1099
+ // 3x3 filter, no padding
1100
+ problem_shapes.push_back({
1101
+ cutlass::conv::Mode::kCrossCorrelation,
1102
+ {2, 8, 8, 256},
1103
+ {64, 3, 3, 256},
1104
+ {0, 0},
1105
+ {0, 0},
1106
+ {1, 1},
1107
+ {1, 1},
1108
+ 1
1109
+ });
1110
+ // 3x3 filter, symmetric padding with k % cta_k !=0
1111
+ problem_shapes.push_back({
1112
+ cutlass::conv::Mode::kCrossCorrelation,
1113
+ {2, 8, 8, 256},
1114
+ {32, 3, 3, 256},
1115
+ {1, 1},
1116
+ {1, 1},
1117
+ {1, 1},
1118
+ {1, 1},
1119
+ 1
1120
+ });
1121
+ // 2x5 filter, asymmetric padding 1,0/1,0
1122
+ problem_shapes.push_back({
1123
+ cutlass::conv::Mode::kCrossCorrelation,
1124
+ {2, 8, 8, 256},
1125
+ {64, 2, 5, 256},
1126
+ {1, 1},
1127
+ {0, 0},
1128
+ {1, 1},
1129
+ {1, 1},
1130
+ 1
1131
+ });
1132
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ dilation
1133
+ problem_shapes.push_back({
1134
+ cutlass::conv::Mode::kCrossCorrelation,
1135
+ {2, 16, 16, 64},
1136
+ {256, 2, 5, 64},
1137
+ {1, 1},
1138
+ {0, 0},
1139
+ {1, 1},
1140
+ {2, 3},
1141
+ 1
1142
+ });
1143
+ return problem_shapes;
1144
+ }
1145
+
1146
+ // Specialization for 3D dgrad problems
1147
+ template<>
1148
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 3>> inline
1149
+ get_conv_problem_vector<3, cutlass::conv::Operator::kDgrad, false>() {
1150
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 3>;
1151
+ std::vector<ProblemShape> problem_shapes;
1152
+ // Filter-K = 16 for predication
1153
+ problem_shapes.push_back({
1154
+ cutlass::conv::Mode::kCrossCorrelation,
1155
+ {1, 1, 8, 8, 16},
1156
+ {64, 1, 1, 1, 16},
1157
+ {0, 0, 0},
1158
+ {0, 0, 0},
1159
+ {1, 1, 1},
1160
+ {1, 1, 1},
1161
+ 1
1162
+ });
1163
+ // non-packed input output strides.
1164
+ problem_shapes.push_back({
1165
+ cutlass::conv::Mode::kCrossCorrelation,
1166
+ {1, 1, 8, 8, 64}, // nzpqk
1167
+ {8000, 8000, 800, 80, 1}, // stride (nzpqk)
1168
+ {64, 1, 1, 1, 64}, // ktrsc
1169
+ {64, 64, 64, 64, 1}, // stride (ktrsc)
1170
+ {8000, 8000, 800, 80, 1}, // stride (ndhwc)
1171
+ {0, 0, 0}, // padding lower (pad_d, pad_h, pad_w)
1172
+ {0, 0, 0}, // padding upper (pad_d, pad_h, pad_w)
1173
+ {1, 1, 1}, // stride (stride_d, stride_h, stride_w)
1174
+ {1, 1, 1}, // dilation (dilation_d, dilation_h, dilation_w)
1175
+ 1 // group
1176
+ });
1177
+ // N = 7 and K = 256 for a larger grid
1178
+ problem_shapes.push_back({
1179
+ cutlass::conv::Mode::kCrossCorrelation,
1180
+ {2, 1, 8, 8, 96},
1181
+ {64, 1, 1, 1, 96},
1182
+ {0, 0, 0},
1183
+ {0, 0, 0},
1184
+ {1, 1, 1},
1185
+ {1, 1, 1},
1186
+ 1
1187
+ });
1188
+ // Filter 3x4x5 + symmetric padding 111
1189
+ problem_shapes.push_back({
1190
+ cutlass::conv::Mode::kCrossCorrelation,
1191
+ {2, 3, 5, 8, 96},
1192
+ {64, 3, 4, 5, 96},
1193
+ {1, 1, 1},
1194
+ {1, 1, 1},
1195
+ {1, 1, 1},
1196
+ {1, 1, 1},
1197
+ 1
1198
+ });
1199
+ // Filter 3x4x5 + asymmetric padding 102/010
1200
+ problem_shapes.push_back({
1201
+ cutlass::conv::Mode::kCrossCorrelation,
1202
+ {2, 3, 5, 8, 96},
1203
+ {64, 3, 4, 5, 96},
1204
+ {1, 0, 1},
1205
+ {0, 2, 0},
1206
+ {1, 1, 1},
1207
+ {1, 1, 1},
1208
+ 1
1209
+ });
1210
+ // Filter 3x4x5 + asymmetric padding 102/010, w/ dilation
1211
+ problem_shapes.push_back({
1212
+ cutlass::conv::Mode::kCrossCorrelation,
1213
+ {2, 16, 10, 16, 64},
1214
+ {64, 3, 4, 5, 96},
1215
+ {1, 0, 1},
1216
+ {0, 2, 0},
1217
+ {1, 1, 1},
1218
+ {2, 2, 3},
1219
+ 1
1220
+ });
1221
+ return problem_shapes;
1222
+ }
1223
+
1224
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1225
+ // Strided Dgrad
1226
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1227
+
1228
+ // Specialization for 1D dgrad problems
1229
+ template<>
1230
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 1>> inline
1231
+ get_conv_problem_vector<1, cutlass::conv::Operator::kDgrad, true>() {
1232
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 1>;
1233
+ std::vector<ProblemShape> problem_shapes;
1234
+ // Test TMA truncation
1235
+ problem_shapes.push_back({
1236
+ cutlass::conv::Mode::kCrossCorrelation,
1237
+ {1, 512, 64}, // nqk
1238
+ {64, 1, 64}, // ksc
1239
+ {0}, // padding lower (pad_w)
1240
+ {0}, // padding upper (pad_w)
1241
+ {2}, // stride (stride_w)
1242
+ {1}, // dilation (dilation_w)
1243
+ 1 // group
1244
+ });
1245
+ problem_shapes.push_back({
1246
+ cutlass::conv::Mode::kCrossCorrelation,
1247
+ {1, 1024, 64}, // nqk
1248
+ {64, 1, 64}, // ksc
1249
+ {0}, // padding lower (pad_w)
1250
+ {0}, // padding upper (pad_w)
1251
+ {4}, // stride (stride_w)
1252
+ {1}, // dilation (dilation_w)
1253
+ 1 // group
1254
+ });
1255
+ problem_shapes.push_back({
1256
+ cutlass::conv::Mode::kCrossCorrelation,
1257
+ {1, 2048, 64}, // nqk
1258
+ {64, 1, 64}, // ksc
1259
+ {0}, // padding lower (pad_w)
1260
+ {0}, // padding upper (pad_w)
1261
+ {8}, // stride (stride_w)
1262
+ {1}, // dilation (dilation_w)
1263
+ 1 // group
1264
+ });
1265
+ // non-packed input/output strides.
1266
+ // stride divides dilation
1267
+ // asymmetric padding
1268
+ problem_shapes.push_back({
1269
+ cutlass::conv::Mode::kCrossCorrelation,
1270
+ {3, 8, 64}, // nqk
1271
+ {800, 80, 1}, // stride (nqk)
1272
+ {64, 3, 64}, // ksc
1273
+ {64, 64, 1}, // stride (ksc)
1274
+ {800, 80, 1}, // stride (nwc)
1275
+ {0}, // padding lower (pad_w)
1276
+ {1}, // padding upper (pad_w)
1277
+ {2}, // stride (stride_w)
1278
+ {4}, // dilation (dilation_w)
1279
+ 1 // group
1280
+ });
1281
+ // non-packed input/output strides.
1282
+ // dilation divides stride
1283
+ // asymmetric padding
1284
+ problem_shapes.push_back({
1285
+ cutlass::conv::Mode::kCrossCorrelation,
1286
+ {3, 8, 64}, // nqk
1287
+ {800, 80, 1}, // stride (nqk)
1288
+ {64, 3, 64}, // ksc
1289
+ {64, 64, 1}, // stride (ksc)
1290
+ {800, 80, 1}, // stride (nwc)
1291
+ {1}, // padding lower (pad_w)
1292
+ {0}, // padding upper (pad_w)
1293
+ {4}, // stride (stride_w)
1294
+ {2}, // dilation (dilation_w)
1295
+ 1 // group
1296
+ });
1297
+ // non-packed input/output strides.
1298
+ // stride dilation dont divide
1299
+ // asymmetric padding
1300
+ problem_shapes.push_back({
1301
+ cutlass::conv::Mode::kCrossCorrelation,
1302
+ {3, 8, 64}, // nqk
1303
+ {800, 80, 1}, // stride (nqk)
1304
+ {64, 3, 64}, // ksc
1305
+ {64, 64, 1}, // stride (ksc)
1306
+ {800, 80, 1}, // stride (nwc)
1307
+ {1}, // padding lower (pad_w)
1308
+ {2}, // padding upper (pad_w)
1309
+ {2}, // stride (stride_w)
1310
+ {3}, // dilation (dilation_w)
1311
+ 1 // group
1312
+ });
1313
+ return problem_shapes;
1314
+ }
1315
+
1316
+ // Specialization for 2D dgrad problems
1317
+ template<>
1318
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 2>> inline
1319
+ get_conv_problem_vector<2, cutlass::conv::Operator::kDgrad, true>() {
1320
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 2>;
1321
+ std::vector<ProblemShape> problem_shapes;
1322
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ dilation
1323
+ // mode 0 stride divides dilation
1324
+ // mode 1 dilation divides stride
1325
+ problem_shapes.push_back({
1326
+ cutlass::conv::Mode::kCrossCorrelation,
1327
+ {3, 16, 16, 64},
1328
+ {256, 2, 5, 64},
1329
+ {1, 0},
1330
+ {0, 1},
1331
+ {2, 4},
1332
+ {4, 2},
1333
+ 1
1334
+ });
1335
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ dilation
1336
+ // mode 0 dilation divides stride
1337
+ // mode 1 stride divides dilation
1338
+ problem_shapes.push_back({
1339
+ cutlass::conv::Mode::kCrossCorrelation,
1340
+ {3, 16, 16, 64},
1341
+ {256, 2, 5, 64},
1342
+ {1, 0},
1343
+ {0, 1},
1344
+ {4, 2},
1345
+ {2, 4},
1346
+ 1
1347
+ });
1348
+ // 2x5 filter, asymmetric padding 1,0/1,0, w/ dilation
1349
+ // stride dilation dont divide
1350
+ problem_shapes.push_back({
1351
+ cutlass::conv::Mode::kCrossCorrelation,
1352
+ {3, 16, 16, 64},
1353
+ {256, 2, 5, 64},
1354
+ {1, 0},
1355
+ {0, 1},
1356
+ {3, 2},
1357
+ {2, 3},
1358
+ 1
1359
+ });
1360
+ return problem_shapes;
1361
+ }
1362
+
1363
+ // Specialization for 3D dgrad problems
1364
+ template<>
1365
+ std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 3>> inline
1366
+ get_conv_problem_vector<3, cutlass::conv::Operator::kDgrad, true>() {
1367
+ using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kDgrad, 3>;
1368
+ std::vector<ProblemShape> problem_shapes;
1369
+ // Filter 3x4x5 + asymmetric padding 102/010, w/ dilation
1370
+ problem_shapes.push_back({
1371
+ cutlass::conv::Mode::kCrossCorrelation,
1372
+ {2, 16, 10, 16, 64},
1373
+ {64, 3, 4, 5, 96},
1374
+ {1, 0, 1},
1375
+ {0, 2, 0},
1376
+ {2, 1, 2},
1377
+ {4, 2, 3},
1378
+ 1
1379
+ });
1380
+ return problem_shapes;
1381
+ }
1382
+
1383
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1384
+
1385
+ } // namespace cutlass::test
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/conv/device_3x/testbed_conv.hpp ADDED
@@ -0,0 +1,768 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implicit GEMM testbed for 3.x API
33
+ */
34
+ #pragma once
35
+
36
+ #include "cutlass/cutlass.h"
37
+ #include "../../common/cutlass_unit_test.h"
38
+
39
+ #include "cute/tensor.hpp"
40
+ #include "cutlass/kernel_hardware_info.hpp"
41
+ #include "cutlass/conv/convolution.h"
42
+ #include "cutlass/conv/convnd_problem_shape.hpp"
43
+ #include "../test/unit/gemm/device/gemm_testbed_3x.hpp"
44
+
45
+ #include "thrust/universal_vector.h"
46
+ #include "cutlass/util/distribution.h"
47
+ #include "cutlass/util/host_tensor.h"
48
+ #include "cutlass/util/tensor_view_io.h"
49
+ #include "cutlass/util/packed_stride.hpp"
50
+ #include "cutlass/util/reference/host/conv.hpp"
51
+ #include "cutlass/util/reference/host/tensor_fill.h"
52
+ #include "cutlass/util/reference/host/tensor_copy.h"
53
+ #include "cutlass/util/reference/host/tensor_compare.h"
54
+ #include "cutlass/util/reference/host/tensor_norm.h"
55
+ #include "cutlass/util/reference/device/tensor_fill.h"
56
+ #include "cutlass/util/reference/device/tensor_compare.h"
57
+ #include "conv_problem_sizes.hpp"
58
+ #include "../cache_testbed_output.h"
59
+
60
+ #include <iostream>
61
+
62
+ #include "cute/layout.hpp"
63
+ /////////////////////////////////////////////////////////////////////////////////////////////////
64
+
65
+ namespace test::conv::device {
66
+
67
+ /////////////////////////////////////////////////////////////////////////////////////////////////
68
+
69
+ // Initializes a flat device buffer
70
+ template <typename Element>
71
+ static void
72
+ initialize_values(
73
+ thrust::universal_vector<Element>& dst_ptr,
74
+ cutlass::Distribution::Kind dist_kind,
75
+ uint64_t seed) {
76
+ if (cutlass::Distribution::Uniform == dist_kind) {
77
+ int scope;
78
+ int bits = cutlass::sizeof_bits<Element>::value;
79
+
80
+ if (bits <= 8) {
81
+ scope = 2;
82
+ }
83
+ else if (bits == 16) {
84
+ scope = 4;
85
+ }
86
+ else {
87
+ scope = 8;
88
+ }
89
+ cutlass::reference::host::BlockFillRandomUniform(
90
+ dst_ptr.data().get(), dst_ptr.size(), seed, scope, -scope, 0);
91
+ }
92
+ else if (cutlass::Distribution::Identity == dist_kind) {
93
+ cutlass::reference::host::BlockFillRandomUniform(
94
+ dst_ptr.data().get(), dst_ptr.size(), seed, 0, 0, 0);
95
+ }
96
+ else if (cutlass::Distribution::Gaussian == dist_kind) {
97
+ cutlass::reference::host::BlockFillRandomGaussian(dst_ptr.data().get(), dst_ptr.size(), seed, 0, 0.5);
98
+ }
99
+ else if (cutlass::Distribution::Sequential == dist_kind) {
100
+ cutlass::reference::host::BlockFillSequential(dst_ptr.data().get(), dst_ptr.size());
101
+ }
102
+ else {
103
+ std::cerr << "Invalid distribution kind!\n.";
104
+ exit(1);
105
+ }
106
+ }
107
+
108
+ /////////////////////////////////////////////////////////////////////////////////////////////////
109
+ // utils for sparse or dense conv parameters
110
+
111
+ template <class Conv>
112
+ struct DenseConvParams {
113
+ // Default Kernel data types
114
+ using ElementA = typename Conv::ConvKernel::ElementA;
115
+ using ElementB = typename Conv::ConvKernel::ElementB;
116
+
117
+ static constexpr cutlass::conv::Operator ConvOp = Conv::DispatchPolicy::ConvOp;
118
+ static constexpr int NumSpatialDimensions = Conv::NumSpatialDimensions;
119
+ using ProblemShape = cutlass::conv::ConvProblemShape<ConvOp, NumSpatialDimensions>;
120
+
121
+ // get the default arguments without sparse data
122
+ auto get_mainloop_arguments(
123
+ [[maybe_unused]] ProblemShape const& problem_shape,
124
+ thrust::universal_vector<ElementA>& tensor_A,
125
+ thrust::universal_vector<ElementB>& tensor_B
126
+ ) {
127
+ auto args = typename Conv::ConvKernel::MainloopArguments {
128
+ tensor_A.data().get(),
129
+ tensor_B.data().get(),
130
+ };
131
+ return args;
132
+ }
133
+ };
134
+
135
+ template <class Conv>
136
+ struct SparseConvParams {
137
+ };
138
+
139
+ /////////////////////////////////////////////////////////////////////////////////////////////////
140
+ template <class Conv, bool isSparseEnabled_ = false>
141
+ struct ConvTestbed {
142
+ // Kernel data types
143
+ using ElementA = typename Conv::ConvKernel::ElementA;
144
+ using ElementB = typename Conv::ConvKernel::ElementB;
145
+ using ElementC = cute::conditional_t<cute::is_void_v<typename Conv::ConvKernel::ElementC>,
146
+ typename Conv::ConvKernel::ElementD, typename Conv::ConvKernel::ElementC>;
147
+ using ElementD = typename Conv::ConvKernel::ElementD;
148
+ using ElementAccumulator = typename Conv::ConvKernel::ElementAccumulator;
149
+
150
+ // ConvTest for sparse kernel
151
+ static constexpr bool isSparseEnabled = isSparseEnabled_;
152
+ using ConvParams = cute::conditional_t<isSparseEnabled, SparseConvParams<Conv>, DenseConvParams<Conv>>;
153
+ ConvParams params;
154
+
155
+ //
156
+ // FusionOperation derived types/queries
157
+ //
158
+ using FusionOp = typename Conv::EpilogueOutputOp;
159
+
160
+ // fusion types are potentially void if the fusion is not supported
161
+ // helper so we don't try to construct HostTensor with void type
162
+ template <typename T, typename U = uint8_t>
163
+ using non_void_t = cute::conditional_t<cute::is_void_v<T>, U, T>;
164
+ using ElementScalar = typename FusionOp::ElementScalar;
165
+ using ElementCompute = typename FusionOp::ElementCompute;
166
+ using BiasType = typename cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithBias<FusionOp>::type;
167
+ using ElementBias = non_void_t<BiasType>;
168
+ using ActivationType = non_void_t<typename cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithActivation<FusionOp>::type,
169
+ cutlass::epilogue::thread::Identity<ElementCompute>>;
170
+ static constexpr bool IsActivationEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithActivation<FusionOp>::value;
171
+ using ActivationFunctor = cute::conditional_t<IsActivationEnabled, ActivationType, cutlass::epilogue::thread::Identity<ElementCompute>>;
172
+
173
+ static constexpr bool IsBiasEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithBias<FusionOp>::value &&
174
+ !cute::is_same_v<BiasType, void>;
175
+ static constexpr bool IsPerChannelScaleEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithPerChannelScaled<FusionOp>::value;
176
+
177
+ static constexpr bool DisableSource = cute::is_void_v<typename FusionOp::ElementSource>;
178
+
179
+ static constexpr bool IsResidualEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithResidualAdd<FusionOp>::value;
180
+
181
+ using StrideC = typename Conv::ConvKernel::StrideC;
182
+ using StrideD = typename Conv::ConvKernel::StrideD;
183
+ using ThreadEpilogueOp = typename Conv::ConvKernel::CollectiveEpilogue::ThreadEpilogueOp;
184
+
185
+ static constexpr cutlass::conv::Operator ConvOp = Conv::DispatchPolicy::ConvOp;
186
+ static constexpr int NumSpatialDimensions = Conv::NumSpatialDimensions;
187
+ using ProblemShape = cutlass::conv::ConvProblemShape<ConvOp, NumSpatialDimensions>;
188
+ using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90::RasterOrderOptions;
189
+ using DecompositionMode = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
190
+ using MaxSwizzleSize = typename gemm::device::detail::MaxSwizzleSize;
191
+ using Splits = typename gemm::device::detail::Splits;
192
+
193
+ using Schedule = typename Conv::DispatchPolicy::Schedule;
194
+ /// Initialization
195
+ cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform;
196
+ cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform;
197
+ cutlass::Distribution::Kind init_C = cutlass::Distribution::Uniform;
198
+ cutlass::Distribution::Kind init_bias = cutlass::Distribution::Uniform;
199
+ cutlass::Distribution::Kind init_disable = cutlass::Distribution::Identity; // all zeros
200
+ uint64_t seed = 6090;
201
+ float epsilon = 0.0f;
202
+ int split_p_slices = 1;
203
+ thrust::universal_vector<ElementA> tensor_A;
204
+ thrust::universal_vector<ElementB> tensor_B;
205
+ thrust::universal_vector<ElementC> tensor_C;
206
+ thrust::universal_vector<ElementD> tensor_D_computed;
207
+ thrust::universal_vector<ElementD> tensor_D_reference;
208
+ thrust::universal_vector<ElementBias> tensor_bias;
209
+ thrust::universal_vector<ElementScalar> tensor_alpha;
210
+ thrust::universal_vector<ElementScalar> tensor_beta;
211
+
212
+ // Return true on success, else false
213
+ bool initialize(ProblemShape const& problem_shape, uint64_t seed = 6090) {
214
+ tensor_A.resize(sizeof(ElementA) * problem_shape.size_A());
215
+ tensor_B.resize(sizeof(ElementB) * problem_shape.size_B());
216
+ tensor_C.resize(sizeof(ElementC) * problem_shape.size_C());
217
+ tensor_D_computed.resize(sizeof(ElementD) * problem_shape.size_C());
218
+ tensor_D_reference.resize(sizeof(ElementD) * problem_shape.size_C());
219
+ tensor_bias.resize(sizeof(ElementBias) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
220
+ if constexpr (IsPerChannelScaleEnabled) {
221
+ tensor_alpha.resize(sizeof(ElementScalar) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
222
+ tensor_beta.resize(sizeof(ElementScalar) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
223
+ }
224
+ initialize_values(tensor_A, init_A, seed);
225
+ initialize_values(tensor_B, init_B, seed * 11);
226
+ initialize_values(tensor_C, init_C, seed * 17);
227
+ initialize_values(tensor_bias, init_bias, seed * 19);
228
+ if constexpr (IsPerChannelScaleEnabled) {
229
+ initialize_values(tensor_alpha, init_bias, seed * 23);
230
+ if constexpr (DisableSource) {
231
+ initialize_values(tensor_beta, init_disable, seed * 27);
232
+ }
233
+ else {
234
+ initialize_values(tensor_beta, init_bias, seed * 27);
235
+ }
236
+ }
237
+
238
+ bool flag = true;
239
+ if constexpr (isSparseEnabled) {
240
+ flag &= params.initialize(problem_shape, tensor_B, static_cast<int>(seed + 2023));
241
+ }
242
+
243
+ return flag;
244
+ }
245
+
246
+ // Determine SMEM requirements and waive if not satisfied
247
+ bool sufficient() const {
248
+ int device_idx;
249
+ cudaError_t result = cudaGetDevice(&device_idx);
250
+ if (result != cudaSuccess) {
251
+ throw std::runtime_error("cudaGetDevice() API call failed.");
252
+ }
253
+
254
+ int max_smem_size;
255
+ result = cudaDeviceGetAttribute(&max_smem_size, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx);
256
+ if (result != cudaSuccess) {
257
+ throw std::runtime_error("cudaDeviceGetAttribute() failed");
258
+ }
259
+
260
+ return max_smem_size >= Conv::ConvKernel::SharedStorageSize;
261
+ }
262
+
263
+ auto transform_shape_and_stride_with_groups(ProblemShape const& problem_shape) {
264
+ using TensorExtent = cute::array<int32_t, NumSpatialDimensions + 3>;
265
+ using TensorStride = cute::array<int64_t, NumSpatialDimensions + 3>;
266
+
267
+ TensorExtent shape_a_g{};
268
+ TensorExtent shape_b_g{};
269
+ TensorExtent shape_c_g{};
270
+ TensorStride stride_a_g{};
271
+ TensorStride stride_b_g{};
272
+ TensorStride stride_c_g{};
273
+
274
+ auto shape_a = cute::reverse(problem_shape.shape_A);
275
+ auto shape_b = cute::reverse(problem_shape.shape_B);
276
+ auto shape_c = cute::reverse(problem_shape.shape_C);
277
+ auto stride_a = cute::reverse(problem_shape.stride_A);
278
+ auto stride_b = cute::reverse(problem_shape.stride_B);
279
+ auto stride_c = cute::reverse(problem_shape.stride_C);
280
+
281
+ int32_t G = problem_shape.groups;
282
+
283
+ if constexpr (ConvOp == cutlass::conv::Operator::kFprop ||
284
+ ConvOp == cutlass::conv::Operator::kDgrad) {
285
+ // shape_a_g = (c,w,h,d,n,g) or (k,q,p,z,n,g)
286
+ // shape_b_g = (c,s,r,k,t,g)
287
+ // shape_c_g = (k,q,p,z,n,g) or (c,w,h,d,n,g)
288
+ shape_a_g = cute::to_array<int32_t>(tuple_cat(
289
+ cute::make_shape(cute::size<0>(shape_a) / G),
290
+ cute::take<1,NumSpatialDimensions + 2>(shape_a),
291
+ cute::make_shape(G)));
292
+ shape_b_g = cute::to_array<int32_t>(tuple_cat(
293
+ cute::take<0,NumSpatialDimensions + 1>(shape_b),
294
+ cute::make_shape(cute::size<NumSpatialDimensions + 1>(shape_b) / G, G)));
295
+ shape_c_g = cute::to_array<int32_t>(tuple_cat(
296
+ cute::make_shape(cute::size<0>(shape_c) / G),
297
+ cute::take<1,NumSpatialDimensions + 2>(shape_c),
298
+ cute::make_shape(G)));
299
+
300
+ stride_a_g = cute::to_array<int64_t>(append(stride_a, cute::size<0>(shape_a) / G));
301
+ stride_b_g = cute::to_array<int64_t>(append(stride_b,
302
+ cute::size<NumSpatialDimensions + 1>(stride_b) * cute::size<NumSpatialDimensions + 1>(shape_b) / G));
303
+ stride_c_g = cute::to_array<int64_t>(append(stride_c, cute::size<0>(shape_c) / G));
304
+ }
305
+ else if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
306
+ // shape_a_g = (k,q,p,z,n,g)
307
+ // shape_b_g = (c,w,h,d,n,g)
308
+ // shape_c_g = (c,s,r,k,t,g)
309
+ shape_a_g = cute::to_array<int32_t>(tuple_cat(
310
+ cute::make_shape(cute::size<0>(shape_a) / G),
311
+ cute::take<1,NumSpatialDimensions + 2>(shape_a),
312
+ cute::make_shape(G)));
313
+ shape_b_g = cute::to_array<int32_t>(tuple_cat(
314
+ cute::make_shape(cute::size<0>(shape_b) / G),
315
+ cute::take<1,NumSpatialDimensions + 2>(shape_b),
316
+ cute::make_shape(G)));
317
+ shape_c_g = cute::to_array<int32_t>(tuple_cat(
318
+ cute::take<0,NumSpatialDimensions + 1>(shape_c),
319
+ cute::make_shape(cute::size<NumSpatialDimensions + 1>(shape_c) / G, G)));
320
+
321
+ stride_a_g = cute::to_array<int64_t>(append(stride_a, cute::size<0>(shape_a) / G));
322
+ stride_b_g = cute::to_array<int64_t>(append(stride_b, cute::size<0>(shape_b) / G));
323
+ stride_c_g = cute::to_array<int64_t>(append(stride_c,
324
+ cute::size<NumSpatialDimensions + 1>(stride_c) * cute::size<NumSpatialDimensions + 1>(shape_c) / G));
325
+ }
326
+
327
+ return make_tuple(shape_a_g, shape_b_g, shape_c_g,
328
+ stride_a_g, stride_b_g, stride_c_g);
329
+ }
330
+
331
+ // Executes one test
332
+ bool run(
333
+ ProblemShape const& problem_shape,
334
+ ElementScalar alpha = ElementScalar(1),
335
+ ElementScalar beta = ElementScalar(0),
336
+ dim3 cluster_shape = dim3(0, 0, 0),
337
+ dim3 cluster_shape_fallback = dim3(0, 0, 0),
338
+ RasterOrderOptions raster_order = RasterOrderOptions::Heuristic,
339
+ MaxSwizzleSize max_swizzle = MaxSwizzleSize{},
340
+ Splits splits = Splits{},
341
+ DecompositionMode decomposition_mode = DecompositionMode::Heuristic
342
+ ) {
343
+
344
+ // Waive test if insufficient CUDA device
345
+ if (!sufficient()) {
346
+ if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
347
+ std::cerr << "Test waived due to insufficient CUDA device.\n";
348
+ }
349
+ return true;
350
+ }
351
+
352
+ bool ret = initialize(problem_shape);
353
+
354
+ if (!ret) {
355
+ std::cerr << "initialize failed for the given problem_shape: \n";
356
+ return false;
357
+ }
358
+
359
+ cutlass::KernelHardwareInfo hw_info;
360
+ cudaGetDevice(&hw_info.device_id);
361
+ hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
362
+
363
+ hw_info.cluster_shape = cluster_shape;
364
+ hw_info.cluster_shape_fallback = cluster_shape_fallback;
365
+
366
+ // configure the operator
367
+ Conv conv_op;
368
+ auto stride_C = StrideC{};
369
+ auto stride_D = StrideD{};
370
+ if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
371
+ stride_C = cutlass::make_cute_packed_stride(
372
+ StrideC{}, problem_shape.shape_C, problem_shape.stride_C, ConvOp);
373
+ stride_D = cutlass::make_cute_packed_stride(
374
+ StrideD{}, problem_shape.shape_C, problem_shape.stride_C, ConvOp);
375
+ }
376
+ // Need to support non-packed output strides for fprop and dgrad kernel.
377
+ else {
378
+ cute::for_each(cute::make_seq<cute::rank<0>(StrideC{})>{}, [&](auto i) {
379
+ cute::get<0, i>(stride_C) = problem_shape.stride_C[ProblemShape::RankT-2-i];
380
+ });
381
+ cute::for_each(cute::make_seq<cute::rank<0>(StrideD{})>{}, [&](auto i) {
382
+ cute::get<0, i>(stride_D) = problem_shape.stride_C[ProblemShape::RankT-2-i];
383
+ });
384
+ }
385
+
386
+ using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90::RasterOrderOptions;
387
+ using DecompositionMode = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
388
+
389
+ typename Conv::ConvKernel::TileScheduler::Arguments scheduler_args{};
390
+ if constexpr (cute::is_same_v<typename Conv::ConvKernel::TileSchedulerTag, cutlass::gemm::StreamKScheduler>) {
391
+ scheduler_args = { static_cast<int>(splits), static_cast<int>(max_swizzle), raster_order, decomposition_mode };
392
+ }
393
+
394
+ auto mainloop_args = params.get_mainloop_arguments(problem_shape, tensor_A, tensor_B);
395
+
396
+ auto epilogue_args = typename Conv::ConvKernel::EpilogueArguments {
397
+ {},
398
+ tensor_C.data().get(),
399
+ stride_C,
400
+ tensor_D_computed.data().get(),
401
+ stride_D,
402
+ };
403
+
404
+ auto args = typename Conv::Arguments {
405
+ problem_shape,
406
+ mainloop_args, // MainloopArguments
407
+ epilogue_args, // EpilogueArguments
408
+ hw_info,
409
+ scheduler_args
410
+ };
411
+
412
+ auto &fusion_args = args.epilogue.thread;
413
+
414
+ fusion_args.alpha = alpha;
415
+ fusion_args.beta = beta;
416
+
417
+ if constexpr (IsPerChannelScaleEnabled) {
418
+ fusion_args.alpha_ptr = tensor_alpha.data().get();
419
+ fusion_args.beta_ptr = tensor_beta.data().get();
420
+ }
421
+
422
+ if constexpr (IsBiasEnabled) {
423
+ fusion_args.bias_ptr = tensor_bias.data().get();
424
+ }
425
+
426
+ // Clamp bound
427
+ if constexpr (cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::Clamp<ElementCompute>>) {
428
+ fusion_args.activation.lower_bound = CUTLASS_STL_NAMESPACE::numeric_limits<ElementCompute>::lowest();
429
+ fusion_args.activation.upper_bound = CUTLASS_STL_NAMESPACE::numeric_limits<ElementCompute>::max();
430
+ }
431
+
432
+ // Scale
433
+ if constexpr (cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::ScaledGELU_taylor<ElementCompute>> ||
434
+ cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::ScaledGELU<ElementCompute>> ||
435
+ cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::ScaledSiLu<ElementCompute>> ||
436
+ cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::ScaledHardSwish<ElementCompute>> ) {
437
+ fusion_args.activation.scale = ElementCompute{1};
438
+ }
439
+
440
+ // LeakyRelu
441
+ if constexpr (cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::LeakyReLU<ElementCompute>> ) {
442
+ fusion_args.activation.leaky_alpha = ElementCompute{0};
443
+ }
444
+
445
+ cutlass::Status status = cutlass::Status::kInvalid;
446
+
447
+ status = conv_op.can_implement(args);
448
+ EXPECT_EQ(conv_op.can_implement(args), cutlass::Status::kSuccess);
449
+ if (status != cutlass::Status::kSuccess) {
450
+ std::cerr << "can_implement failed for the given problem_shape: \n";
451
+ print(problem_shape);
452
+ return false;
453
+ }
454
+
455
+ // find workspace requirement for parallel split-k reduction
456
+ size_t workspace_size = Conv::get_workspace_size(args);
457
+ thrust::universal_vector<uint8_t> workspace(workspace_size);
458
+
459
+ status = conv_op.initialize(args, workspace.data().get());
460
+ if (status != cutlass::Status::kSuccess) {
461
+ cudaError_t error = cudaGetLastError();
462
+ std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
463
+ return true;
464
+ }
465
+
466
+ // run conv3d operator
467
+ status = conv_op();
468
+
469
+ EXPECT_TRUE(status == cutlass::Status::kSuccess);
470
+ if (status != cutlass::Status::kSuccess) {
471
+ return false;
472
+ }
473
+
474
+ bool passed = false;
475
+ cudaError_t result = cudaDeviceSynchronize();
476
+ EXPECT_EQ(result, cudaSuccess) << " Kernel execution error: "
477
+ << cudaGetErrorString(result);
478
+
479
+ // Create cute::Tensors using the logical rank-3 MNK multi-mode shapes the mainloop gives us
480
+ auto [shape_mA, shape_mB, shape_mC, stride_mA, stride_mB, stride_mC] =
481
+ transform_shape_and_stride_with_groups(problem_shape);
482
+ auto shape_mBias = cute::make_shape(cute::size(cute::get<0>(problem_shape.get_shape_B())));
483
+
484
+ auto mA = make_tensor(tensor_A.data().get(), make_layout(shape_mA, stride_mA));
485
+ auto mB = make_tensor(tensor_B.data().get(), make_layout(shape_mB, stride_mB));
486
+ auto mC = make_tensor(tensor_C.data().get(), make_layout(shape_mC, stride_mC));
487
+ auto mD_ref = make_tensor(tensor_D_reference.data().get(), make_layout(shape_mC, stride_mC));
488
+ auto mD_computed = make_tensor(tensor_D_computed.data().get(), make_layout(shape_mC, stride_mC));
489
+ auto mBias = make_tensor(tensor_bias.data().get(), make_layout(shape_mBias));
490
+ auto mAlpha = make_tensor(tensor_alpha.data().get(), make_layout(shape_mBias));
491
+ auto mBeta = make_tensor(tensor_beta.data().get(), make_layout(shape_mBias));
492
+
493
+ cutlass::reference::host::ConvEpilogueFusionParams<
494
+ ElementAccumulator,
495
+ ElementScalar,
496
+ ElementCompute,
497
+ ElementC,
498
+ ElementD,
499
+ IsResidualEnabled,
500
+ decltype(mAlpha),
501
+ decltype(mBeta),
502
+ decltype(mBias),
503
+ ActivationFunctor>
504
+ epilogue_fusion_params{};
505
+
506
+ epilogue_fusion_params.alpha = alpha;
507
+ epilogue_fusion_params.beta = beta;
508
+
509
+ if constexpr (IsPerChannelScaleEnabled) {
510
+ epilogue_fusion_params.tensor_alpha = mAlpha;
511
+ epilogue_fusion_params.tensor_beta = mBeta;
512
+ }
513
+
514
+ if constexpr (IsBiasEnabled) {
515
+ epilogue_fusion_params.tensor_bias = mBias;
516
+ }
517
+
518
+ auto padding = cute::reverse(problem_shape.lower_padding);
519
+ auto tstride = cute::reverse(problem_shape.traversal_stride);
520
+ auto dilation = cute::reverse(problem_shape.dilation);
521
+
522
+ cutlass::reference::host::ConvReferenceImpl<
523
+ ConvOp,
524
+ NumSpatialDimensions,
525
+ decltype(mA),
526
+ decltype(mB),
527
+ decltype(mC),
528
+ decltype(mD_ref),
529
+ decltype(padding),
530
+ decltype(tstride),
531
+ decltype(dilation),
532
+ decltype(epilogue_fusion_params)>
533
+ reference_impl(mA, mB, mC, mD_ref, padding, tstride, dilation, epilogue_fusion_params);
534
+
535
+ //
536
+ // Reference check - support caching results
537
+ //
538
+
539
+ CachedTestKey cached_test_key = CreateCachedConvNd3xTestKey<
540
+ ProblemShape,
541
+ ElementA,
542
+ ElementB,
543
+ ElementC,
544
+ ElementD
545
+ >(
546
+ ConvOp,
547
+ problem_shape,
548
+ alpha,
549
+ beta,
550
+ tensor_A,
551
+ tensor_B,
552
+ tensor_C
553
+ );
554
+
555
+ //
556
+ // Look for the cached key
557
+ //
558
+
559
+ bool cached_result_loaded = false;
560
+ CachedTestResult cached_test_result;
561
+
562
+ std::string convnd_result_cache_name =
563
+ std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
564
+
565
+ #if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
566
+ CachedTestResultListing cached_results(convnd_result_cache_name);
567
+
568
+ auto cached = cached_results.find(cached_test_key);
569
+
570
+ cached_result_loaded = cached.first;
571
+ if (cached_result_loaded) {
572
+ cached_test_result = cached.second;
573
+ }
574
+ #endif
575
+
576
+ if (!cached_result_loaded) {
577
+ // Compute reference
578
+ reference_impl.compute_reference();
579
+
580
+ #if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
581
+ cached_test_result.D = TensorHash(tensor_D_reference);
582
+ CachedTestResultListing cached_results(convnd_result_cache_name);
583
+
584
+ cached_results.append(cached_test_key, cached_test_result);
585
+ cached_results.write(convnd_result_cache_name);
586
+ #endif
587
+ } // if (!cached_result_loaded)
588
+
589
+ #if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
590
+ uint32_t tensor_D_computed_hash = TensorHash(tensor_D_computed);
591
+ passed = (tensor_D_computed_hash == cached_test_result.D);
592
+ // If hash fails, double check against reference implementation.
593
+ if(!passed) {
594
+ std::cerr << "Hash-based comparison unsuccessful for key:" << "\n" << cached_test_key
595
+ << ", comparing with reference implementation now.\n";
596
+ if (cached_result_loaded) {
597
+ // Compute reference
598
+ reference_impl.compute_reference();
599
+ }
600
+ // Validate kernel against reference
601
+ passed = compare_reference(mD_ref, mD_computed, mA, mB, mAlpha, mBeta, mBias, this->epsilon);
602
+ }
603
+ #else
604
+ // Validate kernel against reference
605
+ passed = compare_reference(mD_ref, mD_computed, mA, mB, mAlpha, mBeta, mBias, this->epsilon);
606
+ #endif
607
+
608
+ EXPECT_TRUE(passed);
609
+ return passed;
610
+ }
611
+
612
+ template<
613
+ class Engine, class Layout,
614
+ class EngineA, class LayoutA,
615
+ class EngineB, class LayoutB,
616
+ class EngineAlpha, class LayoutAlpha,
617
+ class EngineBeta, class LayoutBeta,
618
+ class EngineBias, class LayoutBias>
619
+ static constexpr bool
620
+ compare_reference(
621
+ cute::Tensor<Engine, Layout> const& reference,
622
+ cute::Tensor<Engine, Layout> const& computed,
623
+ cute::Tensor<EngineA, LayoutA> const& A,
624
+ cute::Tensor<EngineB, LayoutB> const& B,
625
+ cute::Tensor<EngineAlpha, LayoutAlpha> const& tensor_alpha,
626
+ cute::Tensor<EngineBeta, LayoutBeta> const& tensor_beta,
627
+ cute::Tensor<EngineBias, LayoutBias> const& tensor_bias,
628
+ float epsilon = 0.0f) {
629
+ if (size(reference) != size(computed)) {
630
+ return false;
631
+ }
632
+
633
+ bool passed = true;
634
+ if (epsilon == 0.0f) {
635
+ // fast refcheck w/o epsilon
636
+ for (size_t i = 0; i < size_t(size(reference)); ++i) {
637
+ if (reference(i) != computed(i)) {
638
+ passed = false;
639
+ printf("[%llu] %f, %f\n", static_cast<unsigned long long>(i),
640
+ float(reference(i)), float(computed(i)));
641
+ break;
642
+ }
643
+ }
644
+ } else {
645
+ // refcheck with epsilon
646
+ for (size_t i = 0; i < size_t(size(reference)); ++i) {
647
+ auto ref = static_cast<float>(reference(i));
648
+ auto act = static_cast<float>(computed(i));
649
+ auto abs_error = std::abs(act - ref);
650
+ auto rel_error = abs_error / (std::max(std::abs(act), std::abs(ref)) + 0.00001f);
651
+ if (std::isnan(abs_error) || std::isnan(rel_error) ||
652
+ std::min(abs_error, rel_error) > epsilon) {
653
+ passed = false;
654
+ printf("[%llu] %f, %f\n", static_cast<unsigned long long>(i),
655
+ float(reference(i)), float(computed(i)));
656
+ break;
657
+ }
658
+ }
659
+ }
660
+ #if CUTLASS_DEBUG_TRACE_LEVEL > 1
661
+ if (not passed) {
662
+ cute::print("Reference:");
663
+ cute::print_tensor(reference);
664
+ cute::print("\nComputed:");
665
+ cute::print_tensor(computed);
666
+ cute::print("\n");
667
+
668
+ for (size_t i = 0; i < size_t(size(A)); ++i) {
669
+ printf("[%llu]: A = %f\n", static_cast<unsigned long long>(i), float(A(i)));
670
+ }
671
+ for (size_t i = 0; i < size_t(size(B)); ++i) {
672
+ printf("[%llu]: B = %f\n", static_cast<unsigned long long>(i), float(B(i)));
673
+ }
674
+ if constexpr (IsPerChannelScaleEnabled) {
675
+ for (size_t i = 0; i < size_t(size(tensor_alpha)); ++i) {
676
+ printf("[%llu]: alpha = %f\n", static_cast<unsigned long long>(i),
677
+ float(tensor_alpha(i)));
678
+ }
679
+ for (size_t i = 0; i < size_t(size(tensor_beta)); ++i) {
680
+ printf("[%llu]: beta = %f\n", static_cast<unsigned long long>(i),
681
+ float(tensor_beta(i)));
682
+ }
683
+ }
684
+ if constexpr (IsBiasEnabled) {
685
+ for (size_t i = 0; i < size_t(size(tensor_bias)); ++i) {
686
+ printf("[%llu]: bias = %f\n", static_cast<unsigned long long>(i),
687
+ float(tensor_bias(i)));
688
+ }
689
+ }
690
+ for (size_t i = 0; i < size_t(size(reference)); ++i) {
691
+ printf("[%llu]: ref = %f, computed = %f\n", static_cast<unsigned long long>(i),
692
+ float(reference(i)), float(computed(i)));
693
+ }
694
+ }
695
+ #endif
696
+ return passed;
697
+ }
698
+ };
699
+
700
+ /////////////////////////////////////////////////////////////////////////////////////////////////
701
+
702
+ template <typename Conv, bool SupportStrides = (Conv::DispatchPolicy::ConvOp != cutlass::conv::Operator::kDgrad)>
703
+ bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f,
704
+ dim3 cluster_shape = dim3(0, 0, 0),
705
+ dim3 cluster_shape_fallback = dim3(0, 0, 0)
706
+ ) {
707
+ using ElementScalar = typename Conv::EpilogueOutputOp::ElementScalar;
708
+
709
+ bool passed = true;
710
+ ConvTestbed<Conv> testbed;
711
+ testbed.epsilon = epsilon;
712
+ auto problem_vector = get_conv_problem_vector<
713
+ Conv::NumSpatialDimensions, Conv::DispatchPolicy::ConvOp, SupportStrides>();
714
+
715
+ using DecompositionMode = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
716
+ using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90::RasterOrderOptions;
717
+ using MaxSwizzleSize = typename gemm::device::detail::MaxSwizzleSize;
718
+ using Splits = typename gemm::device::detail::Splits;
719
+
720
+ std::vector<DecompositionMode> decomposition_modes = {DecompositionMode::Heuristic};
721
+ static constexpr bool UsesStreamKScheduler = cute::is_same_v<typename Conv::ConvKernel::TileSchedulerTag, cutlass::gemm::StreamKScheduler>;
722
+ if constexpr (UsesStreamKScheduler) {
723
+ decomposition_modes.push_back(DecompositionMode::DataParallel);
724
+ decomposition_modes.push_back(DecompositionMode::SplitK);
725
+ decomposition_modes.push_back(DecompositionMode::StreamK);
726
+ }
727
+
728
+ for (auto conv_problem : problem_vector) {
729
+ #if CUTLASS_DEBUG_TRACE_LEVEL > 0
730
+ print(conv_problem);
731
+ #endif
732
+ for (DecompositionMode decomp_mode : decomposition_modes) {
733
+ std::vector problem_splits = {Splits{1}};
734
+ if constexpr (UsesStreamKScheduler) {
735
+ if (decomp_mode == DecompositionMode::SplitK) {
736
+ problem_splits.push_back(Splits{2});
737
+ problem_splits.push_back(Splits{4});
738
+ }
739
+ }
740
+ for (auto splits : problem_splits) {
741
+
742
+ passed = testbed.run(
743
+ conv_problem,
744
+ cutlass::from_real<ElementScalar>(alpha),
745
+ cutlass::from_real<ElementScalar>(beta),
746
+ cluster_shape,
747
+ cluster_shape_fallback,
748
+ RasterOrderOptions::Heuristic, // raster_order
749
+ MaxSwizzleSize(1),
750
+ splits,
751
+ decomp_mode
752
+ );
753
+ if (!passed) {
754
+ printf("Failed test for "); print(conv_problem);
755
+ return false;
756
+ }
757
+ } // splits
758
+ } // decomposition_mode
759
+ }
760
+
761
+ return passed;
762
+ }
763
+
764
+ /////////////////////////////////////////////////////////////////////////////////////////////////
765
+
766
+ } // namespace test::conv::device
767
+
768
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/cute/ampere/tiled_cp_async_testbed.hpp ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #include "cutlass_unit_test.h"
33
+
34
+ #include <iostream>
35
+ #include <iomanip>
36
+ #include <utility>
37
+ #include <type_traits>
38
+ #include <vector>
39
+ #include <numeric>
40
+
41
+ #include <thrust/host_vector.h>
42
+ #include <thrust/device_vector.h>
43
+
44
+ #include <cute/tensor.hpp>
45
+
46
+ using namespace cute;
47
+
48
+ template <class ElementType, class SmemLayout>
49
+ struct SharedStorage
50
+ {
51
+ cute::ArrayEngine<ElementType, cute::cosize_v<SmemLayout>> smem;
52
+ };
53
+
54
+ template <class T, class TiledCopy, class GmemLayout, class SmemLayout>
55
+ __global__ void
56
+ test_tiled_cp_async_device_cute(T const* g_in, T* g_out,
57
+ TiledCopy const tiled_copy,
58
+ GmemLayout gmem_layout, SmemLayout smem_layout)
59
+ {
60
+ using namespace cute;
61
+
62
+ extern __shared__ char shared_memory[];
63
+ using SharedStorage = SharedStorage<T, SmemLayout>;
64
+ SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(shared_memory);
65
+
66
+ auto thr_copy = tiled_copy.get_slice(threadIdx.x);
67
+ Tensor gA = make_tensor(make_gmem_ptr(g_in), gmem_layout);
68
+ Tensor gB = make_tensor(make_gmem_ptr(g_out), gmem_layout);
69
+
70
+ // Construct SMEM tensor
71
+ Tensor sA = make_tensor(make_smem_ptr(shared_storage.smem.begin()), smem_layout);
72
+
73
+ auto tAgA = thr_copy.partition_S(gA);
74
+ auto tAsA = thr_copy.partition_D(sA);
75
+
76
+ #if 0
77
+ if (thread0()) {
78
+ print("gA : "); print(gA.layout()); print("\n");
79
+ print("sA : "); print(sA.layout()); print("\n");
80
+ print("tAgA: "); print(tAgA.layout()); print("\n");
81
+ print("tAsA: "); print(tAsA.layout()); print("\n");
82
+ }
83
+ #endif
84
+
85
+ copy(tiled_copy, tAgA, tAsA);
86
+
87
+ cp_async_fence();
88
+ cp_async_wait<0>();
89
+ __syncthreads();
90
+
91
+ // Store trivially smem -> gmem
92
+
93
+ if (thread0()) {
94
+ copy(sA, gB);
95
+ }
96
+
97
+ }
98
+
99
+ template <class T, class TiledCopy, class GMEM_Layout, class SMEM_Layout>
100
+ void
101
+ test_tiled_cp_async(
102
+ TiledCopy const tiled_copy,
103
+ GMEM_Layout const& gmem_layout,
104
+ SMEM_Layout const& smem_layout)
105
+ {
106
+ using namespace cute;
107
+
108
+ // Allocate and initialize host test data
109
+ size_t N = ceil_div(cosize(gmem_layout) * sizeof_bits<T>::value, 8);
110
+ thrust::host_vector<T> h_in(N);
111
+ Tensor hA_in = make_tensor(recast_ptr<T>(h_in.data()), gmem_layout);
112
+ for (int i = 0; i < size(hA_in); ++i) { hA_in(i) = static_cast<T>(i % 13); }
113
+
114
+ // Allocate and initialize device test data
115
+ thrust::device_vector<T> d_in = h_in;
116
+ thrust::device_vector<T> d_out(h_in.size(), T(-1));
117
+
118
+ // Launch
119
+ int smem_size = int(sizeof(SharedStorage<T, decltype(smem_layout)>));
120
+ test_tiled_cp_async_device_cute<<<1, 128, smem_size>>>(
121
+ reinterpret_cast<T const*>(raw_pointer_cast(d_in.data())),
122
+ reinterpret_cast<T*> (raw_pointer_cast(d_out.data())),
123
+ tiled_copy,
124
+ gmem_layout,
125
+ smem_layout);
126
+
127
+ // Copy results back to host
128
+ thrust::host_vector<T> h_out = d_out;
129
+ Tensor hA_out = make_tensor(recast_ptr<T>(h_out.data()), gmem_layout);
130
+
131
+ // Validate the results. Print only the first 3 errors.
132
+ int count = 3;
133
+ for (int i = 0; i < size(hA_out) && count > 0; ++i) {
134
+ EXPECT_EQ(hA_in(i), hA_out(i));
135
+ if (hA_in(i) != hA_out(i)) {
136
+ --count;
137
+ }
138
+ }
139
+ }
140
+
141
+ template <typename T, typename M, typename N, typename GMEM_STRIDE_TYPE, typename SMEM_LAYOUT, typename TILED_COPY>
142
+ void test_cp_async_no_swizzle() {
143
+ using namespace cute;
144
+ auto smem_atom = SMEM_LAYOUT{};
145
+ auto smem_layout = tile_to_shape(smem_atom, Shape<M, N>{});
146
+ auto gmem_layout = make_layout(make_shape(M{}, N{}), GMEM_STRIDE_TYPE{});
147
+ test_tiled_cp_async<T>(TILED_COPY{}, gmem_layout, smem_layout);
148
+ }
149
+
150
+ template <typename T, typename M, typename N, typename GMEM_STRIDE_TYPE, typename SWIZZLE_ATOM, typename SMEM_LAYOUT, typename TILED_COPY>
151
+ void test_cp_async_with_swizzle() {
152
+ using namespace cute;
153
+ auto swizzle_atom = SWIZZLE_ATOM{};
154
+ auto smem_atom = composition(swizzle_atom, SMEM_LAYOUT{});
155
+ auto smem_layout = tile_to_shape(smem_atom, Shape<M, N>{});
156
+ auto gmem_layout = make_layout(make_shape(M{}, N{}), GMEM_STRIDE_TYPE{});
157
+ test_tiled_cp_async<T>(TILED_COPY{}, gmem_layout, smem_layout);
158
+ }
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/cute/cooperative_gemm_common.hpp ADDED
@@ -0,0 +1,775 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include "cutlass/relatively_equal.h"
35
+ #include "cutlass_unit_test.h"
36
+ #include "cutlass/util/reference/host/tensor_compare.h"
37
+
38
+ #include <iostream>
39
+
40
+ #include <thrust/host_vector.h>
41
+ #include <thrust/device_vector.h>
42
+
43
+ #include <cute/tensor.hpp>
44
+
45
+ using namespace cute;
46
+
47
+ template<typename T>
48
+ struct fp64_tester {
49
+ using value_type = double;
50
+ };
51
+
52
+ template<typename T>
53
+ struct fp64_tester<complex<T>> {
54
+ using value_type = complex<double>;
55
+ };
56
+
57
+ template<class TA,
58
+ class TB,
59
+ class TC,
60
+ class ALayout, // logical shape (M, K)
61
+ class BLayout, // logical shape (N, K)
62
+ class CLayout> // logical shape (M, N)
63
+ auto host_generate_gemm_inputs(
64
+ ALayout a_layout,
65
+ BLayout b_layout,
66
+ CLayout c_layout
67
+ ) {
68
+ thrust::host_vector<TA> h_a(cosize(a_layout));
69
+ thrust::host_vector<TB> h_b(cosize(b_layout));
70
+ thrust::host_vector<TC> h_c(cosize(c_layout));
71
+ thrust::host_vector<TC> h_c_out(cosize(c_layout));
72
+
73
+ auto h_a_tensor = make_tensor(h_a.data(), a_layout);
74
+ auto h_b_tensor = make_tensor(h_b.data(), b_layout);
75
+ auto h_c_tensor = make_tensor(h_c.data(), c_layout);
76
+ size_t max_size = std::max<size_t>({static_cast<size_t>(size(a_layout)),
77
+ static_cast<size_t>(size(b_layout)),
78
+ static_cast<size_t>(size(c_layout))});
79
+ for (size_t i = 0; i < max_size; ++i) {
80
+ double di = static_cast<double>(i);
81
+ if(i < size(a_layout)) {
82
+ h_a_tensor(i) = static_cast<TA>(di / size(a_layout));
83
+ }
84
+ if(i < size(b_layout)) {
85
+ h_b_tensor(i) = static_cast<TB>(di / size(a_layout));
86
+ }
87
+ if(i < size(c_layout)) {
88
+ h_c_tensor(i) = static_cast<TC>((di*di) / size(a_layout));
89
+ }
90
+ }
91
+
92
+ return std::make_tuple(h_a, h_b, h_c, h_c_out);
93
+ }
94
+
95
+ template<class Alpha, class EngineA, class ALayout,
96
+ class EngineB, class BLayout,
97
+ class Beta, class EngineC, class CLayout,
98
+ class ALoadTransform = cute::identity,
99
+ class BLoadTransform = cute::identity,
100
+ class CLoadTransform = cute::identity,
101
+ class CStoreTransform = cute::identity>
102
+ thrust::host_vector<typename EngineC::value_type>
103
+ host_reference_gemm(Alpha alpha,
104
+ Tensor<EngineA, ALayout> const& h_a_tensor,
105
+ Tensor<EngineB, BLayout> const& h_b_tensor,
106
+ Beta beta,
107
+ Tensor<EngineC, CLayout> const& h_c_tensor,
108
+ ALoadTransform const& a_load_transform = {},
109
+ BLoadTransform const& b_load_transform = {},
110
+ CLoadTransform const& c_load_transform = {},
111
+ CStoreTransform const& c_store_transform = {})
112
+ {
113
+ // Cannot use ::value_type because it propagates to complex::value_type,
114
+ // so ViewEngine<complex<double>>::value_type == double
115
+ using TA = remove_cv_t<typename EngineA::element_type>;
116
+ using TB = remove_cv_t<typename EngineB::element_type>;
117
+ using TC = remove_cv_t<typename EngineC::element_type>;
118
+
119
+ using tester = fp64_tester<TC>;
120
+ using ABC_64 = typename tester::value_type;
121
+
122
+ static_assert(std::is_same_v<typename fp64_tester<TA>::value_type, typename fp64_tester<TB>::value_type>);
123
+ static_assert(std::is_same_v<typename fp64_tester<TB>::value_type, typename fp64_tester<TC>::value_type>);
124
+
125
+ thrust::host_vector<TC> h_c_ref(cosize(h_c_tensor.layout()), static_cast<TC>(0.0));
126
+ auto h_c_ref_tensor = make_tensor(h_c_ref.data(), h_c_tensor.layout());
127
+ // A * B
128
+ for (int k = 0; k < size<1>(h_a_tensor); k++) {
129
+ for (int m = 0; m < size<0>(h_a_tensor); m++) {
130
+ for (int n = 0; n < size<0>(h_b_tensor); n++) {
131
+ const auto a_value = a_load_transform(h_a_tensor(m, k));
132
+ const auto b_value = b_load_transform(h_b_tensor(n, k));
133
+ const auto a_value_fp64 = static_cast<ABC_64>(a_value);
134
+ const auto b_value_fp64 = static_cast<ABC_64>(b_value);
135
+ h_c_ref_tensor(m, n) += static_cast<TC>(a_value_fp64 * b_value_fp64);
136
+ }
137
+ }
138
+ }
139
+ // C = A*B + C
140
+ for (int i = 0; i < size(h_c_ref_tensor); i++) {
141
+ const auto ab_value_fp64 = static_cast<ABC_64>(h_c_ref_tensor(i));
142
+ const auto c_value_fp64 = static_cast<ABC_64>(c_load_transform(h_c_tensor(i)));
143
+ h_c_ref_tensor(i) = c_store_transform(static_cast<TC>(alpha * ab_value_fp64 + beta * c_value_fp64));
144
+ }
145
+
146
+ return h_c_ref;
147
+ }
148
+
149
+ template<class EngineC, class CLayout>
150
+ void verify_gemm_correctness(cute::Tensor<EngineC, CLayout> const& h_c_out_tensor,
151
+ cute::Tensor<EngineC, CLayout> const& h_c_ref_tensor)
152
+ {
153
+ // Cannot use ::value_type because it propagates to complex::value_type,
154
+ // so ViewEngine<complex<double>>::value_type == double
155
+ using TC = remove_cv_t<typename EngineC::element_type>;
156
+
157
+ using tester = fp64_tester<TC>;
158
+ using ABC_64 = typename tester::value_type;
159
+
160
+ for (int i = 0; i < size(h_c_ref_tensor); i++) {
161
+ ABC_64 h_c_ref_i = h_c_ref_tensor(i);
162
+ ABC_64 h_c_out_i = h_c_out_tensor(i);
163
+ double epsilon(0.1f);
164
+ double nonzero_floor(std::numeric_limits<double>::min());
165
+ bool passed = cutlass::relatively_equal(h_c_out_i, h_c_ref_i, epsilon, nonzero_floor);
166
+ ASSERT_TRUE(passed) << i << " - result:" << h_c_out_i << " expected:" << h_c_ref_i;
167
+ }
168
+ }
169
+
170
+
171
+ template<uint32_t ThreadBlockSize,
172
+ uint32_t CopyMaxVecBits,
173
+ class GMemALayout,
174
+ class GMemBLayout,
175
+ class GMemCLayout,
176
+ class SMemALayout,
177
+ class SMemBLayout,
178
+ class SMemCLayout,
179
+ class TA,
180
+ class TB,
181
+ class TC,
182
+ class Alpha,
183
+ class Beta,
184
+ class TiledMma,
185
+ class ALoadTransform,
186
+ class BLoadTransform,
187
+ class CLoadTransform,
188
+ class CStoreTransform,
189
+ class SMemCopyOpA,
190
+ class SMemCopyOpB,
191
+ class SMemCopyLdOpC,
192
+ class SMemCopyStOpC>
193
+ __launch_bounds__(ThreadBlockSize) __global__ void
194
+ cooperative_gemm_kernel(GMemALayout gmem_a_layout,
195
+ GMemBLayout gmem_b_layout,
196
+ GMemCLayout gmem_c_layout,
197
+ SMemALayout smem_a_layout,
198
+ SMemBLayout smem_b_layout,
199
+ SMemCLayout smem_c_layout,
200
+ TA const* a,
201
+ TB const* b,
202
+ TC const* c,
203
+ TC * c_out,
204
+ Alpha const alpha,
205
+ Beta const beta,
206
+ TiledMma tiled_mma,
207
+ ALoadTransform a_load_transform,
208
+ BLoadTransform b_load_transform,
209
+ CLoadTransform c_load_transform,
210
+ CStoreTransform c_store_transform,
211
+ SMemCopyOpA a_copy_op,
212
+ SMemCopyOpB b_copy_op,
213
+ SMemCopyLdOpC c_copy_ld_op,
214
+ SMemCopyStOpC c_copy_st_op)
215
+ {
216
+ using namespace cute;
217
+
218
+ Tensor g_a_tensor = make_tensor(make_gmem_ptr(a), gmem_a_layout);
219
+ Tensor g_b_tensor = make_tensor(make_gmem_ptr(b), gmem_b_layout);
220
+ Tensor g_c_tensor = make_tensor(make_gmem_ptr(c), gmem_c_layout);
221
+ Tensor g_c_out_tensor = make_tensor(make_gmem_ptr(c_out), gmem_c_layout);
222
+
223
+ constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8;
224
+
225
+ extern __shared__ float4 smem_buf[];
226
+ auto* smem_ptr = reinterpret_cast<unsigned char*>(smem_buf);
227
+ auto* smem_ptr_a = smem_ptr;
228
+ auto* smem_ptr_b = smem_ptr_a + round_up((sizeof(TA) * cosize(smem_a_layout)), copy_max_vec_bytes);
229
+ auto* smem_ptr_c = smem_ptr_b + round_up((sizeof(TB) * cosize(smem_b_layout)), copy_max_vec_bytes);
230
+
231
+ Tensor s_a_tensor = make_tensor(make_smem_ptr<TA>(smem_ptr_a), smem_a_layout);
232
+ Tensor s_b_tensor = make_tensor(make_smem_ptr<TB>(smem_ptr_b), smem_b_layout);
233
+ Tensor s_c_tensor = make_tensor(make_smem_ptr<TC>(smem_ptr_c), smem_c_layout);
234
+
235
+ cooperative_copy<ThreadBlockSize, CopyMaxVecBits>(threadIdx.x, g_a_tensor, s_a_tensor);
236
+ cooperative_copy<ThreadBlockSize, CopyMaxVecBits>(threadIdx.x, g_b_tensor, s_b_tensor);
237
+ cooperative_copy<ThreadBlockSize, CopyMaxVecBits>(threadIdx.x, g_c_tensor, s_c_tensor);
238
+
239
+ cp_async_fence();
240
+ cp_async_wait<0>();
241
+ __syncthreads();
242
+
243
+ cooperative_gemm(
244
+ threadIdx.x, tiled_mma,
245
+ alpha, s_a_tensor, s_b_tensor, beta, s_c_tensor,
246
+ a_load_transform, b_load_transform, c_load_transform, c_store_transform,
247
+ a_copy_op, b_copy_op, c_copy_ld_op, c_copy_st_op
248
+ );
249
+ __syncthreads();
250
+
251
+ cooperative_copy<ThreadBlockSize, CopyMaxVecBits>(threadIdx.x, s_c_tensor, g_c_out_tensor);
252
+ }
253
+
254
// Device kernel: single-thread-block GEMM with A/B staged in shared memory
// and the C accumulator kept in registers (per-thread MMA fragments).
//
// Pipeline: cooperatively copy A and B gmem->smem, load C gmem->rmem with
// predication, run the smem-A/B x rmem-C cooperative_gemm overload, then
// store the fragments back to c_out with the same predication.
template<uint32_t ThreadBlockSize,
         uint32_t CopyMaxVecBits,
         class GMemALayout,
         class GMemBLayout,
         class GMemCLayout,
         class SMemALayout,
         class SMemBLayout,
         class TA,
         class TB,
         class TC,
         class TiledMma,
         class ALoadTransform,
         class BLoadTransform,
         class CLoadTransform,
         class CStoreTransform,
         class SMemCopyOpA,
         class SMemCopyOpB>
__launch_bounds__(ThreadBlockSize) __global__ void
cooperative_gemm_kernel_rmem_c(GMemALayout gmem_a_layout,
                               GMemBLayout gmem_b_layout,
                               GMemCLayout gmem_c_layout,
                               SMemALayout smem_a_layout,
                               SMemBLayout smem_b_layout,
                               TA const* a,
                               TB const* b,
                               TC const* c,
                               TC      * c_out,
                               TiledMma tiled_mma,
                               ALoadTransform  a_load_transform,
                               BLoadTransform  b_load_transform,
                               CLoadTransform  c_load_transform,
                               CStoreTransform c_store_transform,
                               SMemCopyOpA a_copy_op,
                               SMemCopyOpB b_copy_op)
{
  using namespace cute;

  // Global-memory views over the raw pointers. c and c_out share a layout.
  Tensor g_a_tensor     = make_tensor(make_gmem_ptr(a), gmem_a_layout);
  Tensor g_b_tensor     = make_tensor(make_gmem_ptr(b), gmem_b_layout);
  Tensor g_c_tensor     = make_tensor(make_gmem_ptr(c), gmem_c_layout);
  Tensor g_c_out_tensor = make_tensor(make_gmem_ptr(c_out), gmem_c_layout);

  constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8;

  // Carve the dynamic smem buffer into an A region followed by a B region.
  // A's size is rounded up so the B region stays aligned to the maximum
  // copy vector width.
  extern __shared__ float4 smem_buf[];
  auto* smem_ptr   = reinterpret_cast<unsigned char*>(smem_buf);
  auto* smem_ptr_a = smem_ptr;
  auto* smem_ptr_b = smem_ptr_a + round_up((sizeof(TA) * cosize(smem_a_layout)), copy_max_vec_bytes);

  Tensor s_a_tensor = make_tensor(make_smem_ptr<TA>(smem_ptr_a), smem_a_layout);
  Tensor s_b_tensor = make_tensor(make_smem_ptr<TB>(smem_ptr_b), smem_b_layout);

  // Whole-block copies of the A and B tiles into shared memory.
  cooperative_copy<ThreadBlockSize, CopyMaxVecBits>(threadIdx.x, g_a_tensor, s_a_tensor);
  cooperative_copy<ThreadBlockSize, CopyMaxVecBits>(threadIdx.x, g_b_tensor, s_b_tensor);

  // Drain any asynchronous (cp.async-style) copies the cooperative copies
  // may have issued, then make the smem tiles visible to all threads.
  cp_async_fence();
  cp_async_wait<0>();
  __syncthreads();

  // Create C fragment for storing intermediate results
  auto thr_mma = TiledMma().get_thread_slice(threadIdx.x);
  Tensor g_c_partition     = thr_mma.partition_C(g_c_tensor);
  Tensor g_c_out_partition = thr_mma.partition_C(g_c_out_tensor);
  Tensor r_c_partition     = thr_mma.make_fragment_C(g_c_partition);

  // Create indexing help for predicated GEMMs
  Tensor cC   = make_identity_tensor(shape(gmem_c_layout));
  Tensor tCcC = thr_mma.partition_C(cC);

  // Load C from global
  // (always loading in predicated way)
  // NOTE(review): out-of-bounds fragment elements are left uninitialized
  // here; they are never written back because the store below uses the
  // same predicate.
  CUTE_UNROLL
  for (int i = 0; i < size(r_c_partition); ++i)
  {
    if (elem_less(tCcC(i), shape(g_c_tensor)))
    {
      r_c_partition(i) = c_load_transform(g_c_partition(i));
    }
  }

  // Accumulate A*B into the register-resident C fragments.
  cooperative_gemm(
    threadIdx.x, tiled_mma, s_a_tensor, s_b_tensor, r_c_partition,
    a_load_transform, b_load_transform, a_copy_op, b_copy_op
  );

  __syncthreads();

  // Store C to global
  // (always storing in predicated way)
  CUTE_UNROLL
  for (int i = 0; i < size(r_c_partition); ++i)
  {
    if (elem_less(tCcC(i), shape(g_c_tensor)))
    {
      g_c_out_partition(i) = c_store_transform(r_c_partition(i));
    }
  }
}
352
+
353
+ template<uint32_t ThreadBlockSize,
354
+ uint32_t CopyMaxVecBits,
355
+ class TA,
356
+ class TB,
357
+ class TC,
358
+ class GMemALayout, // logical shape (M, K)
359
+ class GMemBLayout, // logical shape (N, K)
360
+ class GMemCLayout, // logical shape (M, N)
361
+ class SMemALayout, // logical shape (M, K)
362
+ class SMemBLayout, // logical shape (N, K)
363
+ class SMemCLayout, // logical shape (M, N)
364
+ class TiledMma,
365
+ class ALoadTransform = cute::identity,
366
+ class BLoadTransform = cute::identity,
367
+ class CLoadTransform = cute::identity,
368
+ class CStoreTransform = cute::identity,
369
+ class ASMemCopyOp = AutoVectorizingCopyWithAssumedAlignment<CopyMaxVecBits>,
370
+ class BSMemCopyOp = AutoVectorizingCopyWithAssumedAlignment<CopyMaxVecBits>,
371
+ class CSMemCopyLdOp = AutoVectorizingCopyWithAssumedAlignment<CopyMaxVecBits>,
372
+ class CSMemCopyStOp = AutoVectorizingCopyWithAssumedAlignment<CopyMaxVecBits>>
373
+ void test_cooperative_gemm(GMemALayout gmem_a_layout,
374
+ GMemBLayout gmem_b_layout,
375
+ GMemCLayout gmem_c_layout,
376
+ SMemALayout smem_a_layout,
377
+ SMemBLayout smem_b_layout,
378
+ SMemCLayout smem_c_layout,
379
+ TiledMma tiled_mma,
380
+ ALoadTransform a_load_transform = {},
381
+ BLoadTransform b_load_transform = {},
382
+ CLoadTransform c_load_transform = {},
383
+ CStoreTransform c_store_transform = {},
384
+ ASMemCopyOp a_smem_copy_op = {},
385
+ BSMemCopyOp b_smem_copy_op = {},
386
+ CSMemCopyLdOp c_smem_copy_ld_op = {},
387
+ CSMemCopyStOp c_smem_copy_st_op = {})
388
+ {
389
+ static_assert(std::is_same_v<typename fp64_tester<TA>::value_type, typename fp64_tester<TB>::value_type>);
390
+ static_assert(std::is_same_v<typename fp64_tester<TB>::value_type, typename fp64_tester<TC>::value_type>);
391
+
392
+ static_assert(size<0>(gmem_a_layout) == size<0>(gmem_c_layout)); // AM == CM
393
+ static_assert(size<0>(gmem_b_layout) == size<1>(gmem_c_layout)); // BN == CN
394
+ static_assert(size<1>(gmem_a_layout) == size<1>(gmem_b_layout)); // AK == BK
395
+
396
+ static_assert(size<0>(smem_a_layout) == size<0>(smem_c_layout)); // AM == CM
397
+ static_assert(size<0>(smem_b_layout) == size<1>(smem_c_layout)); // BN == CN
398
+ static_assert(size<1>(smem_a_layout) == size<1>(smem_b_layout)); // AK == BK
399
+
400
+ static_assert(cute::size(gmem_a_layout) == cute::size(smem_a_layout));
401
+ static_assert(cute::size(gmem_b_layout) == cute::size(smem_b_layout));
402
+ static_assert(cute::size(gmem_c_layout) == cute::size(smem_c_layout));
403
+
404
+ #if 0
405
+ print(" "); print("gmem: "); print(gmem_layout); print("\n");
406
+ print(" "); print("smem: "); print(smem_layout); print("\n");
407
+ print(" "); print("threads: "); print(ThreadBlockSize); print("\n");
408
+ #endif
409
+
410
+ const auto alpha = static_cast<TC>(1.1);
411
+ const auto beta = static_cast<TC>(1.2);
412
+
413
+ // Generate inputs
414
+ auto [h_a, h_b, h_c, h_c_out] = host_generate_gemm_inputs<TA, TB, TC>(gmem_a_layout, gmem_b_layout, gmem_c_layout);
415
+
416
+ thrust::device_vector<TA> d_a(h_a);
417
+ thrust::device_vector<TB> d_b(h_b);
418
+ thrust::device_vector<TC> d_c(h_c);
419
+ thrust::device_vector<TC> d_c_out(h_c_out.size(), TC(float(-1)));
420
+
421
+ constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8;
422
+
423
+ const size_t shared_memory_size = round_up(sizeof(TA) * h_a.size(), copy_max_vec_bytes) +
424
+ round_up(sizeof(TB) * h_b.size(), copy_max_vec_bytes) +
425
+ sizeof(TC) * h_c.size();
426
+
427
+
428
+ auto kernel = cooperative_gemm_kernel<
429
+ ThreadBlockSize, CopyMaxVecBits,
430
+ GMemALayout, GMemBLayout, GMemCLayout,
431
+ SMemALayout, SMemBLayout, SMemCLayout,
432
+ TA, TB, TC, decltype(alpha), decltype(beta),
433
+ TiledMma,
434
+ ALoadTransform, BLoadTransform, CLoadTransform, CStoreTransform,
435
+ ASMemCopyOp, BSMemCopyOp, CSMemCopyLdOp, CSMemCopyStOp
436
+ >;
437
+
438
+ ASSERT_EQ(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(shared_memory_size)), 0);
439
+
440
+ kernel<<<1, ThreadBlockSize, shared_memory_size>>>(
441
+ gmem_a_layout,
442
+ gmem_b_layout,
443
+ gmem_c_layout,
444
+ smem_a_layout,
445
+ smem_b_layout,
446
+ smem_c_layout,
447
+ thrust::raw_pointer_cast(d_a.data()),
448
+ thrust::raw_pointer_cast(d_b.data()),
449
+ thrust::raw_pointer_cast(d_c.data()),
450
+ thrust::raw_pointer_cast(d_c_out.data()),
451
+ alpha,
452
+ beta,
453
+ tiled_mma,
454
+ a_load_transform,
455
+ b_load_transform,
456
+ c_load_transform,
457
+ c_store_transform,
458
+ a_smem_copy_op,
459
+ b_smem_copy_op,
460
+ c_smem_copy_ld_op,
461
+ c_smem_copy_st_op
462
+ );
463
+
464
+ cudaError_t result = cudaDeviceSynchronize();
465
+ if (result != cudaSuccess) {
466
+ cudaError_t error = cudaGetLastError();
467
+ FAIL() << "Error at kernel sync: " << cudaGetErrorString(error) << "\n";
468
+ }
469
+
470
+ // Reference gemm
471
+ auto h_c_ref = host_reference_gemm(alpha,
472
+ make_tensor(h_a.data(), gmem_a_layout),
473
+ make_tensor(h_b.data(), gmem_b_layout),
474
+ beta,
475
+ make_tensor(h_c.data(), gmem_c_layout),
476
+ a_load_transform,
477
+ b_load_transform,
478
+ c_load_transform,
479
+ c_store_transform);
480
+
481
+ // Copy result data
482
+ h_c_out = d_c_out;
483
+
484
+ // Verify correctness
485
+ verify_gemm_correctness(make_tensor(h_c_out.data(), gmem_c_layout),
486
+ make_tensor(h_c_ref.data(), gmem_c_layout));
487
+ }
488
+
489
+ template<uint32_t ThreadBlockSize,
490
+ uint32_t CopyMaxVecBits,
491
+ class TA,
492
+ class TB,
493
+ class TC,
494
+ class GMemALayout, // logical shape (M, K)
495
+ class GMemBLayout, // logical shape (N, K)
496
+ class GMemCLayout, // logical shape (M, N)
497
+ class SMemALayout, // logical shape (M, K)
498
+ class SMemBLayout, // logical shape (N, K)
499
+ class TiledMma,
500
+ class ALoadTransform = cute::identity,
501
+ class BLoadTransform = cute::identity,
502
+ class CLoadTransform = cute::identity,
503
+ class CStoreTransform = cute::identity,
504
+ class ASMemCopyOp = AutoVectorizingCopyWithAssumedAlignment<CopyMaxVecBits>,
505
+ class BSMemCopyOp = AutoVectorizingCopyWithAssumedAlignment<CopyMaxVecBits>>
506
+ void test_cooperative_gemm_rmem_c(GMemALayout gmem_a_layout,
507
+ GMemBLayout gmem_b_layout,
508
+ GMemCLayout gmem_c_layout,
509
+ SMemALayout smem_a_layout,
510
+ SMemBLayout smem_b_layout,
511
+ TiledMma tiled_mma,
512
+ ALoadTransform a_load_transform = {},
513
+ BLoadTransform b_load_transform = {},
514
+ CLoadTransform c_load_transform = {},
515
+ CStoreTransform c_store_transform = {},
516
+ ASMemCopyOp a_smem_copy_op = {},
517
+ BSMemCopyOp b_smem_copy_op = {})
518
+ {
519
+ static_assert(size<0>(gmem_a_layout) == size<0>(gmem_c_layout)); // AM == CM
520
+ static_assert(size<0>(gmem_b_layout) == size<1>(gmem_c_layout)); // BN == CN
521
+ static_assert(size<1>(gmem_a_layout) == size<1>(gmem_b_layout)); // AK == BK
522
+
523
+ static_assert(size<1>(smem_a_layout) == size<1>(smem_b_layout)); // AK == BK
524
+
525
+ static_assert(cute::size(gmem_a_layout) == cute::size(smem_a_layout));
526
+ static_assert(cute::size(gmem_b_layout) == cute::size(smem_b_layout));
527
+
528
+ #if 0
529
+ print(" "); print("gmem: "); print(gmem_layout); print("\n");
530
+ print(" "); print("smem: "); print(smem_layout); print("\n");
531
+ print(" "); print("threads: "); print(ThreadBlockSize); print("\n");
532
+ #endif
533
+
534
+ const auto alpha = static_cast<TC>(1.0);
535
+ const auto beta = static_cast<TC>(1.0);
536
+
537
+ // Generate inputs
538
+ auto [h_a, h_b, h_c, h_c_out] =
539
+ host_generate_gemm_inputs<TA, TB, TC>(gmem_a_layout, gmem_b_layout, gmem_c_layout);
540
+
541
+ thrust::device_vector<TA> d_a(h_a);
542
+ thrust::device_vector<TB> d_b(h_b);
543
+ thrust::device_vector<TC> d_c(h_c);
544
+ thrust::device_vector<TC> d_c_out(h_c_out.size(), static_cast<TC>(-1));
545
+
546
+ constexpr uint32_t copy_max_vec_bytes = CopyMaxVecBits / 8;
547
+
548
+ const size_t shared_memory_size = round_up(sizeof(TA) * h_a.size(), copy_max_vec_bytes) +
549
+ round_up(sizeof(TB) * h_b.size(), copy_max_vec_bytes);
550
+
551
+
552
+ auto kernel = cooperative_gemm_kernel_rmem_c<
553
+ ThreadBlockSize, CopyMaxVecBits,
554
+ GMemALayout, GMemBLayout, GMemCLayout,
555
+ SMemALayout, SMemBLayout,
556
+ TA, TB, TC,
557
+ TiledMma,
558
+ ALoadTransform, BLoadTransform, CLoadTransform, CStoreTransform,
559
+ ASMemCopyOp, BSMemCopyOp
560
+ >;
561
+
562
+ ASSERT_EQ(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(shared_memory_size)), 0);
563
+
564
+ kernel<<<1, ThreadBlockSize, shared_memory_size>>>(
565
+ gmem_a_layout,
566
+ gmem_b_layout,
567
+ gmem_c_layout,
568
+ smem_a_layout,
569
+ smem_b_layout,
570
+ thrust::raw_pointer_cast(d_a.data()),
571
+ thrust::raw_pointer_cast(d_b.data()),
572
+ thrust::raw_pointer_cast(d_c.data()),
573
+ thrust::raw_pointer_cast(d_c_out.data()),
574
+ tiled_mma,
575
+ a_load_transform, b_load_transform, c_load_transform, c_store_transform,
576
+ a_smem_copy_op, b_smem_copy_op
577
+ );
578
+
579
+ cudaError_t result = cudaDeviceSynchronize();
580
+ if (result != cudaSuccess) {
581
+ cudaError_t error = cudaGetLastError();
582
+ FAIL() << "Error at kernel sync: " << cudaGetErrorString(error) << "\n";
583
+ }
584
+
585
+ // Copy result data
586
+ h_c_out = d_c_out;
587
+
588
+ // Reference gemm
589
+ auto h_c_ref = host_reference_gemm(alpha,
590
+ make_tensor(h_a.data(), gmem_a_layout),
591
+ make_tensor(h_b.data(), gmem_b_layout),
592
+ beta,
593
+ make_tensor(h_c.data(), gmem_c_layout),
594
+ a_load_transform,
595
+ b_load_transform,
596
+ c_load_transform,
597
+ c_store_transform);
598
+
599
+ // Verify correctness
600
+ verify_gemm_correctness(make_tensor(h_c_out.data(), gmem_c_layout),
601
+ make_tensor(h_c_ref.data(), gmem_c_layout));
602
+ }
603
+
604
+ template<uint32_t ThreadBlockSize,
605
+ uint32_t CopyMaxVecBits,
606
+ class TA,
607
+ class TB,
608
+ class TC,
609
+ class ShapeMNK,
610
+ class TiledMma,
611
+ class ... Ops>
612
+ void test_cooperative_gemm_col_major_layout(ShapeMNK shape_mnk,
613
+ TiledMma tiled_mma,
614
+ Ops ... ops)
615
+ {
616
+ auto a_layout = make_layout(select<0, 2>(shape_mnk));
617
+ auto b_layout = make_layout(select<1, 2>(shape_mnk), GenRowMajor{});
618
+ auto c_layout = make_layout(select<0, 1>(shape_mnk));
619
+
620
+ test_cooperative_gemm<ThreadBlockSize,
621
+ CopyMaxVecBits,
622
+ TA, TB, TC>
623
+ (a_layout,
624
+ b_layout,
625
+ c_layout,
626
+ a_layout,
627
+ b_layout,
628
+ c_layout,
629
+ tiled_mma,
630
+ ops...);
631
+ }
632
+
633
+
634
+ template<uint32_t ThreadBlockSize,
635
+ uint32_t CopyMaxVecBits,
636
+ class TA,
637
+ class TB,
638
+ class TC,
639
+ class SMemAtomLayoutA,
640
+ class SMemAtomLayoutB,
641
+ class SMemAtomLayoutC,
642
+ class ShapeMNK,
643
+ class TiledMma,
644
+ class ... Ops>
645
+ std::enable_if_t<std::conjunction_v<cute::is_layout<SMemAtomLayoutA>,
646
+ cute::is_layout<SMemAtomLayoutB>,
647
+ cute::is_layout<SMemAtomLayoutC>>>
648
+ test_cooperative_gemm_col_major_layout(SMemAtomLayoutA smem_atom_layout_a,
649
+ SMemAtomLayoutB smem_atom_layout_b,
650
+ SMemAtomLayoutC smem_atom_layout_c,
651
+ ShapeMNK shape_mnk,
652
+ TiledMma tiled_mma,
653
+ Ops&& ... ops)
654
+ {
655
+ auto gmem_a_layout = make_layout(select<0, 2>(shape_mnk));
656
+ auto gmem_b_layout = make_layout(select<1, 2>(shape_mnk), GenRowMajor{});
657
+ auto gmem_c_layout = make_layout(select<0, 1>(shape_mnk));
658
+
659
+ auto smem_a_layout = tile_to_shape(
660
+ smem_atom_layout_a,
661
+ make_shape(shape<0>(gmem_a_layout), shape<1>(gmem_a_layout)));
662
+
663
+ auto smem_b_layout = tile_to_shape(
664
+ smem_atom_layout_b,
665
+ make_shape(shape<0>(gmem_b_layout), shape<1>(gmem_b_layout)));
666
+
667
+ auto smem_c_layout = tile_to_shape(
668
+ smem_atom_layout_c,
669
+ make_shape(shape<0>(gmem_c_layout), shape<1>(gmem_c_layout)));
670
+
671
+ test_cooperative_gemm<ThreadBlockSize,
672
+ CopyMaxVecBits,
673
+ TA, TB, TC>
674
+ (gmem_a_layout,
675
+ gmem_b_layout,
676
+ gmem_c_layout,
677
+ smem_a_layout,
678
+ smem_b_layout,
679
+ smem_c_layout,
680
+ tiled_mma,
681
+ ops...);
682
+ }
683
+
684
+
685
+ template<uint32_t ThreadBlockSize,
686
+ uint32_t CopyMaxVecBits,
687
+ class TA,
688
+ class TB,
689
+ class TC,
690
+ class ShapeMNK,
691
+ class TiledMma,
692
+ class ... Ops>
693
+ void test_cooperative_gemm_col_major_layout_rmem_c(ShapeMNK shape_mnk,
694
+ TiledMma tiled_mma,
695
+ Ops ... ops)
696
+ {
697
+ auto a_layout = make_layout(select<0, 2>(shape_mnk));
698
+ auto b_layout = make_layout(select<1, 2>(shape_mnk), GenRowMajor{});
699
+ auto c_layout = make_layout(select<0, 1>(shape_mnk));
700
+
701
+
702
+ test_cooperative_gemm_rmem_c<ThreadBlockSize,
703
+ CopyMaxVecBits,
704
+ TA, TB,TC>
705
+ (a_layout,
706
+ b_layout,
707
+ c_layout,
708
+ a_layout,
709
+ b_layout,
710
+ tiled_mma,
711
+ ops...);
712
+ }
713
+
714
+ template<uint32_t ThreadBlockSize,
715
+ uint32_t CopyMaxVecBits,
716
+ class TA,
717
+ class TB,
718
+ class TC,
719
+ class SMemAtomLayoutA,
720
+ class SMemAtomLayoutB,
721
+ class ShapeMNK,
722
+ class TiledMma,
723
+ class ... Ops>
724
+ std::enable_if_t<std::conjunction_v<cute::is_layout<SMemAtomLayoutA>,
725
+ cute::is_layout<SMemAtomLayoutB>>>
726
+ test_cooperative_gemm_col_major_layout_rmem_c(SMemAtomLayoutA smem_atom_layout_a,
727
+ SMemAtomLayoutB smem_atom_layout_b,
728
+ ShapeMNK shape_mnk,
729
+ TiledMma tiled_mma,
730
+ Ops ... ops)
731
+ {
732
+ auto gmem_a_layout = make_layout(select<0, 2>(shape_mnk));
733
+ auto gmem_b_layout = make_layout(select<1, 2>(shape_mnk), GenRowMajor{});
734
+ auto gmem_c_layout = make_layout(select<0, 1>(shape_mnk));
735
+
736
+ auto smem_a_layout = tile_to_shape(
737
+ smem_atom_layout_a,
738
+ make_shape(shape<0>(gmem_a_layout), shape<1>(gmem_a_layout)));
739
+
740
+ auto smem_b_layout = tile_to_shape(
741
+ smem_atom_layout_b,
742
+ make_shape(shape<0>(gmem_b_layout), shape<1>(gmem_b_layout)));
743
+
744
+ test_cooperative_gemm_rmem_c<ThreadBlockSize, CopyMaxVecBits,
745
+ TA, TB, TC>
746
+ (gmem_a_layout,
747
+ gmem_b_layout,
748
+ gmem_c_layout,
749
+ smem_a_layout,
750
+ smem_b_layout,
751
+ tiled_mma,
752
+ ops...);
753
+ }
754
+
755
+ template<uint32_t ThreadBlockSize,
756
+ typename T,
757
+ class ... Args>
758
+ void test_cooperative_gemm_col_major_layout_rmem_c(Args&& ... args)
759
+ {
760
+ test_cooperative_gemm_col_major_layout_rmem_c<ThreadBlockSize,
761
+ cute::sizeof_bits_v<T>,
762
+ T, T, T>
763
+ (static_cast<Args&&>(args)...);
764
+ }
765
+
766
+ template<uint32_t ThreadBlockSize,
767
+ class T,
768
+ class ... Args>
769
+ void test_cooperative_gemm_col_major_layout(Args&& ... args)
770
+ {
771
+ test_cooperative_gemm_col_major_layout<ThreadBlockSize,
772
+ cute::sizeof_bits_v<T>,
773
+ T, T, T>
774
+ (static_cast<Args&&>(args)...);
775
+ }
build/torch210-cxx11-cu130-aarch64-linux/include/third-party/cutlass/test/unit/cute/hopper/tma_load_testbed.hpp ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include "cutlass_unit_test.h"
35
+
36
+ #include <iostream>
37
+ #include <cstdint>
38
+
39
+ #include <thrust/host_vector.h>
40
+ #include <thrust/device_vector.h>
41
+
42
+ #include <cute/tensor.hpp>
43
+
44
+ namespace cutlass::test {
45
+
46
// Dynamic shared-memory layout for the TMA load test kernel: the destination
// tile buffer plus a single mbarrier used to track TMA completion.
template <class ElementType, class SmemLayout>
struct SharedStorage
{
  // Destination buffer for the TMA load, sized to the smem layout's codomain.
  cute::ArrayEngine<ElementType, cute::cosize_v<SmemLayout>> smem;
  // One 64-bit shared-memory barrier, over-aligned to 16 bytes.
  alignas(16) cute::uint64_t tma_load_mbar[1];
};
52
+
53
+ #if CUDA_12_0_SM90_FEATURES_SUPPORTED
54
+
55
// Test kernel: TMA-load one CTA tile of the input tensor at a time into
// shared memory, then write each tile back out to g_out so the host can
// compare input and output element-for-element.
//
// NOTE(review): g_in is never referenced in the body — the input gmem
// address appears to be carried by the TMA descriptor inside `tma`.
template <class T, class TiledCopy, class CTA_Tiler, class GmemLayout, class SmemLayout>
__global__ void
tma_test_device_cute(T const* g_in, T* g_out,
                     CUTE_GRID_CONSTANT TiledCopy const tma, CTA_Tiler cta_tiler,
                     GmemLayout gmem_layout, SmemLayout smem_layout)
{
  using namespace cute;
  // The CTA tile must match the smem buffer extents, mode by mode.
  CUTE_STATIC_ASSERT_V(product_each(shape(cta_tiler)) == product_each(shape(smem_layout)));

  // Use Shared Storage structure to allocate and distribute aligned SMEM addresses
  extern __shared__ char shared_memory[];
  using SharedStorage = SharedStorage<T, SmemLayout>;
  SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(shared_memory);

  // Construct SMEM tensor
  Tensor sA = make_tensor(make_smem_ptr(shared_storage.smem.begin()), smem_layout); // (CTA_TILE_M,CTA_TILE_N,...)
  // Shared memory barriers use 64bits in SMEM for synchronization
  uint64_t* tma_load_mbar = shared_storage.tma_load_mbar;

  // TMA requires special handling of strides to deal with coord codomain mapping
  // Represent the full tensors -- get these from TMA
  Tensor mA = tma.get_tma_tensor(shape(gmem_layout));
  Tensor mB = make_tensor(make_gmem_ptr<T>(g_out), gmem_layout);

  constexpr int R = rank_v<CTA_Tiler>;
  Tensor gA = flat_divide(mA, cta_tiler); // (CTA_TILE_M,CTA_TILE_N,...REST_M,REST_N,...)
  Tensor gB = flat_divide(mB, cta_tiler); // (CTA_TILE_M,CTA_TILE_N,...REST_M,REST_N,...)

  //
  // Prepare the TMA_LOAD
  //

  auto cta_tma = tma.get_slice(Int<0>{}); // CTA slice
  Tensor tAgA_x = cta_tma.partition_S(gA); // (TMA,TMA_M,TMA_N,REST_M,REST_N)
  Tensor tAsA_x = cta_tma.partition_D(sA); // (TMA,TMA_M,TMA_N)

#if 0
  if (thread0()) {
    print(tma);
    print("TILE : "); print(cta_tiler); print("\n");
    print(" mA : "); print( mA); print("\n");
    print(" mB : "); print( mB); print("\n");
    print(" gA : "); print( gA); print("\n");
    print(" gB : "); print( gB); print("\n");
    print(" sA : "); print( sA); print("\n");
    print("tAgA_x: "); print(tAgA_x); print("\n");
    print("tAsA_x: "); print(tAsA_x); print("\n");
  }
#endif

  //
  // Perform the TMA_LOAD
  //

  // INPUT: Group the REST_X modes and the TMA_X modes to easily iterate through the tiles
  Tensor tAgA = group_modes<1,rank(tAgA_x)>(tAgA_x); // (TMA,REST)
  Tensor tAsA = group_modes<1,rank(tAsA_x)>(tAsA_x); // (TMA,REST)
  // smem holds exactly one tile at a time.
  static_assert(size<1>(tAsA) == 1);

  // OUTPUT: Group the CTA_TILE_X modes and REST_X modes for output
  Tensor tBgB = group_modes<0,R>(group_modes<R,rank(gB)>(gB)); // (CTA_TILE, REST)

#if 0
  if (thread0()) {
    print("tAgA : "); print(tAgA); print("\n");
    print("tAsA : "); print(tAsA); print("\n");
    print("tBgB : "); print(tBgB); print("\n");
  }
#endif

  // Test L2 prefetch
  if (threadIdx.x == 0) {
    prefetch(tma, tAgA);
  }

  // Loop over the TMA stages, using smem as our buffer
  for (int stage = 0; stage < size<1>(tAgA); ++stage)
  {
    // Set the bytes transferred in this TMA transaction (may involve multiple issues)
    constexpr int kTmaTransactionBytes = sizeof(make_tensor_like(tensor<0>(tAsA)));

    // A single thread initializes the barrier and issues the TMA copy.
    if (threadIdx.x == 0)
    {
      /// Initialize shared memory barrier
      // Re-initialized each stage so every wait below observes phase 0.
      tma_load_mbar[0] = 0;
      cute::initialize_barrier(tma_load_mbar[0], 1 /*numThreads*/);
      cute::set_barrier_transaction_bytes(tma_load_mbar[0], kTmaTransactionBytes);

      copy(tma.with(tma_load_mbar[0]), tAgA(_,stage), tAsA(_,0));
    }
    __syncthreads();

    /// Wait on the shared memory barrier until the phase bit flips from kPhaseBit value
    constexpr int kPhaseBit = 0;
    cute::wait_barrier(tma_load_mbar[0], kPhaseBit);

    //
    // Write out trivially smem -> gmem
    //

    // Subbyte elements could cause race conditions, so be even more conservative
    if (thread0()) {
      copy(sA, tBgB(_,stage));
    }

    __syncthreads();
  }
}
163
+
164
// Host driver: builds a TMA copy for the given gmem/smem layouts and CTA
// tile, launches tma_test_device_cute to round-trip data gmem -> smem ->
// gmem, and checks the output equals the input. Returns the TMA copy object
// so callers can inspect or reuse it.
//
// Buffers are allocated as raw bytes (sized via sizeof_bits), so subbyte
// element types T are supported.
template <class T, class TmaType = T, class CopyOp, class GMEM_Layout, class SMEM_Layout, class CTA_Tile>
auto
test_tma_load(CopyOp const& copy_op,
              GMEM_Layout const& gmem_layout,
              SMEM_Layout const& smem_layout,
              CTA_Tile const& cta_tile)
{
  using namespace cute;

  // Allocate and initialize host test data
  // N = number of bytes needed to back cosize(gmem_layout) elements of T.
  size_t N = ceil_div(cosize(gmem_layout) * sizeof_bits<T>::value, 8);
  thrust::host_vector<uint8_t> h_in(N);
  for (size_t i = 0; i < h_in.size(); ++i) {
    h_in[i] = uint8_t(i % 13); // arbitrary repeating byte pattern
  }
  Tensor hA_in = make_tensor(recast_ptr<T>(h_in.data()), gmem_layout);

  // Allocate and initialize device test data
  thrust::device_vector<uint8_t> d_in = h_in;
  // Output poisoned with 0xFF so untouched bytes are detectable.
  thrust::device_vector<uint8_t> d_out(h_in.size(), uint8_t(-1)); // overflow uint

  // Create TMA for this device Tensor
  Tensor gA = make_tensor(make_gmem_ptr<T>(raw_pointer_cast(d_in.data())), gmem_layout);
  auto tma = make_tma_copy<TmaType>(copy_op, gA, smem_layout, cta_tile, Int<1>{});
  //print(tma);

  // Launch
  // Single CTA of 128 threads; dynamic smem sized for one tile + barrier.
  int smem_size = int(sizeof(SharedStorage<T, decltype(smem_layout)>));
  tma_test_device_cute<<<1, 128, smem_size>>>(
    reinterpret_cast<T const*>(raw_pointer_cast(d_in.data())),
    reinterpret_cast<T*>     (raw_pointer_cast(d_out.data())),
    tma, cta_tile,
    gmem_layout,
    smem_layout);

  // Copy results back to host
  thrust::host_vector<uint8_t> h_out = d_out;
  Tensor hA_out = make_tensor(recast_ptr<T>(h_out.data()), gmem_layout);

  // Validate the results. Print only the first 3 errors.
  int count = 3;
  for (int i = 0; i < int(size(hA_out)) && count > 0; ++i) {
    EXPECT_EQ(hA_in(i), hA_out(i));
    if (hA_in(i) != hA_out(i)) {
      --count;
    }
  }

  return tma;
}
214
+
215
+ #endif
216
+
217
+ } // end namespace cutlass::test