Kernels
danieldk HF Staff committed on
Commit
38c7386
·
verified ·
1 Parent(s): e7410d9

Build uploaded using `kernels` (batch 7/10).

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm50.h +432 -0
  2. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm60.h +252 -0
  3. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm61.h +142 -0
  4. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm70.h +661 -0
  5. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm75.h +789 -0
  6. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm80.h +1500 -0
  7. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm89.h +641 -0
  8. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm90.h +241 -0
  9. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm80.h +1234 -0
  10. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm89.h +406 -0
  11. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/reg_reconfig.h +89 -0
  12. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd.h +125 -0
  13. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm60.h +104 -0
  14. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm61.h +147 -0
  15. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/synclog.hpp +1271 -0
  16. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma.h +218 -0
  17. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm70.h +132 -0
  18. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm72.h +206 -0
  19. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm75.h +203 -0
  20. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array.h +2860 -0
  21. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_planar_complex.h +89 -0
  22. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_subbyte.h +561 -0
  23. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/barrier.h +377 -0
  24. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/bfloat16.h +679 -0
  25. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3.h +143 -0
  26. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3_types.h +78 -0
  27. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/block_striped.h +267 -0
  28. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cluster_launch.hpp +394 -0
  29. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/complex.h +821 -0
  30. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/constants.h +1239 -0
  31. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_builder.hpp +94 -0
  32. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_conv.hpp +63 -0
  33. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/detail.hpp +271 -0
  34. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp +917 -0
  35. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp +785 -0
  36. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv2d_problem_size.h +658 -0
  37. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv3d_problem_size.h +519 -0
  38. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convnd_problem_shape.hpp +601 -0
  39. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convolution.h +194 -0
  40. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/detail.hpp +137 -0
  41. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp +448 -0
  42. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/direct_convolution.h +270 -0
  43. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h +388 -0
  44. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h +269 -0
  45. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/dispatch_policy.hpp +136 -0
  46. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/conv_universal.hpp +65 -0
  47. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d.h +322 -0
  48. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h +1927 -0
  49. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h +2007 -0
  50. build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h +357 -0
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm50.h ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/arch/mma.h"
38
+ #include "cutlass/complex.h"
39
+ #include "cutlass/quaternion.h"
40
+ #include "cutlass/functional.h"
41
+
42
+ #include "cutlass/layout/matrix.h"
43
+ #include "cutlass/gemm/gemm.h"
44
+
45
+ /////////////////////////////////////////////////////////////////////////////////////////////////
46
+
47
+ namespace cutlass {
48
+ namespace arch {
49
+
50
+ /////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ /// Matrix multiply-add operation
53
+ template <
54
+ /// Layout of A matrix
55
+ typename LayoutA,
56
+ /// Layout of B matrix
57
+ typename LayoutB,
58
+ /// Layout of C matrix
59
+ typename LayoutC
60
+ >
61
+ struct Mma<gemm::GemmShape<1, 1, 1>, 1, float, LayoutA, float, LayoutB, float, LayoutC, OpMultiplyAdd> {
62
+
63
+ using Shape = gemm::GemmShape<1, 1, 1>;
64
+ using Operator = OpMultiplyAdd;
65
+ using ElementC = float;
66
+
67
+ CUTLASS_HOST_DEVICE
68
+ void operator()(
69
+ Array<float, 1> &d,
70
+ Array<float, 1> const &a,
71
+ Array<float, 1> const &b,
72
+ Array<float, 1> const &c
73
+ ) {
74
+ d[0] = a[0] * b[0] + c[0];
75
+ }
76
+ };
77
+
78
+ /////////////////////////////////////////////////////////////////////////////////////////////////
79
+
80
+ /// Matrix multiply-add operation
81
+ template <
82
+ /// Layout of A matrix
83
+ typename LayoutA,
84
+ /// Layout of B matrix
85
+ typename LayoutB,
86
+ /// Layout of C matrix
87
+ typename LayoutC
88
+ >
89
+ struct Mma<gemm::GemmShape<1, 1, 1>, 1, double, LayoutA, double, LayoutB, double, LayoutC, OpMultiplyAdd> {
90
+
91
+ using Shape = gemm::GemmShape<1, 1, 1>;
92
+ using Operator = OpMultiplyAdd;
93
+ using ElementC = double;
94
+
95
+ CUTLASS_HOST_DEVICE
96
+ void operator()(
97
+ Array<double, 1> &d,
98
+ Array<double, 1> const &a,
99
+ Array<double, 1> const &b,
100
+ Array<double, 1> const &c
101
+ ) {
102
+
103
+ d[0] = a[0] * b[0] + c[0];
104
+ }
105
+ };
106
+
107
+ /////////////////////////////////////////////////////////////////////////////////////////////////
108
+
109
+ /// Matrix multiply-add operation
110
+ template <
111
+ /// Layout of A matrix
112
+ typename LayoutA,
113
+ /// Layout of B matrix
114
+ typename LayoutB,
115
+ /// Layout of C matrix
116
+ typename LayoutC
117
+ >
118
+ struct Mma<gemm::GemmShape<1, 1, 1>, 1, int, LayoutA, int, LayoutB, int, LayoutC, OpMultiplyAdd> {
119
+
120
+ using Shape = gemm::GemmShape<1, 1, 1>;
121
+ using Operator = OpMultiplyAdd;
122
+ using ElementC = int;
123
+
124
+ CUTLASS_HOST_DEVICE
125
+ void operator()(
126
+ Array<int, 1> &d,
127
+ Array<int, 1> const &a,
128
+ Array<int, 1> const &b,
129
+ Array<int, 1> const &c
130
+ ) {
131
+
132
+ d[0] = a[0] * b[0] + c[0];
133
+ }
134
+ };
135
+
136
+ /////////////////////////////////////////////////////////////////////////////////////////////////
137
+
138
+ /// Matrix multiply-add operation
139
+ template <
140
+ /// Layout of A matrix
141
+ typename LayoutA,
142
+ /// Layout of B matrix
143
+ typename LayoutB,
144
+ /// Layout of C matrix
145
+ typename LayoutC
146
+ >
147
+ struct Mma<
148
+ gemm::GemmShape<1, 1, 1>,
149
+ 1,
150
+ complex<float>,
151
+ LayoutA,
152
+ complex<float>,
153
+ LayoutB,
154
+ complex<float>,
155
+ LayoutC,
156
+ OpMultiplyAdd> {
157
+
158
+ using Shape = gemm::GemmShape<1, 1, 1>;
159
+ using Operator = OpMultiplyAddComplex;
160
+ using ElementC = complex<float>;
161
+
162
+ CUTLASS_HOST_DEVICE
163
+ void operator()(
164
+ Array<complex<float>, 1> &d,
165
+ Array<complex<float>, 1> const &a,
166
+ Array<complex<float>, 1> const &b,
167
+ Array<complex<float>, 1> const &c
168
+ ) {
169
+
170
+ d[0].real() = a[0].real() * b[0].real() + c[0].real();
171
+ d[0].imag() = a[0].imag() * b[0].real() + c[0].imag();
172
+ d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();
173
+ d[0].imag() = a[0].real() * b[0].imag() + d[0].imag();
174
+ }
175
+ };
176
+
177
+ /////////////////////////////////////////////////////////////////////////////////////////////////
178
+
179
+ /// Matrix multiply-add operation
180
+ template <
181
+ /// Layout of A matrix
182
+ typename LayoutA,
183
+ /// Layout of B matrix
184
+ typename LayoutB,
185
+ /// Layout of C matrix
186
+ typename LayoutC
187
+ >
188
+ struct Mma<
189
+ gemm::GemmShape<1, 1, 1>,
190
+ 1,
191
+ complex<float>,
192
+ LayoutA,
193
+ float,
194
+ LayoutB,
195
+ complex<float>,
196
+ LayoutC,
197
+ OpMultiplyAdd> {
198
+
199
+ using Shape = gemm::GemmShape<1, 1, 1>;
200
+ using Operator = OpMultiplyAddComplex;
201
+ using ElementC = complex<float>;
202
+
203
+ CUTLASS_HOST_DEVICE
204
+ void operator()(
205
+ Array<complex<float>, 1> &d,
206
+ Array<complex<float>, 1> const &a,
207
+ Array<float, 1> const &b,
208
+ Array<complex<float>, 1> const &c
209
+ ) {
210
+
211
+ d[0].real() = a[0].real() * b[0] + c[0].real();
212
+ d[0].imag() = a[0].imag() * b[0] + c[0].imag();
213
+ }
214
+ };
215
+
216
+ /////////////////////////////////////////////////////////////////////////////////////////////////
217
+
218
+ /// Matrix multiply-add operation
219
+ template <
220
+ /// Layout of A matrix
221
+ typename LayoutA,
222
+ /// Layout of B matrix
223
+ typename LayoutB,
224
+ /// Layout of C matrix
225
+ typename LayoutC
226
+ >
227
+ struct Mma<
228
+ gemm::GemmShape<1, 1, 1>,
229
+ 1,
230
+ float,
231
+ LayoutA,
232
+ complex<float>,
233
+ LayoutB,
234
+ complex<float>,
235
+ LayoutC,
236
+ OpMultiplyAdd> {
237
+
238
+ using Shape = gemm::GemmShape<1, 1, 1>;
239
+ using Operator = OpMultiplyAddComplex;
240
+ using ElementC = complex<float>;
241
+
242
+ CUTLASS_HOST_DEVICE
243
+ void operator()(
244
+ Array<complex<float>, 1> &d,
245
+ Array<float, 1> const &a,
246
+ Array<complex<float>, 1> const &b,
247
+ Array<complex<float>, 1> const &c
248
+ ) {
249
+
250
+ d[0].real() = a[0] * b[0].real() + c[0].real();
251
+ d[0].imag() = a[0] * b[0].imag() + d[0].imag();
252
+ }
253
+ };
254
+
255
+ /////////////////////////////////////////////////////////////////////////////////////////////////
256
+
257
+ /// Matrix multiply-add operation
258
+ template <
259
+ /// Layout of A matrix
260
+ typename LayoutA,
261
+ /// Layout of B matrix
262
+ typename LayoutB,
263
+ /// Layout of C matrix
264
+ typename LayoutC
265
+ >
266
+ struct Mma<
267
+ gemm::GemmShape<1, 1, 1>,
268
+ 1,
269
+ complex<double>,
270
+ LayoutA,
271
+ complex<double>,
272
+ LayoutB,
273
+ complex<double>,
274
+ LayoutC,
275
+ OpMultiplyAdd> {
276
+
277
+ using Shape = gemm::GemmShape<1, 1, 1>;
278
+ using Operator = OpMultiplyAddComplex;
279
+ using ElementC = complex<double>;
280
+
281
+ CUTLASS_HOST_DEVICE
282
+ void operator()(
283
+ Array<complex<double>, 1> &d,
284
+ Array<complex<double>, 1> const &a,
285
+ Array<complex<double>, 1> const &b,
286
+ Array<complex<double>, 1> const &c
287
+ ) {
288
+
289
+ d[0].real() = a[0].real() * b[0].real() + c[0].real();
290
+ d[0].imag() = a[0].imag() * b[0].real() + c[0].imag();
291
+ d[0].real() = -a[0].imag() * b[0].imag() + d[0].real();
292
+ d[0].imag() = a[0].real() * b[0].imag() + d[0].imag();
293
+ }
294
+ };
295
+
296
+ /// Matrix multiply-add operation
297
+ template <
298
+ /// Layout of A matrix
299
+ typename LayoutA,
300
+ /// Layout of B matrix
301
+ typename LayoutB,
302
+ /// Layout of C matrix
303
+ typename LayoutC
304
+ >
305
+ struct Mma<
306
+ gemm::GemmShape<1, 1, 1>,
307
+ 1,
308
+ complex<double>,
309
+ LayoutA,
310
+ double,
311
+ LayoutB,
312
+ complex<double>,
313
+ LayoutC,
314
+ OpMultiplyAdd> {
315
+
316
+ using Shape = gemm::GemmShape<1, 1, 1>;
317
+ using Operator = OpMultiplyAddComplex;
318
+ using ElementC = complex<double>;
319
+
320
+ CUTLASS_HOST_DEVICE
321
+ void operator()(
322
+ Array<complex<double>, 1> &d,
323
+ Array<complex<double>, 1> const &a,
324
+ Array<double, 1> const &b,
325
+ Array<complex<double>, 1> const &c
326
+ ) {
327
+
328
+ d[0].real() = a[0].real() * b[0] + c[0].real();
329
+ d[0].imag() = a[0].imag() * b[0] + c[0].imag();
330
+ }
331
+ };
332
+
333
+ /// Matrix multiply-add operation
334
+ template <
335
+ /// Layout of A matrix
336
+ typename LayoutA,
337
+ /// Layout of B matrix
338
+ typename LayoutB,
339
+ /// Layout of C matrix
340
+ typename LayoutC
341
+ >
342
+ struct Mma<
343
+ gemm::GemmShape<1, 1, 1>,
344
+ 1,
345
+ double,
346
+ LayoutA,
347
+ complex<double>,
348
+ LayoutB,
349
+ complex<double>,
350
+ LayoutC,
351
+ OpMultiplyAdd> {
352
+
353
+ using Shape = gemm::GemmShape<1, 1, 1>;
354
+ using Operator = OpMultiplyAddComplex;
355
+ using ElementC = complex<double>;
356
+
357
+ CUTLASS_HOST_DEVICE
358
+ void operator()(
359
+ Array<complex<double>, 1> &d,
360
+ Array<double, 1> const &a,
361
+ Array<complex<double>, 1> const &b,
362
+ Array<complex<double>, 1> const &c
363
+ ) {
364
+
365
+ d[0].real() = a[0] * b[0].real() + c[0].real();
366
+ d[0].imag() = a[0] * b[0].imag() + d[0].imag();
367
+ }
368
+ };
369
+
370
+ /////////////////////////////////////////////////////////////////////////////////////////////////
371
+
372
+ /// Matrix multiply-add operation
373
+ template <
374
+ /// Layout of A matrix
375
+ typename LayoutA,
376
+ /// Layout of B matrix
377
+ typename LayoutB,
378
+ /// Layout of C matrix
379
+ typename LayoutC
380
+ >
381
+ struct Mma<gemm::GemmShape<1, 1, 1>, 1, half_t, LayoutA, half_t, LayoutB, float, LayoutC, OpMultiplyAdd> {
382
+
383
+ using Shape = gemm::GemmShape<1, 1, 1>;
384
+ using Operator = OpMultiplyAdd;
385
+ using ElementC = float;
386
+
387
+ CUTLASS_HOST_DEVICE
388
+ void operator()(
389
+ Array<float, 1> &d,
390
+ Array<half_t, 1> const &a,
391
+ Array<half_t, 1> const &b,
392
+ Array<float, 1> const &c
393
+ ) {
394
+ d[0] = float(a[0]) * float(b[0]) + c[0];
395
+ }
396
+ };
397
+
398
+ /////////////////////////////////////////////////////////////////////////////////////////////////
399
+
400
+ /// Matrix multiply-add operation for Quaternions
401
+ template <
402
+ /// Layout of A matrix
403
+ typename LayoutA,
404
+ /// Layout of B matrix
405
+ typename LayoutB,
406
+ /// Layout of C matrix
407
+ typename LayoutC
408
+ >
409
+ struct Mma<gemm::GemmShape<1, 1, 1>, 1, Quaternion<float>, LayoutA, Quaternion<float>, LayoutB, Quaternion<float>, LayoutC, OpMultiplyAdd> {
410
+
411
+ using Shape = gemm::GemmShape<1, 1, 1>;
412
+ using Operator = OpMultiplyAdd;
413
+ using Element = Quaternion<float>;
414
+ using ElementC = Element;
415
+
416
+ CUTLASS_HOST_DEVICE
417
+ void operator()(
418
+ Array<Element, 1> &d,
419
+ Array<Element, 1> const &a,
420
+ Array<Element, 1> const &b,
421
+ Array<Element, 1> const &c
422
+ ) {
423
+ multiply_add<Element, Element, Element> op;
424
+ d[0] = op(a[0], b[0], c[0]);
425
+ }
426
+
427
+ };
428
+
429
+ }
430
+ }
431
+
432
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm60.h ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <cuda_fp16.h>
38
+
39
+ #include "cutlass/arch/mma.h"
40
+
41
+ #include "cutlass/layout/matrix.h"
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ namespace cutlass {
46
+ namespace arch {
47
+
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ /// Matrix multiply-add operation
51
+ template <typename LayoutA, typename LayoutB, typename LayoutC>
52
+ struct Mma<
53
+ gemm::GemmShape<2,1,1>,
54
+ 1,
55
+ half_t,
56
+ LayoutA,
57
+ half_t,
58
+ LayoutB,
59
+ half_t,
60
+ LayoutC,
61
+ OpMultiplyAdd> {
62
+
63
+ using Shape = gemm::GemmShape<2, 1, 1>;
64
+ using Operator = OpMultiplyAdd;
65
+ using ElementC = half_t;
66
+
67
+ CUTLASS_HOST_DEVICE
68
+ void operator()(
69
+ Array<half_t, 2> &d,
70
+ Array<half_t, 2> const &a,
71
+ Array<half_t, 1> const &b,
72
+ Array<half_t, 2> const &c
73
+ ) {
74
+
75
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
76
+
77
+ __half2 const & A = reinterpret_cast<__half2 const &>(a);
78
+ __half2 B = __half2half2(reinterpret_cast<__half const &>(b));
79
+ __half2 const & C = reinterpret_cast<__half2 const &>(c);
80
+
81
+ __half2 D = __hfma2(A, B, C);
82
+
83
+ d = reinterpret_cast<Array<half_t, 2> &>(D);
84
+
85
+ #else
86
+ CUTLASS_PRAGMA_UNROLL
87
+ for (int i = 0; i < 2; ++i) {
88
+ d[i] = a[i] * b[0] + c[i];
89
+ }
90
+ #endif
91
+ }
92
+ };
93
+
94
+ /////////////////////////////////////////////////////////////////////////////////////////////////
95
+
96
+ /// Matrix multiply-add operation
97
+ template <typename LayoutA, typename LayoutB>
98
+ struct Mma<
99
+ gemm::GemmShape<1,2,1>,
100
+ 1,
101
+ half_t,
102
+ LayoutA,
103
+ half_t,
104
+ LayoutB,
105
+ half_t,
106
+ layout::RowMajor,
107
+ OpMultiplyAdd> {
108
+
109
+ using Shape = gemm::GemmShape<1, 2, 1>;
110
+ using Operator = OpMultiplyAdd;
111
+ using ElementC = half_t;
112
+
113
+ CUTLASS_HOST_DEVICE
114
+ void operator()(
115
+ Array<half_t, 2> &d,
116
+ Array<half_t, 1> const &a,
117
+ Array<half_t, 2> const &b,
118
+ Array<half_t, 2> const &c
119
+ ) {
120
+
121
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
122
+
123
+ __half2 const & A = __half2half2(reinterpret_cast<__half const &>(a));
124
+ __half2 B = reinterpret_cast<__half2 const &>(b);
125
+ __half2 const & C = reinterpret_cast<__half2 const &>(c);
126
+
127
+ __half2 D = __hfma2(A, B, C);
128
+
129
+ d = reinterpret_cast<Array<half_t, 2> &>(D);
130
+
131
+ #else
132
+ CUTLASS_PRAGMA_UNROLL
133
+ for (int i = 0; i < 2; ++i) {
134
+ d[i] = a[0] * b[i] + c[i];
135
+ }
136
+ #endif
137
+ }
138
+ };
139
+
140
+ /////////////////////////////////////////////////////////////////////////////////////////////////
141
+
142
+ /// Matrix multiply-add operation
143
+ template <>
144
+ struct Mma <
145
+ gemm::GemmShape<2, 2, 1>,
146
+ 1,
147
+ half_t,
148
+ layout::ColumnMajor,
149
+ half_t,
150
+ layout::RowMajor,
151
+ half_t,
152
+ layout::ColumnMajor,
153
+ OpMultiplyAdd> {
154
+
155
+ using Shape = gemm::GemmShape<2, 2, 1>;
156
+ using Operator = OpMultiplyAdd;
157
+ using ElementC = half_t;
158
+
159
+ CUTLASS_HOST_DEVICE
160
+ void operator()(
161
+ Array<half_t, 4> &d,
162
+ Array<half_t, 2> const &a,
163
+ Array<half_t, 2> const &b,
164
+ Array<half_t, 4> const &c
165
+ ) {
166
+
167
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
168
+
169
+ __half2 const & A = reinterpret_cast<__half2 const &>(a);
170
+ __half2 Blo = __low2half2(reinterpret_cast<__half2 const &>(b));
171
+ __half2 Bhi = __high2half2(reinterpret_cast<__half2 const &>(b));
172
+
173
+ __half2 const *C = reinterpret_cast<__half2 const *>(&c);
174
+
175
+ __half2 Dlo = __hfma2(A, Blo, C[0]);
176
+ __half2 Dhi = __hfma2(A, Bhi, C[1]);
177
+
178
+ Array<half_t, 2> * D = reinterpret_cast<Array<half_t, 2> *>(&d);
179
+
180
+ D[0] = reinterpret_cast<Array<half_t, 2> const &>(Dlo);
181
+ D[1] = reinterpret_cast<Array<half_t, 2> const &>(Dhi);
182
+
183
+ #else
184
+ CUTLASS_PRAGMA_UNROLL
185
+ for (int j = 0; j < 2; ++j) {
186
+ CUTLASS_PRAGMA_UNROLL
187
+ for (int i = 0; i < 2; ++i) {
188
+ d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j];
189
+ }
190
+ }
191
+ #endif
192
+ }
193
+ };
194
+
195
+ /////////////////////////////////////////////////////////////////////////////////////////////////
196
+
197
+ /// Matrix multiply-add operation
198
+ template <>
199
+ struct Mma<
200
+ gemm::GemmShape<2, 2, 1>,
201
+ 1,
202
+ half_t,
203
+ layout::ColumnMajor,
204
+ half_t,
205
+ layout::RowMajor,
206
+ half_t,
207
+ layout::RowMajor,
208
+ OpMultiplyAdd> {
209
+
210
+ using Shape = gemm::GemmShape<2, 2, 1>;
211
+ using Operator = OpMultiplyAdd;
212
+ using ElementC = half_t;
213
+
214
+ CUTLASS_HOST_DEVICE
215
+ void operator()(
216
+ Array<half_t, 4> &d,
217
+ Array<half_t, 2> const &a,
218
+ Array<half_t, 2> const &b,
219
+ Array<half_t, 4> const &c
220
+ ) {
221
+
222
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600))
223
+
224
+ __half2 Alo = __low2half2(reinterpret_cast<__half2 const &>(a));
225
+ __half2 Ahi = __high2half2(reinterpret_cast<__half2 const &>(a));
226
+ __half2 const & B = reinterpret_cast<__half2 const &>(b);
227
+
228
+ __half2 const *C = reinterpret_cast<__half2 const *>(&c);
229
+
230
+ __half2 Dlo = __hfma2(Alo, B, C[0]);
231
+ __half2 Dhi = __hfma2(Ahi, B, C[1]);
232
+
233
+ Array<half_t, 2> * D = reinterpret_cast<Array<half_t, 2> *>(&d);
234
+
235
+ D[0] = reinterpret_cast<Array<half_t, 2> &>(Dlo);
236
+ D[1] = reinterpret_cast<Array<half_t, 2> &>(Dhi);
237
+ #else
238
+ CUTLASS_PRAGMA_UNROLL
239
+ for (int i = 0; i < 2; ++i) {
240
+ CUTLASS_PRAGMA_UNROLL
241
+ for (int j = 0; j < 2; ++j) {
242
+ d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j];
243
+ }
244
+ }
245
+ #endif
246
+ }
247
+ };
248
+
249
+ /////////////////////////////////////////////////////////////////////////////////////////////////
250
+
251
+ }
252
+ }
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm61.h ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/layout/matrix.h"
38
+
39
+ /////////////////////////////////////////////////////////////////////////////////////////////////
40
+
41
+ namespace cutlass {
42
+ namespace arch {
43
+
44
+ /////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ /// Matrix multiply-add operation
47
+ template <typename LayoutA, typename LayoutB, typename LayoutC>
48
+ struct Mma<
49
+ gemm::GemmShape<1,1,4>,
50
+ 1,
51
+ int8_t,
52
+ LayoutA,
53
+ int8_t,
54
+ LayoutB,
55
+ int,
56
+ LayoutC,
57
+ OpMultiplyAdd> {
58
+
59
+ using Shape = gemm::GemmShape<1, 1, 4>;
60
+ using Operator = OpMultiplyAdd;
61
+ using ElementC = int;
62
+
63
+ CUTLASS_HOST_DEVICE
64
+ void operator()(
65
+ Array<int, 1> &d,
66
+ Array<int8_t, 4> const &a,
67
+ Array<int8_t, 4> const &b,
68
+ Array<int, 1> const &c
69
+ ) {
70
+
71
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
72
+
73
+ unsigned const &A = reinterpret_cast<unsigned const &>(a);
74
+ unsigned const &B = reinterpret_cast<unsigned const &>(b);
75
+
76
+ asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
77
+ : "=r"(d[0])
78
+ : "r"(A), "r"(B), "r"(c[0]));
79
+
80
+ #else
81
+
82
+ d[0] = c[0];
83
+
84
+ CUTLASS_PRAGMA_UNROLL
85
+ for (int k = 0; k < 4; ++k) {
86
+ d[0] += a[k] * b[k];
87
+ }
88
+
89
+ #endif
90
+ }
91
+ };
92
+
93
+ /////////////////////////////////////////////////////////////////////////////////////////////////
94
+
95
+ /// Matrix multiply-add operation
96
+ template <typename LayoutC>
97
+ struct Mma<
98
+ gemm::GemmShape<1, 1, 2>,
99
+ 1,
100
+ int16_t,
101
+ layout::RowMajor,
102
+ int16_t,
103
+ layout::ColumnMajor,
104
+ int,
105
+ LayoutC,
106
+ OpMultiplyAdd> {
107
+
108
+ using Shape = gemm::GemmShape<1, 1, 2>;
109
+ using Operator = OpMultiplyAdd;
110
+ using ElementC = int;
111
+
112
+ CUTLASS_HOST_DEVICE
113
+ void operator()(
114
+ Array<int, 1> &d,
115
+ Array<int16_t, 2> const &a,
116
+ Array<int16_t, 2> const &b,
117
+ Array<int, 1> const &c
118
+ ) {
119
+
120
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610))
121
+
122
+ unsigned const &A = reinterpret_cast<unsigned const &>(a);
123
+ unsigned const &B = reinterpret_cast<unsigned const &>(b);
124
+
125
+ asm volatile("dp2a.s32.s32 %0, %1, %2, %3;"
126
+ : "=r"(d[0])
127
+ : "r"(A), "r"(B), "r"(c[0]));
128
+ #else
129
+ d[0] = c[0];
130
+
131
+ CUTLASS_PRAGMA_UNROLL
132
+ for (int k = 0; k < 2; ++k) {
133
+ d[0] += a[k] * b[k];
134
+ }
135
+ #endif
136
+ }
137
+ };
138
+
139
+ /////////////////////////////////////////////////////////////////////////////////////////////////
140
+
141
+ }
142
+ }
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm70.h ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+ #pragma once
35
+ #include "cutlass/cutlass.h"
36
+ #include CUDA_STD_HEADER(cassert)
37
+
38
+ #include "mma.h"
39
+ #include "cutlass/layout/matrix.h"
40
+ #include "cutlass/numeric_types.h"
41
+
42
+ #if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
43
+ #define CUTLASS_ARCH_MMA_SM70_SUPPORTED
44
+ #endif
45
+
46
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
47
+
48
+ #if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 &&__CUDACC_VER_MINOR__ >= 1))
49
+ #define CUTLASS_ARCH_MMA_SM70_ENABLED
50
+ #endif
51
+
52
+ #endif
53
+
54
+ /////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ namespace cutlass {
57
+ namespace arch {
58
+
59
+ /////////////////////////////////////////////////////////////////////////////////////////////////
60
+ //
61
+ // Matrix multiply accumulate 884 - FP16 accumulation
62
+ //
63
+ /////////////////////////////////////////////////////////////////////////////////////////////////
64
+
65
+ /// Matrix multiply-add operation: F16 = F16 * F16 + F16
66
+ template <>
67
+ struct Mma<
68
+ gemm::GemmShape<8,8,4>,
69
+ 8,
70
+ half_t,
71
+ layout::ColumnMajor,
72
+ half_t,
73
+ layout::ColumnMajor,
74
+ half_t,
75
+ layout::RowMajor,
76
+ OpMultiplyAdd> {
77
+
78
+ using Shape = gemm::GemmShape<8, 8, 4>;
79
+
80
+ using ElementA = half_t;
81
+ using LayoutA = layout::ColumnMajor;
82
+ using FragmentA = Array<half_t, 4>;
83
+
84
+ using ElementB = half_t;
85
+ using LayoutB = layout::ColumnMajor;
86
+ using FragmentB = Array<half_t, 4>;
87
+
88
+ using ElementC = half_t;
89
+ using LayoutC = layout::RowMajor;
90
+ using FragmentC = Array<half_t, 8>;
91
+
92
+ using Operator = OpMultiplyAdd;
93
+ using ArchTag = arch::Sm70;
94
+
95
+ CUTLASS_HOST_DEVICE
96
+ void operator()(
97
+ FragmentC &d,
98
+ FragmentA const &a,
99
+ FragmentB const &b,
100
+ FragmentC const &c
101
+ ) {
102
+
103
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
104
+
105
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
106
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
107
+ unsigned const *C = reinterpret_cast<unsigned const *>(&c);
108
+ unsigned *D = reinterpret_cast<unsigned *>(&d);
109
+
110
+ asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
111
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
112
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
113
+ );
114
+
115
+ #else
116
+ assert(0);
117
+ #if defined(__CUDA_ARCH__)
118
+ asm volatile ("brkpt;\n" ::);
119
+ #endif
120
+ #endif
121
+ }
122
+ };
123
+
124
+ /// Matrix multiply-add operation: F16 = F16 * F16 + F16
125
+ template <>
126
+ struct Mma<
127
+ gemm::GemmShape<8, 8, 4>,
128
+ 8,
129
+ half_t,
130
+ layout::ColumnMajor,
131
+ half_t,
132
+ layout::RowMajor,
133
+ half_t,
134
+ layout::RowMajor,
135
+ OpMultiplyAdd> {
136
+
137
+ using Shape = gemm::GemmShape<8, 8, 4>;
138
+
139
+ using ElementA = half_t;
140
+ using LayoutA = layout::ColumnMajor;
141
+ using FragmentA = Array<half_t, 4>;
142
+
143
+ using ElementB = half_t;
144
+ using LayoutB = layout::RowMajor;
145
+ using FragmentB = Array<half_t, 4>;
146
+
147
+ using ElementC = half_t;
148
+ using LayoutC = layout::RowMajor;
149
+ using FragmentC = Array<half_t, 8>;
150
+
151
+ using Operator = OpMultiplyAdd;
152
+ using ArchTag = arch::Sm70;
153
+
154
+ CUTLASS_HOST_DEVICE
155
+ void operator()(
156
+ FragmentC &d,
157
+ FragmentA const &a,
158
+ FragmentB const &b,
159
+ FragmentC const &c
160
+ ) {
161
+
162
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
163
+
164
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
165
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
166
+ unsigned const *C = reinterpret_cast<unsigned const *>(&c);
167
+ unsigned *D = reinterpret_cast<unsigned *>(&d);
168
+
169
+ asm volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
170
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
171
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
172
+ );
173
+
174
+ #else
175
+ assert(0);
176
+ #if defined(__CUDA_ARCH__)
177
+ asm volatile ("brkpt;\n" ::);
178
+ #endif
179
+ #endif
180
+ }
181
+ };
182
+
183
+ /// Matrix multiply-add operation: F16 = F16 * F16 + F16
184
+ template <>
185
+ struct Mma<
186
+ gemm::GemmShape<8, 8, 4>,
187
+ 8,
188
+ half_t,
189
+ layout::RowMajor,
190
+ half_t,
191
+ layout::ColumnMajor,
192
+ half_t,
193
+ layout::RowMajor,
194
+ OpMultiplyAdd> {
195
+
196
+ using Shape = gemm::GemmShape<8, 8, 4>;
197
+
198
+ using ElementA = half_t;
199
+ using LayoutA = layout::RowMajor;
200
+ using FragmentA = Array<half_t, 4>;
201
+
202
+ using ElementB = half_t;
203
+ using LayoutB = layout::ColumnMajor;
204
+ using FragmentB = Array<half_t, 4>;
205
+
206
+ using ElementC = half_t;
207
+ using LayoutC = layout::RowMajor;
208
+ using FragmentC = Array<half_t, 8>;
209
+
210
+ using Operator = OpMultiplyAdd;
211
+ using ArchTag = arch::Sm70;
212
+
213
+ CUTLASS_HOST_DEVICE
214
+ void operator()(
215
+ FragmentC &d,
216
+ FragmentA const &a,
217
+ FragmentB const &b,
218
+ FragmentC const &c
219
+ ) {
220
+
221
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
222
+
223
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
224
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
225
+ unsigned const *C = reinterpret_cast<unsigned const *>(&c);
226
+ unsigned *D = reinterpret_cast<unsigned *>(&d);
227
+
228
+ asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
229
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
230
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
231
+ );
232
+
233
+ #else
234
+ assert(0);
235
+ #if defined(__CUDA_ARCH__)
236
+ asm volatile ("brkpt;\n" ::);
237
+ #endif
238
+ #endif
239
+ }
240
+ };
241
+
242
+ /// Matrix multiply-add operation: F16 = F16 * F16 + F16
243
+ template <>
244
+ struct Mma<
245
+ gemm::GemmShape<8, 8, 4>,
246
+ 8,
247
+ half_t,
248
+ layout::RowMajor,
249
+ half_t,
250
+ layout::RowMajor,
251
+ half_t,
252
+ layout::RowMajor,
253
+ OpMultiplyAdd> {
254
+
255
+ using Shape = gemm::GemmShape<8, 8, 4>;
256
+
257
+ using ElementA = half_t;
258
+ using LayoutA = layout::RowMajor;
259
+ using FragmentA = Array<half_t, 4>;
260
+
261
+ using ElementB = half_t;
262
+ using LayoutB = layout::RowMajor;
263
+ using FragmentB = Array<half_t, 4>;
264
+
265
+ using ElementC = half_t;
266
+ using LayoutC = layout::RowMajor;
267
+ using FragmentC = Array<half_t, 8>;
268
+
269
+ using Operator = OpMultiplyAdd;
270
+ using ArchTag = arch::Sm70;
271
+
272
+ CUTLASS_HOST_DEVICE
273
+ void operator()(
274
+ FragmentC &d,
275
+ FragmentA const &a,
276
+ FragmentB const &b,
277
+ FragmentC const &c
278
+ ) {
279
+
280
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
281
+
282
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
283
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
284
+ unsigned const *C = reinterpret_cast<unsigned const *>(&c);
285
+ unsigned *D = reinterpret_cast<unsigned *>(&d);
286
+
287
+ asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n"
288
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
289
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])
290
+ );
291
+
292
+ #else
293
+ assert(0);
294
+ #if defined(__CUDA_ARCH__)
295
+ asm volatile ("brkpt;\n" ::);
296
+ #endif
297
+ #endif
298
+ }
299
+ };
300
+
301
+ /////////////////////////////////////////////////////////////////////////////////////////////////
302
+ //
303
+ // Matrix multiply accumulate 884 - FP32 accumulation
304
+ //
305
+ /////////////////////////////////////////////////////////////////////////////////////////////////
306
+
307
+ /// Matrix multiply-add operation: F32 = F16 * F16 + F32
308
+ template <>
309
+ struct Mma<
310
+ gemm::GemmShape<8, 8, 4>,
311
+ 8,
312
+ half_t,
313
+ layout::ColumnMajor,
314
+ half_t,
315
+ layout::ColumnMajor,
316
+ float,
317
+ layout::RowMajor,
318
+ OpMultiplyAdd> {
319
+
320
+ using Shape = gemm::GemmShape<8, 8, 4>;
321
+
322
+ using ElementA = half_t;
323
+ using LayoutA = layout::ColumnMajor;
324
+ using FragmentA = Array<half_t, 4>;
325
+
326
+ using ElementB = half_t;
327
+ using LayoutB = layout::ColumnMajor;
328
+ using FragmentB = Array<half_t, 4>;
329
+
330
+ using ElementC = float;
331
+ using LayoutC = layout::RowMajor;
332
+ using FragmentC = Array<float, 8>;
333
+
334
+ using Operator = OpMultiplyAdd;
335
+ using ArchTag = arch::Sm70;
336
+
337
+ /// Multiply-add
338
+ CUTLASS_HOST_DEVICE
339
+ void operator()(
340
+ FragmentC &d,
341
+ FragmentA const &a,
342
+ FragmentB const &b,
343
+ FragmentC const &c
344
+ ) {
345
+
346
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
347
+
348
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
349
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
350
+ float const *C = reinterpret_cast<float const *>(&c);
351
+ float *D = reinterpret_cast<float *>(&d);
352
+
353
+ asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
354
+ "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
355
+ : "=f"(D[0]),
356
+ "=f"(D[1]),
357
+ "=f"(D[2]),
358
+ "=f"(D[3]),
359
+ "=f"(D[4]),
360
+ "=f"(D[5]),
361
+ "=f"(D[6]),
362
+ "=f"(D[7])
363
+ : "r"(A[0]),
364
+ "r"(A[1]),
365
+ "r"(B[0]),
366
+ "r"(B[1]),
367
+ "f"(C[0]),
368
+ "f"(C[1]),
369
+ "f"(C[2]),
370
+ "f"(C[3]),
371
+ "f"(C[4]),
372
+ "f"(C[5]),
373
+ "f"(C[6]),
374
+ "f"(C[7])
375
+ );
376
+
377
+ #else
378
+ assert(0);
379
+ #if defined(__CUDA_ARCH__)
380
+ asm volatile ("brkpt;\n" ::);
381
+ #endif
382
+ #endif
383
+ }
384
+ };
385
+
386
+ /// Matrix multiply-add operation: F32 = F16 * F16 + F32
387
+ template <>
388
+ struct Mma<
389
+ gemm::GemmShape<8, 8, 4>,
390
+ 8,
391
+ half_t,
392
+ layout::ColumnMajor,
393
+ half_t,
394
+ layout::RowMajor,
395
+ float,
396
+ layout::RowMajor,
397
+ OpMultiplyAdd> {
398
+
399
+ using Shape = gemm::GemmShape<8, 8, 4>;
400
+
401
+ using ElementA = half_t;
402
+ using LayoutA = layout::ColumnMajor;
403
+ using FragmentA = Array<half_t, 4>;
404
+
405
+ using ElementB = half_t;
406
+ using LayoutB = layout::RowMajor;
407
+ using FragmentB = Array<half_t, 4>;
408
+
409
+ using ElementC = float;
410
+ using LayoutC = layout::RowMajor;
411
+ using FragmentC = Array<float, 8>;
412
+
413
+ using Operator = OpMultiplyAdd;
414
+ using ArchTag = arch::Sm70;
415
+
416
+ /// Multiply-add
417
+ CUTLASS_HOST_DEVICE
418
+ void operator()(
419
+ FragmentC &d,
420
+ FragmentA const &a,
421
+ FragmentB const &b,
422
+ FragmentC const &c
423
+ ) {
424
+
425
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
426
+
427
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
428
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
429
+ float const *C = reinterpret_cast<float const *>(&c);
430
+ float *D = reinterpret_cast<float *>(&d);
431
+
432
+ asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
433
+ "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
434
+ : "=f"(D[0]),
435
+ "=f"(D[1]),
436
+ "=f"(D[2]),
437
+ "=f"(D[3]),
438
+ "=f"(D[4]),
439
+ "=f"(D[5]),
440
+ "=f"(D[6]),
441
+ "=f"(D[7])
442
+ : "r"(A[0]),
443
+ "r"(A[1]),
444
+ "r"(B[0]),
445
+ "r"(B[1]),
446
+ "f"(C[0]),
447
+ "f"(C[1]),
448
+ "f"(C[2]),
449
+ "f"(C[3]),
450
+ "f"(C[4]),
451
+ "f"(C[5]),
452
+ "f"(C[6]),
453
+ "f"(C[7])
454
+ );
455
+
456
+ #else
457
+ assert(0);
458
+ #if defined(__CUDA_ARCH__)
459
+ asm volatile ("brkpt;\n" ::);
460
+ #endif
461
+ #endif
462
+ }
463
+ };
464
+
465
+ /// Matrix multiply-add operation: F32 = F16 * F16 + F32
466
+ template <>
467
+ struct Mma<
468
+ gemm::GemmShape<8, 8, 4>,
469
+ 8,
470
+ half_t,
471
+ layout::RowMajor,
472
+ half_t,
473
+ layout::ColumnMajor,
474
+ float,
475
+ layout::RowMajor,
476
+ OpMultiplyAdd> {
477
+
478
+ using Shape = gemm::GemmShape<8, 8, 4>;
479
+
480
+ using ElementA = half_t;
481
+ using LayoutA = layout::RowMajor;
482
+ using FragmentA = Array<half_t, 4>;
483
+
484
+ using ElementB = half_t;
485
+ using LayoutB = layout::ColumnMajor;
486
+ using FragmentB = Array<half_t, 4>;
487
+
488
+ using ElementC = float;
489
+ using LayoutC = layout::RowMajor;
490
+ using FragmentC = Array<float, 8>;
491
+
492
+ using Operator = OpMultiplyAdd;
493
+ using ArchTag = arch::Sm70;
494
+
495
+ /// Multiply-add
496
+ CUTLASS_HOST_DEVICE
497
+ void operator()(
498
+ FragmentC &d,
499
+ FragmentA const &a,
500
+ FragmentB const &b,
501
+ FragmentC const &c
502
+ ) {
503
+
504
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
505
+
506
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
507
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
508
+ float const *C = reinterpret_cast<float const *>(&c);
509
+ float *D = reinterpret_cast<float *>(&d);
510
+
511
+ asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
512
+ "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
513
+ : "=f"(D[0]),
514
+ "=f"(D[1]),
515
+ "=f"(D[2]),
516
+ "=f"(D[3]),
517
+ "=f"(D[4]),
518
+ "=f"(D[5]),
519
+ "=f"(D[6]),
520
+ "=f"(D[7])
521
+ : "r"(A[0]),
522
+ "r"(A[1]),
523
+ "r"(B[0]),
524
+ "r"(B[1]),
525
+ "f"(C[0]),
526
+ "f"(C[1]),
527
+ "f"(C[2]),
528
+ "f"(C[3]),
529
+ "f"(C[4]),
530
+ "f"(C[5]),
531
+ "f"(C[6]),
532
+ "f"(C[7])
533
+ );
534
+
535
+ #else
536
+ assert(0);
537
+ #if defined(__CUDA_ARCH__)
538
+ asm volatile ("brkpt;\n" ::);
539
+ #endif
540
+ #endif
541
+ }
542
+ };
543
+
544
+ /// Matrix multiply-add operation: F32 = F16 * F16 + F32
545
+ template <>
546
+ struct Mma<
547
+ gemm::GemmShape<8, 8, 4>,
548
+ 8,
549
+ half_t,
550
+ layout::RowMajor,
551
+ half_t,
552
+ layout::RowMajor,
553
+ float,
554
+ layout::RowMajor,
555
+ OpMultiplyAdd> {
556
+
557
+ using Shape = gemm::GemmShape<8, 8, 4>;
558
+
559
+ using ElementA = half_t;
560
+ using LayoutA = layout::RowMajor;
561
+ using FragmentA = Array<half_t, 4>;
562
+
563
+ using ElementB = half_t;
564
+ using LayoutB = layout::RowMajor;
565
+ using FragmentB = Array<half_t, 4>;
566
+
567
+ using ElementC = float;
568
+ using LayoutC = layout::RowMajor;
569
+ using FragmentC = Array<float, 8>;
570
+
571
+ using Operator = OpMultiplyAdd;
572
+ using ArchTag = arch::Sm70;
573
+
574
+ /// Multiply-add
575
+ CUTLASS_HOST_DEVICE
576
+ void operator()(
577
+ FragmentC &d,
578
+ FragmentA const &a,
579
+ FragmentB const &b,
580
+ FragmentC const &c
581
+ ) {
582
+
583
+ #if defined(CUTLASS_ARCH_MMA_SM70_ENABLED)
584
+
585
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
586
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
587
+ float const *C = reinterpret_cast<float const *>(&c);
588
+ float *D = reinterpret_cast<float *>(&d);
589
+
590
+ asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, "
591
+ "{%12,%13,%14,%15,%16,%17,%18,%19};\n"
592
+ : "=f"(D[0]),
593
+ "=f"(D[1]),
594
+ "=f"(D[2]),
595
+ "=f"(D[3]),
596
+ "=f"(D[4]),
597
+ "=f"(D[5]),
598
+ "=f"(D[6]),
599
+ "=f"(D[7])
600
+ : "r"(A[0]),
601
+ "r"(A[1]),
602
+ "r"(B[0]),
603
+ "r"(B[1]),
604
+ "f"(C[0]),
605
+ "f"(C[1]),
606
+ "f"(C[2]),
607
+ "f"(C[3]),
608
+ "f"(C[4]),
609
+ "f"(C[5]),
610
+ "f"(C[6]),
611
+ "f"(C[7])
612
+ );
613
+
614
+ #else
615
+ assert(0);
616
+ #if defined(__CUDA_ARCH__)
617
+ asm volatile ("brkpt;\n" ::);
618
+ #endif
619
+ #endif
620
+ }
621
+ };
622
+
623
+ /////////////////////////////////////////////////////////////////////////////////////////////////
624
+
625
+ /// Matrix multiply-add operation specialized for the entire warp
626
+ template <
627
+ typename LayoutA,
628
+ typename LayoutB,
629
+ typename ElementC,
630
+ typename LayoutC,
631
+ typename Operator
632
+ >
633
+ struct Mma<
634
+ gemm::GemmShape<16, 16, 4>,
635
+ 32,
636
+ half_t,
637
+ LayoutA,
638
+ half_t,
639
+ LayoutB,
640
+ ElementC,
641
+ LayoutC,
642
+ Operator
643
+ > :
644
+ public Mma<
645
+ gemm::GemmShape<8, 8, 4>,
646
+ 8,
647
+ half_t,
648
+ LayoutA,
649
+ half_t,
650
+ LayoutB,
651
+ ElementC,
652
+ LayoutC,
653
+ Operator> {
654
+
655
+ using Shape = gemm::GemmShape<16, 16, 4>;
656
+ };
657
+
658
+ /////////////////////////////////////////////////////////////////////////////////////////////////
659
+
660
+ } // namespace arch
661
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm75.h ADDED
@@ -0,0 +1,789 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply for SM75
33
+ */
34
+
35
+ #pragma once
36
+ #include "cutlass/cutlass.h"
37
+ #include CUDA_STD_HEADER(cassert)
38
+
39
+ #include "cutlass/arch/wmma.h"
40
+
41
+ #if defined(CUTLASS_ARCH_WMMA_ENABLED)
42
+ // CUDA Toolkit includes for nvcuda::wmma needed for binarized matrix multiply.
43
+ #include <mma.h>
44
+ #include "cutlass/wmma_array.h"
45
+ #endif
46
+
47
+ // CUTLASS includes
48
+ #include "cutlass/arch/mma.h"
49
+ #include "cutlass/layout/matrix.h"
50
+ #include "cutlass/numeric_types.h"
51
+
52
+ ////////////////////////////////////////////////////////////////////////////////
53
+
54
+ #if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
55
+
56
+ #define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1
57
+
58
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
59
+ #define CUTLASS_ARCH_MMA_SM75_ENABLED
60
+ #endif
61
+ #endif
62
+
63
+ ////////////////////////////////////////////////////////////////////////////////
64
+
65
+ namespace cutlass {
66
+ namespace arch {
67
+
68
+ ////////////////////////////////////////////////////////////////////////////////
69
+ //
70
+ // Matrix Multiply 1688 - FP16 accumulation
71
+ //
72
+ ////////////////////////////////////////////////////////////////////////////////
73
+
74
+ /// Matrix multiply-add operation - F16 = F16 * F16 + F16
75
+ template <>
76
+ struct Mma<
77
+ gemm::GemmShape<16, 8, 8>,
78
+ 32,
79
+ half_t,
80
+ layout::RowMajor,
81
+ half_t,
82
+ layout::ColumnMajor,
83
+ half_t,
84
+ layout::RowMajor,
85
+ OpMultiplyAdd> {
86
+
87
+ using Shape = gemm::GemmShape<16, 8, 8>;
88
+
89
+ using ElementA = half_t;
90
+ using LayoutA = layout::RowMajor;
91
+ using FragmentA = Array<half_t, 4>;
92
+
93
+ using ElementB = half_t;
94
+ using LayoutB = layout::ColumnMajor;
95
+ using FragmentB = Array<half_t, 2>;
96
+
97
+ using ElementC = half_t;
98
+ using LayoutC = layout::RowMajor;
99
+ using FragmentC = Array<half_t, 4>;
100
+
101
+ using Operator = OpMultiplyAdd;
102
+ using ArchTag = arch::Sm75;
103
+
104
+ CUTLASS_HOST_DEVICE
105
+ void operator()(
106
+ FragmentC &d,
107
+ FragmentA const &a,
108
+ FragmentB const &b,
109
+ FragmentC const &c
110
+ ) const {
111
+
112
+ #if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
113
+
114
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
115
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
116
+ unsigned const *C = reinterpret_cast<unsigned const *>(&c);
117
+ unsigned *D = reinterpret_cast<unsigned *>(&d);
118
+
119
+ asm volatile(
120
+ "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
121
+ : "=r"(D[0]), "=r"(D[1])
122
+ : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1]));
123
+
124
+ #else
125
+ CUTLASS_UNUSED(a);
126
+ CUTLASS_UNUSED(b);
127
+ CUTLASS_UNUSED(c);
128
+ CUTLASS_UNUSED(d);
129
+ CUTLASS_NOT_IMPLEMENTED();
130
+ #endif
131
+ }
132
+ };
133
+
134
+ ////////////////////////////////////////////////////////////////////////////////
135
+ //
136
+ // Matrix Multiply 1688 - FP32 accumulation
137
+ //
138
+ ////////////////////////////////////////////////////////////////////////////////
139
+
140
+ /// Matrix multiply-add operation: F32 = F16 * F16 + F32
141
+ template <>
142
+ struct Mma<
143
+ gemm::GemmShape<16, 8, 8>,
144
+ 32,
145
+ half_t,
146
+ layout::RowMajor,
147
+ half_t,
148
+ layout::ColumnMajor,
149
+ float,
150
+ layout::RowMajor,
151
+ OpMultiplyAdd> {
152
+
153
+ using Shape = gemm::GemmShape<16, 8, 8>;
154
+
155
+ using ElementA = half_t;
156
+ using LayoutA = layout::RowMajor;
157
+ using FragmentA = Array<half_t, 4>;
158
+
159
+ using ElementB = half_t;
160
+ using LayoutB = layout::ColumnMajor;
161
+ using FragmentB = Array<half_t, 2>;
162
+
163
+ using ElementC = float;
164
+ using LayoutC = layout::RowMajor;
165
+ using FragmentC = Array<float, 4>;
166
+
167
+ using Operator = OpMultiplyAdd;
168
+ using ArchTag = arch::Sm75;
169
+
170
+ /// Computes multiply-add
171
+ CUTLASS_HOST_DEVICE
172
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
173
+ FragmentC const &c) const {
174
+
175
+ #if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
176
+
177
+ unsigned const *A = reinterpret_cast<unsigned const *>(&a);
178
+ unsigned const *B = reinterpret_cast<unsigned const *>(&b);
179
+ float const *C = reinterpret_cast<float const *>(&c);
180
+ float *D = reinterpret_cast<float *>(&d);
181
+
182
+ asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
183
+ : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
184
+ :
185
+ "r"(A[0]), "r"(A[1]),
186
+ "r"(B[0]),
187
+ "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
188
+ );
189
+
190
+ #else
191
+ CUTLASS_UNUSED(a);
192
+ CUTLASS_UNUSED(b);
193
+ CUTLASS_UNUSED(c);
194
+ CUTLASS_UNUSED(d);
195
+ CUTLASS_NOT_IMPLEMENTED();
196
+ #endif
197
+ }
198
+ };
199
+
200
+ ////////////////////////////////////////////////////////////////////////////////
201
+ //
202
+ // Integer matrix multiply (8b) with SATURATE
203
+ //
204
+ ////////////////////////////////////////////////////////////////////////////////
205
+
206
+ /// Matrix multiply-add operation: S32 = S8 * S8 + S32
207
+ template <>
208
+ struct Mma<
209
+ gemm::GemmShape<8, 8, 16>,
210
+ 32,
211
+ int8_t,
212
+ layout::RowMajor,
213
+ int8_t,
214
+ layout::ColumnMajor,
215
+ int,
216
+ layout::RowMajor,
217
+ OpMultiplyAddSaturate> {
218
+
219
+ using Shape = gemm::GemmShape<8, 8, 16>;
220
+
221
+ using ElementA = int8_t;
222
+ using LayoutA = layout::RowMajor;
223
+ using FragmentA = Array<int8_t, 4>;
224
+
225
+ using ElementB = int8_t;
226
+ using LayoutB = layout::ColumnMajor;
227
+ using FragmentB = Array<int8_t, 4>;
228
+
229
+ using ElementC = int;
230
+ using LayoutC = layout::RowMajor;
231
+ using FragmentC = Array<int, 2>;
232
+
233
+ using Operator = OpMultiplyAddSaturate;
234
+ using ArchTag = arch::Sm75;
235
+
236
+ /// Computes multiply-add
237
+ CUTLASS_HOST_DEVICE
238
+ void operator()(
239
+ FragmentC &d,
240
+ FragmentA const &a,
241
+ FragmentB const &b,
242
+ FragmentC const &c
243
+ ) const {
244
+
245
+ #if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
246
+
247
+ unsigned const & A = reinterpret_cast<unsigned const &>(a);
248
+ unsigned const & B = reinterpret_cast<unsigned const &>(b);
249
+
250
+ int const *C = reinterpret_cast<int const *>(&c);
251
+ int *D = reinterpret_cast<int *>(&d);
252
+
253
+ asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
254
+ : "=r"(D[0]), "=r"(D[1])
255
+ : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
256
+ #else
257
+ CUTLASS_UNUSED(a);
258
+ CUTLASS_UNUSED(b);
259
+ CUTLASS_UNUSED(c);
260
+ CUTLASS_UNUSED(d);
261
+ CUTLASS_NOT_IMPLEMENTED();
262
+ #endif
263
+ }
264
+ };
265
+
266
/// Matrix multiply-add operation: S32 = U8 * S8 + S32
///
/// Warp-synchronous 8-by-8-by-16 tensor core instruction (Turing / SM75) with
/// saturating ("satfinite") signed 32-bit accumulation; unsigned 8-bit A,
/// signed 8-bit B.
template <>
struct Mma<
  gemm::GemmShape<8, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8, 8, 16>;

  // A operand: four unsigned 8-bit values per thread, row-major
  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 4>;

  // B operand: four signed 8-bit values per thread, column-major
  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 4>;

  // C/D accumulators: two signed 32-bit values per thread
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm75;

  /// Computes multiply-add: d = saturate(a * b + c)
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

  // Each operand fragment packs its four 8-bit values into one 32-bit register.
  unsigned const & A = reinterpret_cast<unsigned const &>(a);
  unsigned const & B = reinterpret_cast<unsigned const &>(b);

  int const *C = reinterpret_cast<int const *>(&c);
  int *D = reinterpret_cast<int *>(&d);

  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
#else
  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_UNUSED(d);
  CUTLASS_NOT_IMPLEMENTED();
#endif
  }
};
325
+
326
/// Matrix multiply-add operation: S32 = S8 * U8 + S32
///
/// Warp-synchronous 8-by-8-by-16 tensor core instruction (Turing / SM75) with
/// saturating ("satfinite") signed 32-bit accumulation; signed 8-bit A,
/// unsigned 8-bit B.
template <>
struct Mma<
  gemm::GemmShape<8, 8, 16>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8, 8, 16>;

  // A operand: four signed 8-bit values per thread, row-major
  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 4>;

  // B operand: four unsigned 8-bit values per thread, column-major
  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  // C/D accumulators: two signed 32-bit values per thread
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm75;

  /// Computes multiply-add: d = saturate(a * b + c)
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

  // Each operand fragment packs its four 8-bit values into one 32-bit register.
  unsigned const & A = reinterpret_cast<unsigned const &>(a);
  unsigned const & B = reinterpret_cast<unsigned const &>(b);

  int const *C = reinterpret_cast<int const *>(&c);
  int *D = reinterpret_cast<int *>(&d);

  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
#else
  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_UNUSED(d);
  CUTLASS_NOT_IMPLEMENTED();
#endif
  }
};
385
+
386
/// Matrix multiply-add operation: S32 = U8 * U8 + S32
///
/// Warp-synchronous 8-by-8-by-16 tensor core instruction (Turing / SM75) with
/// saturating ("satfinite") signed 32-bit accumulation; both operands are
/// unsigned 8-bit.
template <>
struct Mma<
  gemm::GemmShape<8, 8, 16>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8, 8, 16>;

  // A operand: four unsigned 8-bit values per thread, row-major
  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 4>;

  // B operand: four unsigned 8-bit values per thread, column-major
  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 4>;

  // C/D accumulators: two signed 32-bit values per thread
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm75;

  /// Computes multiply-add: d = saturate(a * b + c)
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

  // Each operand fragment packs its four 8-bit values into one 32-bit register.
  unsigned const & A = reinterpret_cast<unsigned const &>(a);
  unsigned const & B = reinterpret_cast<unsigned const &>(b);

  int const *C = reinterpret_cast<int const *>(&c);
  int *D = reinterpret_cast<int *>(&d);

  asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
#else
  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_UNUSED(d);
  CUTLASS_NOT_IMPLEMENTED();
#endif
  }
};
445
+
446
+ ////////////////////////////////////////////////////////////////////////////////
447
+ //
448
+ // Integer matrix multiply (4b) - SATURATE
449
+ //
450
+ ////////////////////////////////////////////////////////////////////////////////
451
+
452
/// Matrix multiply-add operation: S32 = S4 * S4 + S32
///
/// Warp-synchronous 8-by-8-by-32 tensor core instruction (Turing / SM75) with
/// saturating ("satfinite") signed 32-bit accumulation; both operands are
/// signed 4-bit.
template <>
struct Mma<
  gemm::GemmShape<8, 8, 32>,
  32,
  int4b_t,
  layout::RowMajor,
  int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8, 8, 32>;

  // A operand: eight signed 4-bit values per thread, row-major
  using ElementA = int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int4b_t, 8>;

  // B operand: eight signed 4-bit values per thread, column-major
  using ElementB = int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int4b_t, 8>;

  // C/D accumulators: two signed 32-bit values per thread
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm75;

  /// Computes multiply-add: d = saturate(a * b + c)
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

  // Each operand fragment packs its eight 4-bit values into one 32-bit register.
  unsigned const & A = reinterpret_cast<unsigned const &>(a);
  unsigned const & B = reinterpret_cast<unsigned const &>(b);

  int const *C = reinterpret_cast<int const *>(&c);
  int *D = reinterpret_cast<int *>(&d);

  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
#else
  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_UNUSED(d);
  CUTLASS_NOT_IMPLEMENTED();
#endif
  }
};
511
+
512
/// Matrix multiply-add operation: S32 = U4 * S4 + S32
///
/// Warp-synchronous 8-by-8-by-32 tensor core instruction (Turing / SM75) with
/// saturating ("satfinite") signed 32-bit accumulation; unsigned 4-bit A,
/// signed 4-bit B.
template <>
struct Mma<
  gemm::GemmShape<8, 8, 32>,
  32,
  uint4b_t,
  layout::RowMajor,
  int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8, 8, 32>;

  // A operand: eight unsigned 4-bit values per thread, row-major
  using ElementA = uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint4b_t, 8>;

  // B operand: eight signed 4-bit values per thread, column-major
  using ElementB = int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int4b_t, 8>;

  // C/D accumulators: two signed 32-bit values per thread
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm75;

  /// Computes multiply-add: d = saturate(a * b + c)
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

  // Each operand fragment packs its eight 4-bit values into one 32-bit register.
  unsigned const & A = reinterpret_cast<unsigned const &>(a);
  unsigned const & B = reinterpret_cast<unsigned const &>(b);

  int const *C = reinterpret_cast<int const *>(&c);
  int *D = reinterpret_cast<int *>(&d);

  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
#else
  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_UNUSED(d);
  CUTLASS_NOT_IMPLEMENTED();
#endif
  }
};
571
+
572
/// Matrix multiply-add operation: S32 = S4 * U4 + S32
///
/// Warp-synchronous 8-by-8-by-32 tensor core instruction (Turing / SM75) with
/// saturating ("satfinite") signed 32-bit accumulation; signed 4-bit A,
/// unsigned 4-bit B.
template <>
struct Mma<
  gemm::GemmShape<8, 8, 32>,
  32,
  int4b_t,
  layout::RowMajor,
  uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8, 8, 32>;

  // A operand: eight signed 4-bit values per thread, row-major
  using ElementA = int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int4b_t, 8>;

  // B operand: eight unsigned 4-bit values per thread, column-major
  using ElementB = uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint4b_t, 8>;

  // C/D accumulators: two signed 32-bit values per thread
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm75;

  /// Computes multiply-add: d = saturate(a * b + c)
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

  // Each operand fragment packs its eight 4-bit values into one 32-bit register.
  unsigned const & A = reinterpret_cast<unsigned const &>(a);
  unsigned const & B = reinterpret_cast<unsigned const &>(b);

  int const *C = reinterpret_cast<int const *>(&c);
  int *D = reinterpret_cast<int *>(&d);

  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
#else
  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_UNUSED(d);
  CUTLASS_NOT_IMPLEMENTED();
#endif
  }
};
631
+
632
/// Matrix multiply-add operation: S32 = U4 * U4 + S32
///
/// Warp-synchronous 8-by-8-by-32 tensor core instruction (Turing / SM75) with
/// saturating ("satfinite") signed 32-bit accumulation; both operands are
/// unsigned 4-bit.
template <>
struct Mma<
  gemm::GemmShape<8, 8, 32>,
  32,
  uint4b_t,
  layout::RowMajor,
  uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8, 8, 32>;

  // A operand: eight unsigned 4-bit values per thread, row-major
  using ElementA = uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint4b_t, 8>;

  // B operand: eight unsigned 4-bit values per thread, column-major
  using ElementB = uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint4b_t, 8>;

  // C/D accumulators: two signed 32-bit values per thread
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 2>;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm75;

  /// Computes multiply-add: d = saturate(a * b + c)
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)

  // Each operand fragment packs its eight 4-bit values into one 32-bit register.
  unsigned const & A = reinterpret_cast<unsigned const &>(a);
  unsigned const & B = reinterpret_cast<unsigned const &>(b);

  int const *C = reinterpret_cast<int const *>(&c);
  int *D = reinterpret_cast<int *>(&d);

  asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A), "r"(B), "r"(C[0]), "r"(C[1]));
#else
  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_UNUSED(d);
  CUTLASS_NOT_IMPLEMENTED();
#endif
  }
};
691
+
692
+ ////////////////////////////////////////////////////////////////////////////////
693
+ //
694
+ // b1 ^ b1 + s32 => s32
695
+ //
696
+ ////////////////////////////////////////////////////////////////////////////////
697
+
698
+ /// Matrix multiply-add operation
699
+ template <>
700
+ struct Mma<
701
+ gemm::GemmShape<8,8,128>,
702
+ 32,
703
+ uint1b_t,
704
+ layout::RowMajor,
705
+ uint1b_t,
706
+ layout::ColumnMajor,
707
+ int,
708
+ layout::RowMajor,
709
+ OpXorPopc> {
710
+
711
+ using Shape = gemm::GemmShape<8,8,128>;
712
+
713
+ using ElementA = uint1b_t;
714
+ using LayoutA = layout::RowMajor;
715
+ using FragmentA = Array<uint1b_t, 32>;
716
+
717
+ using ElementB = uint1b_t;
718
+ using LayoutB = layout::ColumnMajor;
719
+ using FragmentB = Array<uint1b_t, 32>;
720
+
721
+ using ElementC = int;
722
+ using LayoutC = layout::RowMajor;
723
+ using FragmentC = Array<int, 2>;
724
+
725
+ using Operator = OpXorPopc;
726
+ using ArchTag = arch::Sm75;
727
+
728
+ /// Computes multiply-add
729
+ CUTLASS_HOST_DEVICE
730
+ void operator()(
731
+ FragmentC &d,
732
+ FragmentA const &a,
733
+ FragmentB const &b,
734
+ FragmentC const &c
735
+ ) const {
736
+
737
+ #if defined(CUTLASS_ARCH_MMA_SM75_ENABLED)
738
+
739
+ #if defined(CUTLASS_ARCH_WMMA_ENABLED)
740
+ using WmmaFragmentA = nvcuda::wmma::fragment<
741
+ nvcuda::wmma::matrix_a,
742
+ Shape::kM,
743
+ Shape::kN,
744
+ Shape::kK,
745
+ nvcuda::wmma::experimental::precision::b1,
746
+ nvcuda::wmma::row_major>;
747
+
748
+ using WmmaFragmentB = nvcuda::wmma::fragment<
749
+ nvcuda::wmma::matrix_b,
750
+ Shape::kM,
751
+ Shape::kN,
752
+ Shape::kK,
753
+ nvcuda::wmma::experimental::precision::b1,
754
+ nvcuda::wmma::col_major>;
755
+
756
+ using WmmaFragmentC = nvcuda::wmma::fragment<
757
+ nvcuda::wmma::accumulator,
758
+ Shape::kM,
759
+ Shape::kN,
760
+ Shape::kK,
761
+ int>;
762
+
763
+ WmmaFragmentA const & A = reinterpret_cast<WmmaFragmentA const &>(a);
764
+ WmmaFragmentB const & B = reinterpret_cast<WmmaFragmentB const &>(b);
765
+
766
+ WmmaFragmentC const & C = reinterpret_cast<WmmaFragmentC const &>(c);
767
+ WmmaFragmentC & D = reinterpret_cast<WmmaFragmentC &>(d);
768
+
769
+ nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
770
+ nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
771
+
772
+ #else
773
+
774
+ CUTLASS_UNUSED(a);
775
+ CUTLASS_UNUSED(b);
776
+ CUTLASS_UNUSED(c);
777
+ CUTLASS_UNUSED(d);
778
+ CUTLASS_NOT_IMPLEMENTED(); // WMMA must be supported to issue binary matrix multiply-accumulate instructions.
779
+
780
+ #endif // defined(CUTLASS_ARCH_WMMA_ENABLED)
781
+
782
+ #endif
783
+ }
784
+ };
785
+
786
+ ////////////////////////////////////////////////////////////////////////////////
787
+
788
+ } // namespace arch
789
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm80.h ADDED
@@ -0,0 +1,1500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+ #include "cutlass/cutlass.h"
37
+ #include CUDA_STD_HEADER(cassert)
38
+
39
+ #include "mma.h"
40
+ #include "cutlass/layout/matrix.h"
41
+ #include "cutlass/numeric_types.h"
42
+
43
+ ////////////////////////////////////////////////////////////////////////////////
44
+
45
+ #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
46
+
47
+ #define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1
48
+
49
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
50
+ #define CUTLASS_ARCH_MMA_SM80_ENABLED
51
+
52
+ #if (__CUDA_ARCH__ <= 900)
53
+ #define CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED
54
+ #endif
55
+ #if (__CUDA_ARCH__ <= 890)
56
+ #define CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED
57
+ #endif
58
+
59
+ #endif
60
+
61
+ #endif
62
+
63
+ ////////////////////////////////////////////////////////////////////////////////
64
+
65
+ namespace cutlass {
66
+ namespace arch {
67
+
68
+ ////////////////////////////////////////////////////////////////////////////////
69
+ //
70
+ // Matrix Multiply 1688 - Float BF16, FP32 accumulation
71
+ //
72
+ ////////////////////////////////////////////////////////////////////////////////
73
+
74
+ /// Matrix multiply-add operation - F32 = bf16 * bf16 + F32
75
+ template <>
76
+ struct Mma<
77
+ gemm::GemmShape<16, 8, 8>,
78
+ 32,
79
+ bfloat16_t,
80
+ layout::RowMajor,
81
+ bfloat16_t,
82
+ layout::ColumnMajor,
83
+ float,
84
+ layout::RowMajor,
85
+ OpMultiplyAdd> {
86
+
87
+ using Shape = gemm::GemmShape<16, 8, 8>;
88
+
89
+ using ElementA = bfloat16_t;
90
+ using LayoutA = layout::RowMajor;
91
+ using FragmentA = Array<bfloat16_t, 4>;
92
+
93
+ using ElementB = bfloat16_t;
94
+ using LayoutB = layout::ColumnMajor;
95
+ using FragmentB = Array<bfloat16_t, 2>;
96
+
97
+ using ElementC = float;
98
+ using LayoutC = layout::RowMajor;
99
+ using FragmentC = Array<float, 4>;
100
+
101
+ using Operator = OpMultiplyAdd;
102
+ using ArchTag = arch::Sm80;
103
+
104
+ CUTLASS_HOST_DEVICE
105
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
106
+ FragmentC const &c) const {
107
+
108
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
109
+
110
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
111
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
112
+ float const *C = reinterpret_cast<float const *>(&c);
113
+ float *D = reinterpret_cast<float *>(&d);
114
+
115
+ asm(
116
+ "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
117
+ "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
118
+ : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
119
+ :
120
+ "r"(A[0]), "r"(A[1]),
121
+ "r"(B[0]),
122
+ "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
123
+ );
124
+
125
+ #else
126
+
127
+ CUTLASS_UNUSED(d);
128
+ CUTLASS_UNUSED(a);
129
+ CUTLASS_UNUSED(b);
130
+ CUTLASS_UNUSED(c);
131
+ CUTLASS_NOT_IMPLEMENTED();
132
+
133
+ #endif
134
+ }
135
+ };
136
+
137
+ ////////////////////////////////////////////////////////////////////////////////
138
+ //
139
+ // Matrix Multiply 1684 - Float TF32
140
+ //
141
+ ////////////////////////////////////////////////////////////////////////////////
142
+
143
/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
///
/// Warp-synchronous 16-by-8-by-4 tensor core instruction (Ampere / SM80) with
/// TensorFloat-32 operands and single-precision accumulation.
template <>
struct Mma<
  gemm::GemmShape<16, 8, 4>,
  32,
  tfloat32_t,
  layout::RowMajor,
  tfloat32_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16, 8, 4>;

  // A operand: two tf32 values per thread, row-major
  using ElementA = tfloat32_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<tfloat32_t, 2>;

  // B operand: one tf32 value per thread, column-major
  using ElementB = tfloat32_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<tfloat32_t, 1>;

  // C/D accumulators: four fp32 values per thread
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add: d = a * b + c
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

  // Each tf32 value occupies one 32-bit register.
  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
  float const *C = reinterpret_cast<float const *>(&c);
  float *D = reinterpret_cast<float *>(&d);

  asm volatile(
      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
      :
        "r"(A[0]), "r"(A[1]),
        "r"(B[0]),
        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
  );

#else

  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();

#endif
  }
};
208
+
209
+ ////////////////////////////////////////////////////////////////////////////////
210
+ //
211
+ // Matrix Multiply 1688 - Float TF32
212
+ //
213
+ ////////////////////////////////////////////////////////////////////////////////
214
+
215
/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
///
/// Warp-synchronous 16-by-8-by-8 tensor core instruction (Ampere / SM80) with
/// TensorFloat-32 operands and single-precision accumulation.
template <>
struct Mma<gemm::GemmShape<16, 8, 8>, 32, tfloat32_t, layout::RowMajor,
           tfloat32_t, layout::ColumnMajor, float, layout::RowMajor,
           OpMultiplyAdd> {
  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16, 8, 8>;

  // A operand: four tf32 values per thread, row-major
  using ElementA = tfloat32_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<tfloat32_t, 4>;

  // B operand: two tf32 values per thread, column-major
  using ElementB = tfloat32_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<tfloat32_t, 2>;

  // C/D accumulators: four fp32 values per thread
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add: d = a * b + c
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

  // Each tf32 value occupies one 32-bit register.
  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
  float const *C = reinterpret_cast<float const *>(&c);
  float *D = reinterpret_cast<float *>(&d);

  asm volatile(
      "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 "
      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));

#else

  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();

#endif
  }
};
266
+
267
+ ////////////////////////////////////////////////////////////////////////////////
268
+ //
269
+ // Matrix Multiply 16816
270
+ //
271
+ ////////////////////////////////////////////////////////////////////////////////
272
+
273
/// Matrix multiply-add operation: F16 = F16 * F16 + F16
///
/// Warp-synchronous 16-by-8-by-16 tensor core instruction (Ampere / SM80) with
/// half-precision operands and half-precision accumulation.
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::RowMajor,
  OpMultiplyAdd> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16, 8, 16>;

  // A operand: eight fp16 values per thread, row-major
  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 8>;

  // B operand: four fp16 values per thread, column-major
  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 4>;

  // C/D accumulators: four fp16 values per thread
  using ElementC = half_t;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<half_t, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add: d = a * b + c
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

  // Pairs of fp16 values are packed into 32-bit registers; accumulators too.
  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
  uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
  uint32_t *D = reinterpret_cast<uint32_t *>(&d);

  asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
      : "=r"(D[0]), "=r"(D[1])
      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
        "r"(B[0]), "r"(B[1]),
        "r"(C[0]), "r"(C[1])
  );

#else

  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();

#endif
  }
};
333
+
334
+ ////////////////////////////////////////////////////////////////////////////////
335
+
336
/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32
///
/// Warp-synchronous 16-by-8-by-16 tensor core instruction (Ampere / SM80) with
/// bfloat16 operands and single-precision accumulation.
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  bfloat16_t,
  layout::RowMajor,
  bfloat16_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16, 8, 16>;

  // A operand: eight bf16 values per thread, row-major
  using ElementA = bfloat16_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<bfloat16_t, 8>;

  // B operand: four bf16 values per thread, column-major
  using ElementB = bfloat16_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<bfloat16_t, 4>;

  // C/D accumulators: four fp32 values per thread
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add: d = a * b + c
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

  // Pairs of bf16 values are packed into 32-bit registers.
  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
  float const *C = reinterpret_cast<float const *>(&c);
  float *D = reinterpret_cast<float *>(&d);

  asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));

#else

  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();

#endif
  }
};
400
+
401
+ ////////////////////////////////////////////////////////////////////////////////
402
+
403
/// Matrix multiply-add operation: F32 = F16 * F16 + F32
///
/// Warp-synchronous 16-by-8-by-16 tensor core instruction (Ampere / SM80) with
/// half-precision operands and single-precision accumulation.
template <>
struct Mma<
  gemm::GemmShape<16, 8, 16>,
  32,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16, 8, 16>;

  // A operand: eight fp16 values per thread, row-major
  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 8>;

  // B operand: four fp16 values per thread, column-major
  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 4>;

  // C/D accumulators: four fp32 values per thread
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  /// Computes multiply-add: d = a * b + c
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,        ///< [out] destination accumulator fragment
    FragmentA const &a,  ///< [in]  A operand fragment
    FragmentB const &b,  ///< [in]  B operand fragment
    FragmentC const &c   ///< [in]  source accumulator fragment
  ) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

  // Pairs of fp16 values are packed into 32-bit registers.
  uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
  uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
  float const *C = reinterpret_cast<float const *>(&c);
  float *D = reinterpret_cast<float *>(&d);

  asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, "
      "{%10,%11,%12,%13};\n"
      : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
      : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
        "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));

#else

  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();

#endif
  }
};
467
+
468
+ ////////////////////////////////////////////////////////////////////////////////
469
+ //
470
+ // Matrix Multiply 884 - F64
471
+ //
472
+ ////////////////////////////////////////////////////////////////////////////////
473
+
474
/// Matrix multiply-add operation: F64 = F64 * F64 + F64
///
/// Warp-synchronous 8-by-8-by-4 double-precision tensor core instruction
/// (Ampere / SM80).
template <>
struct Mma<
  gemm::GemmShape<8,8,4>,
  32,
  double,
  layout::RowMajor,
  double,
  layout::ColumnMajor,
  double,
  layout::RowMajor,
  OpMultiplyAdd> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<8,8,4>;

  // A operand: one double per thread, row-major
  using ElementA = double;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<double, 1>;

  // B operand: one double per thread, column-major
  using ElementB = double;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<double, 1>;

  // C/D accumulators: two doubles per thread
  using ElementC = double;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<double, 2>;

  using Operator = OpMultiplyAdd;

  using ArchTag = arch::Sm80;

  /// Computes multiply-add: d = a * b + c
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c) const {

#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)

  // Single-element fragments: bind each operand directly to a 64-bit register.
  double const & A = reinterpret_cast<double const &>(a);
  double const & B = reinterpret_cast<double const &>(b);

  double const *C = reinterpret_cast<double const *>(&c);
  double *D = reinterpret_cast<double *>(&d);

  asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n"
      : "=d"(D[0]), "=d"(D[1])
      : "d"(A), "d"(B), "d"(C[0]), "d"(C[1]));

#else

  // Target architecture lacks this instruction: silence unused-parameter
  // warnings and abort if ever invoked.
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();

#endif
  }
};
532
+
533
+ ////////////////////////////////////////////////////////////////////////////////
534
+ //
535
+ // Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE
536
+ //
537
+ ////////////////////////////////////////////////////////////////////////////////
538
+
539
+ /// Matrix multiply-add operation: S32 = S8 * S8 + S32
540
+ template <>
541
+ struct Mma<
542
+ gemm::GemmShape<16,8,16>,
543
+ 32,
544
+ int8_t,
545
+ layout::RowMajor,
546
+ int8_t,
547
+ layout::ColumnMajor,
548
+ int,
549
+ layout::RowMajor,
550
+ OpMultiplyAddSaturate> {
551
+
552
+ using Shape = gemm::GemmShape<16,8,16>;
553
+
554
+ using ElementA = int8_t;
555
+ using LayoutA = layout::RowMajor;
556
+ using FragmentA = Array<int8_t, 8>;
557
+
558
+ using ElementB = int8_t;
559
+ using LayoutB = layout::ColumnMajor;
560
+ using FragmentB = Array<int8_t, 4>;
561
+
562
+ using ElementC = int;
563
+ using LayoutC = layout::RowMajor;
564
+ using FragmentC = Array<int, 4>;
565
+
566
+ using Operator = OpMultiplyAddSaturate;
567
+ using ArchTag = arch::Sm80;
568
+
569
+ /// Computes multiply-add
570
+ CUTLASS_HOST_DEVICE
571
+ void operator()(
572
+ FragmentC &d,
573
+ FragmentA const &a,
574
+ FragmentB const &b,
575
+ FragmentC const &c
576
+ ) const {
577
+
578
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
579
+
580
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
581
+ uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
582
+
583
+ int const *C = reinterpret_cast<int const *>(&c);
584
+ int *D = reinterpret_cast<int *>(&d);
585
+
586
+ asm volatile(
587
+ "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
588
+ "{%6}, {%7,%8,%9,%10};\n"
589
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
590
+ : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
591
+ "r"(C[3]));
592
+
593
+ #else
594
+ assert(0);
595
+ #endif
596
+ }
597
+ };
598
+
599
+ /// Matrix multiply-add operation: S32 = U8 * S8 + S32
600
+ template <>
601
+ struct Mma<
602
+ gemm::GemmShape<16,8,16>,
603
+ 32,
604
+ uint8_t,
605
+ layout::RowMajor,
606
+ int8_t,
607
+ layout::ColumnMajor,
608
+ int,
609
+ layout::RowMajor,
610
+ OpMultiplyAddSaturate> {
611
+
612
+ using Shape = gemm::GemmShape<16,8,16>;
613
+
614
+ using ElementA = uint8_t;
615
+ using LayoutA = layout::RowMajor;
616
+ using FragmentA = Array<uint8_t, 8>;
617
+
618
+ using ElementB = int8_t;
619
+ using LayoutB = layout::ColumnMajor;
620
+ using FragmentB = Array<int8_t, 4>;
621
+
622
+ using ElementC = int;
623
+ using LayoutC = layout::RowMajor;
624
+ using FragmentC = Array<int, 4>;
625
+
626
+ using Operator = OpMultiplyAddSaturate;
627
+ using ArchTag = arch::Sm80;
628
+
629
+ /// Computes multiply-add
630
+ CUTLASS_HOST_DEVICE
631
+ void operator()(
632
+ FragmentC &d,
633
+ FragmentA const &a,
634
+ FragmentB const &b,
635
+ FragmentC const &c
636
+ ) const {
637
+
638
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
639
+
640
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
641
+ uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
642
+
643
+ int const *C = reinterpret_cast<int const *>(&c);
644
+ int *D = reinterpret_cast<int *>(&d);
645
+
646
+ asm volatile(
647
+ "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
648
+ "{%6}, {%7,%8,%9,%10};\n"
649
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
650
+ : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
651
+ "r"(C[3]));
652
+
653
+ #else
654
+ assert(0);
655
+ #endif
656
+ }
657
+ };
658
+
659
+ /// Matrix multiply-add operation: S32 = S8 * U8 + S32
660
+ template <>
661
+ struct Mma<
662
+ gemm::GemmShape<16,8,16>,
663
+ 32,
664
+ int8_t,
665
+ layout::RowMajor,
666
+ uint8_t,
667
+ layout::ColumnMajor,
668
+ int,
669
+ layout::RowMajor,
670
+ OpMultiplyAddSaturate> {
671
+
672
+ using Shape = gemm::GemmShape<16,8,16>;
673
+
674
+ using ElementA = int8_t;
675
+ using LayoutA = layout::RowMajor;
676
+ using FragmentA = Array<int8_t, 8>;
677
+
678
+ using ElementB = uint8_t;
679
+ using LayoutB = layout::ColumnMajor;
680
+ using FragmentB = Array<uint8_t, 4>;
681
+
682
+ using ElementC = int;
683
+ using LayoutC = layout::RowMajor;
684
+ using FragmentC = Array<int, 4>;
685
+
686
+ using Operator = OpMultiplyAddSaturate;
687
+ using ArchTag = arch::Sm80;
688
+
689
+ /// Computes multiply-add
690
+ CUTLASS_HOST_DEVICE
691
+ void operator()(
692
+ FragmentC &d,
693
+ FragmentA const &a,
694
+ FragmentB const &b,
695
+ FragmentC const &c
696
+ ) const {
697
+
698
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
699
+
700
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
701
+ uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
702
+
703
+ int const *C = reinterpret_cast<int const *>(&c);
704
+ int *D = reinterpret_cast<int *>(&d);
705
+
706
+ asm volatile(
707
+ "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
708
+ "{%6}, {%7,%8,%9,%10};\n"
709
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
710
+ : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
711
+ "r"(C[3]));
712
+
713
+ #else
714
+ assert(0);
715
+ #endif
716
+ }
717
+ };
718
+
719
+ /// Matrix multiply-add operation: S32 = U8 * U8 + S32
720
+ template <>
721
+ struct Mma<
722
+ gemm::GemmShape<16,8,16>,
723
+ 32,
724
+ uint8_t,
725
+ layout::RowMajor,
726
+ uint8_t,
727
+ layout::ColumnMajor,
728
+ int,
729
+ layout::RowMajor,
730
+ OpMultiplyAddSaturate> {
731
+
732
+ using Shape = gemm::GemmShape<16,8,16>;
733
+
734
+ using ElementA = uint8_t;
735
+ using LayoutA = layout::RowMajor;
736
+ using FragmentA = Array<uint8_t, 8>;
737
+
738
+ using ElementB = uint8_t;
739
+ using LayoutB = layout::ColumnMajor;
740
+ using FragmentB = Array<uint8_t, 4>;
741
+
742
+ using ElementC = int;
743
+ using LayoutC = layout::RowMajor;
744
+ using FragmentC = Array<int, 4>;
745
+
746
+ using Operator = OpMultiplyAddSaturate;
747
+ using ArchTag = arch::Sm80;
748
+
749
+ /// Computes multiply-add
750
+ CUTLASS_HOST_DEVICE
751
+ void operator()(
752
+ FragmentC &d,
753
+ FragmentA const &a,
754
+ FragmentB const &b,
755
+ FragmentC const &c
756
+ ) const {
757
+
758
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
759
+
760
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
761
+ uint32_t const &B = reinterpret_cast<uint32_t const &>(b);
762
+
763
+ int const *C = reinterpret_cast<int const *>(&c);
764
+ int *D = reinterpret_cast<int *>(&d);
765
+
766
+ asm volatile(
767
+ "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, "
768
+ "{%6}, {%7,%8,%9,%10};\n"
769
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
770
+ : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]),
771
+ "r"(C[3]));
772
+
773
+ #else
774
+ assert(0);
775
+ #endif
776
+ }
777
+ };
778
+
779
+ ////////////////////////////////////////////////////////////////////////////////
780
+ //
781
+ // Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE
782
+ //
783
+ ////////////////////////////////////////////////////////////////////////////////
784
+
785
+ /// Matrix multiply-add operation: S32 = S8 * S8 + S32
786
+ template <>
787
+ struct Mma<
788
+ gemm::GemmShape<16,8,32>,
789
+ 32,
790
+ int8_t,
791
+ layout::RowMajor,
792
+ int8_t,
793
+ layout::ColumnMajor,
794
+ int,
795
+ layout::RowMajor,
796
+ OpMultiplyAddSaturate> {
797
+
798
+ using Shape = gemm::GemmShape<16,8,32>;
799
+
800
+ using ElementA = int8_t;
801
+ using LayoutA = layout::RowMajor;
802
+ using FragmentA = Array<int8_t, 16>;
803
+
804
+ using ElementB = int8_t;
805
+ using LayoutB = layout::ColumnMajor;
806
+ using FragmentB = Array<int8_t, 8>;
807
+
808
+ using ElementC = int;
809
+ using LayoutC = layout::RowMajor;
810
+ using FragmentC = Array<int, 4>;
811
+
812
+ using Operator = OpMultiplyAddSaturate;
813
+ using ArchTag = arch::Sm80;
814
+
815
+ /// Computes multiply-add
816
+ CUTLASS_HOST_DEVICE
817
+ void operator()(
818
+ FragmentC &d,
819
+ FragmentA const &a,
820
+ FragmentB const &b,
821
+ FragmentC const &c
822
+ ) const {
823
+
824
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
825
+
826
+ uint32_t const * A = reinterpret_cast<uint32_t const *>(&a);
827
+ uint32_t const * B = reinterpret_cast<uint32_t const *>(&b);
828
+
829
+ int const *C = reinterpret_cast<int const *>(&c);
830
+ int *D = reinterpret_cast<int *>(&d);
831
+
832
+ asm volatile(
833
+ "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, "
834
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
835
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
836
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
837
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
838
+
839
+ #else
840
+ assert(0);
841
+ #endif
842
+ }
843
+ };
844
+
845
+ /// Matrix multiply-add operation: S32 = U8 * S8 + S32
846
+ template <>
847
+ struct Mma<
848
+ gemm::GemmShape<16,8,32>,
849
+ 32,
850
+ uint8_t,
851
+ layout::RowMajor,
852
+ int8_t,
853
+ layout::ColumnMajor,
854
+ int,
855
+ layout::RowMajor,
856
+ OpMultiplyAddSaturate> {
857
+
858
+ using Shape = gemm::GemmShape<16,8,32>;
859
+
860
+ using ElementA = uint8_t;
861
+ using LayoutA = layout::RowMajor;
862
+ using FragmentA = Array<uint8_t, 16>;
863
+
864
+ using ElementB = int8_t;
865
+ using LayoutB = layout::ColumnMajor;
866
+ using FragmentB = Array<int8_t, 8>;
867
+
868
+ using ElementC = int;
869
+ using LayoutC = layout::RowMajor;
870
+ using FragmentC = Array<int, 4>;
871
+
872
+ using Operator = OpMultiplyAddSaturate;
873
+ using ArchTag = arch::Sm80;
874
+
875
+ /// Computes multiply-add
876
+ CUTLASS_HOST_DEVICE
877
+ void operator()(
878
+ FragmentC &d,
879
+ FragmentA const &a,
880
+ FragmentB const &b,
881
+ FragmentC const &c
882
+ ) const {
883
+
884
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
885
+
886
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
887
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
888
+
889
+ int const *C = reinterpret_cast<int const *>(&c);
890
+ int *D = reinterpret_cast<int *>(&d);
891
+
892
+ asm volatile(
893
+ "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, "
894
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
895
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
896
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
897
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
898
+
899
+ #else
900
+ assert(0);
901
+ #endif
902
+ }
903
+ };
904
+
905
+ /// Matrix multiply-add operation: S32 = S8 * U8 + S32
906
+ template <>
907
+ struct Mma<
908
+ gemm::GemmShape<16,8,32>,
909
+ 32,
910
+ int8_t,
911
+ layout::RowMajor,
912
+ uint8_t,
913
+ layout::ColumnMajor,
914
+ int,
915
+ layout::RowMajor,
916
+ OpMultiplyAddSaturate> {
917
+
918
+ using Shape = gemm::GemmShape<16,8,32>;
919
+
920
+ using ElementA = int8_t;
921
+ using LayoutA = layout::RowMajor;
922
+ using FragmentA = Array<int8_t, 16>;
923
+
924
+ using ElementB = uint8_t;
925
+ using LayoutB = layout::ColumnMajor;
926
+ using FragmentB = Array<uint8_t, 8>;
927
+
928
+ using ElementC = int;
929
+ using LayoutC = layout::RowMajor;
930
+ using FragmentC = Array<int, 4>;
931
+
932
+ using Operator = OpMultiplyAddSaturate;
933
+ using ArchTag = arch::Sm80;
934
+
935
+ /// Computes multiply-add
936
+ CUTLASS_HOST_DEVICE
937
+ void operator()(
938
+ FragmentC &d,
939
+ FragmentA const &a,
940
+ FragmentB const &b,
941
+ FragmentC const &c
942
+ ) const {
943
+
944
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
945
+
946
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
947
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
948
+
949
+ int const *C = reinterpret_cast<int const *>(&c);
950
+ int *D = reinterpret_cast<int *>(&d);
951
+
952
+ asm volatile(
953
+ "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, "
954
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
955
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
956
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
957
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
958
+
959
+ #else
960
+ assert(0);
961
+ #endif
962
+ }
963
+ };
964
+
965
+ /// Matrix multiply-add operation: S32 = U8 * U8 + S32
966
+ template <>
967
+ struct Mma<
968
+ gemm::GemmShape<16,8,32>,
969
+ 32,
970
+ uint8_t,
971
+ layout::RowMajor,
972
+ uint8_t,
973
+ layout::ColumnMajor,
974
+ int,
975
+ layout::RowMajor,
976
+ OpMultiplyAddSaturate> {
977
+
978
+ using Shape = gemm::GemmShape<16,8,32>;
979
+
980
+ using ElementA = uint8_t;
981
+ using LayoutA = layout::RowMajor;
982
+ using FragmentA = Array<uint8_t, 16>;
983
+
984
+ using ElementB = uint8_t;
985
+ using LayoutB = layout::ColumnMajor;
986
+ using FragmentB = Array<uint8_t, 8>;
987
+
988
+ using ElementC = int;
989
+ using LayoutC = layout::RowMajor;
990
+ using FragmentC = Array<int, 4>;
991
+
992
+ using Operator = OpMultiplyAddSaturate;
993
+ using ArchTag = arch::Sm80;
994
+
995
+ /// Computes multiply-add
996
+ CUTLASS_HOST_DEVICE
997
+ void operator()(
998
+ FragmentC &d,
999
+ FragmentA const &a,
1000
+ FragmentB const &b,
1001
+ FragmentC const &c
1002
+ ) const {
1003
+
1004
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
1005
+
1006
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
1007
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
1008
+
1009
+ int const *C = reinterpret_cast<int const *>(&c);
1010
+ int *D = reinterpret_cast<int *>(&d);
1011
+
1012
+ asm volatile(
1013
+ "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, "
1014
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
1015
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1016
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1017
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1018
+
1019
+ #else
1020
+ assert(0);
1021
+ #endif
1022
+ }
1023
+ };
1024
+
1025
+ ////////////////////////////////////////////////////////////////////////////////
1026
+ //
1027
+ // Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE
1028
+ //
1029
+ ////////////////////////////////////////////////////////////////////////////////
1030
+
1031
+ /// Matrix multiply-add operation: S32 = S4 * S4 + S32
1032
+ template <>
1033
+ struct Mma<
1034
+ gemm::GemmShape<16, 8, 64>,
1035
+ 32,
1036
+ cutlass::int4b_t,
1037
+ layout::RowMajor,
1038
+ cutlass::int4b_t,
1039
+ layout::ColumnMajor,
1040
+ int,
1041
+ layout::RowMajor,
1042
+ OpMultiplyAddSaturate> {
1043
+
1044
+ using Shape = gemm::GemmShape<16, 8, 64>;
1045
+
1046
+ using ElementA = cutlass::int4b_t;
1047
+ using LayoutA = layout::RowMajor;
1048
+ using FragmentA = Array<cutlass::int4b_t, 32>;
1049
+
1050
+ using ElementB = cutlass::int4b_t;
1051
+ using LayoutB = layout::ColumnMajor;
1052
+ using FragmentB = Array<cutlass::int4b_t, 16>;
1053
+
1054
+ using ElementC = int;
1055
+ using LayoutC = layout::RowMajor;
1056
+ using FragmentC = Array<int, 4>;
1057
+
1058
+ using Operator = OpMultiplyAddSaturate;
1059
+ using ArchTag = arch::Sm80;
1060
+
1061
+ /// Computes multiply-add
1062
+ CUTLASS_HOST_DEVICE
1063
+ void operator()(
1064
+ FragmentC &d,
1065
+ FragmentA const &a,
1066
+ FragmentB const &b,
1067
+ FragmentC const &c
1068
+ ) const {
1069
+
1070
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
1071
+
1072
+ uint32_t const * A = reinterpret_cast<uint32_t const *>(&a);
1073
+ uint32_t const * B = reinterpret_cast<uint32_t const *>(&b);
1074
+
1075
+ int const *C = reinterpret_cast<int const *>(&c);
1076
+ int *D = reinterpret_cast<int *>(&d);
1077
+
1078
+ asm volatile(
1079
+ "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, "
1080
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
1081
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1082
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1083
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1084
+
1085
+ #else
1086
+ CUTLASS_UNUSED(a);
1087
+ CUTLASS_UNUSED(b);
1088
+ CUTLASS_UNUSED(c);
1089
+ CUTLASS_UNUSED(d);
1090
+ assert(0);
1091
+ #endif
1092
+ }
1093
+ };
1094
+
1095
+ /// Matrix multiply-add operation: S32 = U4 * S4 + S32
1096
+ template <>
1097
+ struct Mma<
1098
+ gemm::GemmShape<16, 8, 64>,
1099
+ 32,
1100
+ cutlass::uint4b_t,
1101
+ layout::RowMajor,
1102
+ cutlass::int4b_t,
1103
+ layout::ColumnMajor,
1104
+ int,
1105
+ layout::RowMajor,
1106
+ OpMultiplyAddSaturate> {
1107
+
1108
+ using Shape = gemm::GemmShape<16, 8, 64>;
1109
+
1110
+ using ElementA = cutlass::uint4b_t;
1111
+ using LayoutA = layout::RowMajor;
1112
+ using FragmentA = Array<cutlass::uint4b_t, 32>;
1113
+
1114
+ using ElementB = cutlass::int4b_t;
1115
+ using LayoutB = layout::ColumnMajor;
1116
+ using FragmentB = Array<cutlass::int4b_t, 16>;
1117
+
1118
+ using ElementC = int;
1119
+ using LayoutC = layout::RowMajor;
1120
+ using FragmentC = Array<int, 4>;
1121
+
1122
+ using Operator = OpMultiplyAddSaturate;
1123
+ using ArchTag = arch::Sm80;
1124
+
1125
+ /// Computes multiply-add
1126
+ CUTLASS_HOST_DEVICE
1127
+ void operator()(
1128
+ FragmentC &d,
1129
+ FragmentA const &a,
1130
+ FragmentB const &b,
1131
+ FragmentC const &c
1132
+ ) const {
1133
+
1134
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
1135
+
1136
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
1137
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
1138
+
1139
+ int const *C = reinterpret_cast<int const *>(&c);
1140
+ int *D = reinterpret_cast<int *>(&d);
1141
+
1142
+ asm volatile(
1143
+ "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, "
1144
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
1145
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1146
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1147
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1148
+
1149
+ #else
1150
+ CUTLASS_UNUSED(a);
1151
+ CUTLASS_UNUSED(b);
1152
+ CUTLASS_UNUSED(c);
1153
+ CUTLASS_UNUSED(d);
1154
+ assert(0);
1155
+ #endif
1156
+ }
1157
+ };
1158
+
1159
+ /// Matrix multiply-add operation: S32 = S4 * U4 + S32
1160
+ template <>
1161
+ struct Mma<
1162
+ gemm::GemmShape<16, 8, 64>,
1163
+ 32,
1164
+ cutlass::int4b_t,
1165
+ layout::RowMajor,
1166
+ cutlass::uint4b_t,
1167
+ layout::ColumnMajor,
1168
+ int,
1169
+ layout::RowMajor,
1170
+ OpMultiplyAddSaturate> {
1171
+
1172
+ using Shape = gemm::GemmShape<16, 8, 64>;
1173
+
1174
+ using ElementA = cutlass::int4b_t;
1175
+ using LayoutA = layout::RowMajor;
1176
+ using FragmentA = Array<cutlass::int4b_t, 32>;
1177
+
1178
+ using ElementB = cutlass::uint4b_t;
1179
+ using LayoutB = layout::ColumnMajor;
1180
+ using FragmentB = Array<cutlass::uint4b_t, 16>;
1181
+
1182
+ using ElementC = int;
1183
+ using LayoutC = layout::RowMajor;
1184
+ using FragmentC = Array<int, 4>;
1185
+
1186
+ using Operator = OpMultiplyAddSaturate;
1187
+ using ArchTag = arch::Sm80;
1188
+
1189
+ /// Computes multiply-add
1190
+ CUTLASS_HOST_DEVICE
1191
+ void operator()(
1192
+ FragmentC &d,
1193
+ FragmentA const &a,
1194
+ FragmentB const &b,
1195
+ FragmentC const &c
1196
+ ) const {
1197
+
1198
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
1199
+
1200
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
1201
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
1202
+
1203
+ int const *C = reinterpret_cast<int const *>(&c);
1204
+ int *D = reinterpret_cast<int *>(&d);
1205
+
1206
+ asm volatile(
1207
+ "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, "
1208
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
1209
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1210
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1211
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1212
+
1213
+ #else
1214
+ CUTLASS_UNUSED(a);
1215
+ CUTLASS_UNUSED(b);
1216
+ CUTLASS_UNUSED(c);
1217
+ CUTLASS_UNUSED(d);
1218
+ assert(0);
1219
+ #endif
1220
+ }
1221
+ };
1222
+
1223
+ /// Matrix multiply-add operation: S32 = U4 * U4 + S32
1224
+ template <>
1225
+ struct Mma<
1226
+ gemm::GemmShape<16, 8, 64>,
1227
+ 32,
1228
+ cutlass::uint4b_t,
1229
+ layout::RowMajor,
1230
+ cutlass::uint4b_t,
1231
+ layout::ColumnMajor,
1232
+ int,
1233
+ layout::RowMajor,
1234
+ OpMultiplyAddSaturate> {
1235
+
1236
+ using Shape = gemm::GemmShape<16, 8, 64>;
1237
+
1238
+ using ElementA = cutlass::uint4b_t;
1239
+ using LayoutA = layout::RowMajor;
1240
+ using FragmentA = Array<cutlass::uint4b_t, 32>;
1241
+
1242
+ using ElementB = cutlass::uint4b_t;
1243
+ using LayoutB = layout::ColumnMajor;
1244
+ using FragmentB = Array<cutlass::uint4b_t, 16>;
1245
+
1246
+ using ElementC = int;
1247
+ using LayoutC = layout::RowMajor;
1248
+ using FragmentC = Array<int, 4>;
1249
+
1250
+ using Operator = OpMultiplyAddSaturate;
1251
+ using ArchTag = arch::Sm80;
1252
+
1253
+ /// Computes multiply-add
1254
+ CUTLASS_HOST_DEVICE
1255
+ void operator()(
1256
+ FragmentC &d,
1257
+ FragmentA const &a,
1258
+ FragmentB const &b,
1259
+ FragmentC const &c
1260
+ ) const {
1261
+
1262
+ #if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
1263
+
1264
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
1265
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
1266
+
1267
+ int const *C = reinterpret_cast<int const *>(&c);
1268
+ int *D = reinterpret_cast<int *>(&d);
1269
+
1270
+ asm volatile(
1271
+ "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, "
1272
+ "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
1273
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1274
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1275
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1276
+
1277
+ #else
1278
+ CUTLASS_UNUSED(a);
1279
+ CUTLASS_UNUSED(b);
1280
+ CUTLASS_UNUSED(c);
1281
+ CUTLASS_UNUSED(d);
1282
+ assert(0);
1283
+ #endif
1284
+ }
1285
+ };
1286
+
1287
+ ////////////////////////////////////////////////////////////////////////////////
1288
+ //
1289
+ // Matrix Multiply 168256 - B1 input, S32 accumulation - AND,POPC
1290
+ //
1291
+ ////////////////////////////////////////////////////////////////////////////////
1292
+
1293
+ /// Matrix multiply-add operation: S32 = B1 & B1 + S32
1294
+ template <>
1295
+ struct Mma<
1296
+ gemm::GemmShape<16,8,256>,
1297
+ 32,
1298
+ cutlass::uint1b_t,
1299
+ layout::RowMajor,
1300
+ cutlass::uint1b_t,
1301
+ layout::ColumnMajor,
1302
+ int32_t,
1303
+ layout::RowMajor,
1304
+ OpAndPopc> {
1305
+
1306
+ using Shape = gemm::GemmShape<16,8,256>;
1307
+
1308
+ using ElementA = cutlass::uint1b_t;
1309
+ using LayoutA = layout::RowMajor;
1310
+ using FragmentA = Array<cutlass::uint1b_t, 128>;
1311
+
1312
+ using ElementB = cutlass::uint1b_t;
1313
+ using LayoutB = layout::ColumnMajor;
1314
+ using FragmentB = Array<cutlass::uint1b_t, 64>;
1315
+
1316
+ using ElementC = int32_t;
1317
+ using LayoutC = layout::RowMajor;
1318
+ using FragmentC = Array<int32_t, 4>;
1319
+
1320
+ using Operator = OpAndPopc;
1321
+ using ArchTag = arch::Sm80;
1322
+
1323
+ /// Computes multiply-add
1324
+ CUTLASS_HOST_DEVICE
1325
+ void operator()(
1326
+ FragmentC &d,
1327
+ FragmentA const &a,
1328
+ FragmentB const &b,
1329
+ FragmentC const &c
1330
+ ) const {
1331
+
1332
+ #if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED)
1333
+
1334
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
1335
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
1336
+
1337
+ int const *C = reinterpret_cast<int const *>(&c);
1338
+ int *D = reinterpret_cast<int *>(&d);
1339
+
1340
+ asm volatile(
1341
+ "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, "
1342
+ "{%4,%5,%6,%7}, "
1343
+ "{%8,%9}, {%10,%11,%12,%13};\n"
1344
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1345
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1346
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1347
+
1348
+ #else
1349
+ CUTLASS_UNUSED(a);
1350
+ CUTLASS_UNUSED(b);
1351
+ CUTLASS_UNUSED(c);
1352
+ CUTLASS_UNUSED(d);
1353
+ assert(0);
1354
+ #endif
1355
+ }
1356
+ };
1357
+
1358
+ /// Matrix multiply-add operation: S32 = B1 & B1 + S32
1359
+ template <>
1360
+ struct Mma<
1361
+ gemm::GemmShape<16,8,256>,
1362
+ 32,
1363
+ cutlass::uint1b_t,
1364
+ layout::RowMajor,
1365
+ cutlass::uint1b_t,
1366
+ layout::ColumnMajor,
1367
+ int,
1368
+ layout::RowMajor,
1369
+ OpMultiplyAdd> {
1370
+
1371
+ using Shape = gemm::GemmShape<16,8,256>;
1372
+
1373
+ using ElementA = cutlass::uint1b_t;
1374
+ using LayoutA = layout::RowMajor;
1375
+ using FragmentA = Array<cutlass::uint1b_t, 128>;
1376
+
1377
+ using ElementB = cutlass::uint1b_t;
1378
+ using LayoutB = layout::ColumnMajor;
1379
+ using FragmentB = Array<cutlass::uint1b_t, 64>;
1380
+
1381
+ using ElementC = int32_t;
1382
+ using LayoutC = layout::RowMajor;
1383
+ using FragmentC = Array<int32_t, 4>;
1384
+
1385
+ using Operator = OpMultiplyAdd;
1386
+ using ArchTag = arch::Sm80;
1387
+
1388
+ /// Computes multiply-add
1389
+ CUTLASS_HOST_DEVICE
1390
+ void operator()(
1391
+ FragmentC &d,
1392
+ FragmentA const &a,
1393
+ FragmentB const &b,
1394
+ FragmentC const &c
1395
+ ) const {
1396
+
1397
+ #if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED)
1398
+
1399
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
1400
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
1401
+
1402
+ int const *C = reinterpret_cast<int const *>(&c);
1403
+ int *D = reinterpret_cast<int *>(&d);
1404
+
1405
+ asm volatile(
1406
+ "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, "
1407
+ "{%4,%5,%6,%7}, "
1408
+ "{%8,%9}, {%10,%11,%12,%13};\n"
1409
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1410
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1411
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1412
+
1413
+ #else
1414
+ CUTLASS_UNUSED(a);
1415
+ CUTLASS_UNUSED(b);
1416
+ CUTLASS_UNUSED(c);
1417
+ CUTLASS_UNUSED(d);
1418
+ assert(0);
1419
+ #endif
1420
+ }
1421
+ };
1422
+
1423
+ ////////////////////////////////////////////////////////////////////////////////
1424
+ //
1425
+ // Matrix Multiply 168256 - B1 input, S32 accumulation - XOR,POPC
1426
+ //
1427
+ ////////////////////////////////////////////////////////////////////////////////
1428
+
1429
+ /// Matrix multiply-add operation: S32 = B1 & B1 + S32
1430
+ template <>
1431
+ struct Mma<
1432
+ gemm::GemmShape<16,8,256>,
1433
+ 32,
1434
+ cutlass::uint1b_t,
1435
+ layout::RowMajor,
1436
+ cutlass::uint1b_t,
1437
+ layout::ColumnMajor,
1438
+ int,
1439
+ layout::RowMajor,
1440
+ OpXorPopc> {
1441
+
1442
+ using Shape = gemm::GemmShape<16,8,256>;
1443
+
1444
+ using ElementA = cutlass::uint1b_t;
1445
+ using LayoutA = layout::RowMajor;
1446
+ using FragmentA = Array<cutlass::uint1b_t, 128>;
1447
+
1448
+ using ElementB = cutlass::uint1b_t;
1449
+ using LayoutB = layout::ColumnMajor;
1450
+ using FragmentB = Array<cutlass::uint1b_t, 64>;
1451
+
1452
+ using ElementC = int;
1453
+ using LayoutC = layout::RowMajor;
1454
+ using FragmentC = Array<int, 4>;
1455
+
1456
+ using Operator = OpXorPopc;
1457
+ using ArchTag = arch::Sm80;
1458
+
1459
+ /// Computes multiply-add
1460
+ CUTLASS_HOST_DEVICE
1461
+ void operator()(
1462
+ FragmentC &d,
1463
+ FragmentA const &a,
1464
+ FragmentB const &b,
1465
+ FragmentC const &c
1466
+ ) const {
1467
+
1468
+ #if defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED)
1469
+
1470
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
1471
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
1472
+
1473
+ int const *C = reinterpret_cast<int const *>(&c);
1474
+ int *D = reinterpret_cast<int *>(&d);
1475
+
1476
+ asm volatile(
1477
+ "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc {%0,%1,%2,%3}, "
1478
+ "{%4,%5,%6,%7}, "
1479
+ "{%8,%9}, {%10,%11,%12,%13};\n"
1480
+ : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
1481
+ : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
1482
+ "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
1483
+
1484
+ #else
1485
+
1486
+ CUTLASS_UNUSED(a);
1487
+ CUTLASS_UNUSED(b);
1488
+ CUTLASS_UNUSED(c);
1489
+ CUTLASS_UNUSED(d);
1490
+ assert(0);
1491
+
1492
+ #endif // defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED)
1493
+ }
1494
+ };
1495
+
1496
+ ////////////////////////////////////////////////////////////////////////////////
1497
+
1498
+ } // namespace arch
1499
+ } // namespace cutlass
1500
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm89.h ADDED
@@ -0,0 +1,641 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Matrix multiply-accumulate specialzied for SM89
34
+ */
35
+
36
+ #pragma once
37
+ #include "cutlass/cutlass.h"
38
+ #include CUDA_STD_HEADER(cassert)
39
+
40
+ #include "mma.h"
41
+ #include "cutlass/layout/matrix.h"
42
+ #include "cutlass/numeric_types.h"
43
+
44
+ ////////////////////////////////////////////////////////////////////////////////
45
+
46
+ #if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)
47
+ # define CUTLASS_ARCH_MMA_F32_SM89_SUPPORTED
48
+ #endif
49
+
50
+ #if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8)
51
+ # define CUTLASS_ARCH_MMA_F16_SM89_SUPPORTED
52
+ #endif
53
+
54
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
55
+ # if defined(CUTLASS_ARCH_MMA_F32_SM89_SUPPORTED)
56
+ # define CUTLASS_ARCH_MMA_F32_SM89_ENABLED
57
+ # endif
58
+
59
+ # if defined(CUTLASS_ARCH_MMA_F16_SM89_SUPPORTED)
60
+ # define CUTLASS_ARCH_MMA_F16_SM89_ENABLED
61
+ # endif
62
+ #endif
63
+
64
+ ////////////////////////////////////////////////////////////////////////////////
65
+
66
+ namespace cutlass {
67
+ namespace arch {
68
+
69
+ ////////////////////////////////////////////////////////////////////////////////
70
+
71
+ namespace detail {
72
+
73
+ // Whether the Mma uses as SM89 staged accumulation policy
74
+ template <class Operator>
75
+ static constexpr bool is_sm89_staged_policy_v =
76
+ (
77
+ // ElementA must be FP8
78
+ platform::is_same<typename Operator::ElementA, cutlass::float_e4m3_t>::value ||
79
+ platform::is_same<typename Operator::ElementA, cutlass::float_e5m2_t>::value
80
+ ) &&
81
+ (
82
+ // ElementB must be FP8
83
+ platform::is_same<typename Operator::ElementB, cutlass::float_e4m3_t>::value ||
84
+ platform::is_same<typename Operator::ElementB, cutlass::float_e5m2_t>::value
85
+ ) &&
86
+ (
87
+ // The instruction shape must be 16x8x32
88
+ Operator::ArchMmaOperator::Shape::kM == 16 &&
89
+ Operator::ArchMmaOperator::Shape::kN == 8 &&
90
+ Operator::ArchMmaOperator::Shape::kK == 32
91
+ ) &&
92
+ (
93
+ // The operator must be OpMultiplyAdd (default)
94
+ platform::is_same<typename Operator::MathOperator, OpMultiplyAdd>::value
95
+ );
96
+ } // namespace detail
97
+
98
+ ////////////////////////////////////////////////////////////////////////////////
99
+
100
+ ////////////////////////////////////////////////////////////////////////////////
101
+ //
102
+ // Matrix Multiply 16832 - Float {E4M3, E5M2}, FP32 accumulation
103
+ //
104
+ ////////////////////////////////////////////////////////////////////////////////
105
+
106
+ /// Matrix multiply-add operation - F32 = fe4m3 * fe4m3 + F32
107
+ template <typename Operator_>
108
+ struct Mma<
109
+ gemm::GemmShape<16, 8, 32>,
110
+ 32,
111
+ cutlass::float_e4m3_t,
112
+ layout::RowMajor,
113
+ cutlass::float_e4m3_t,
114
+ layout::ColumnMajor,
115
+ float,
116
+ layout::RowMajor,
117
+ Operator_> {
118
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
119
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
120
+ "Invalid operator for SM89 FP8 instruction");
121
+
122
+ using Shape = gemm::GemmShape<16, 8, 32>;
123
+
124
+ using ElementA = cutlass::float_e4m3_t;
125
+ using LayoutA = layout::RowMajor;
126
+ using FragmentA = Array<ElementA, 16>;
127
+
128
+ using ElementB = cutlass::float_e4m3_t;
129
+ using LayoutB = layout::ColumnMajor;
130
+ using FragmentB = Array<ElementB, 8>;
131
+
132
+ using ElementC = float;
133
+ using LayoutC = layout::RowMajor;
134
+ using FragmentC = Array<float, 4>;
135
+
136
+ using Operator = Operator_;
137
+ using ArchTag = arch::Sm89;
138
+
139
+ CUTLASS_HOST_DEVICE
140
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
141
+ FragmentC const &c) const {
142
+
143
+ #if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED)
144
+
145
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
146
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
147
+ float const *C = reinterpret_cast<float const *>(&c);
148
+ float *D = reinterpret_cast<float *>(&d);
149
+
150
+ asm(
151
+ "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
152
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
153
+ : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
154
+ :
155
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
156
+ "r"(B[0]), "r"(B[1]),
157
+ "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
158
+ );
159
+
160
+ #else
161
+
162
+ CUTLASS_UNUSED(d);
163
+ CUTLASS_UNUSED(a);
164
+ CUTLASS_UNUSED(b);
165
+ CUTLASS_UNUSED(c);
166
+ CUTLASS_NOT_IMPLEMENTED();
167
+
168
+ #endif
169
+ }
170
+ };
171
+
172
+ /// Matrix multiply-add operation - F32 = fe4m3 * fe5m2 + F32
173
+ template <typename Operator_>
174
+ struct Mma<
175
+ gemm::GemmShape<16, 8, 32>,
176
+ 32,
177
+ cutlass::float_e4m3_t,
178
+ layout::RowMajor,
179
+ cutlass::float_e5m2_t,
180
+ layout::ColumnMajor,
181
+ float,
182
+ layout::RowMajor,
183
+ Operator_> {
184
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
185
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
186
+ "Invalid operator for SM89 FP8 instruction");
187
+
188
+ using Shape = gemm::GemmShape<16, 8, 32>;
189
+
190
+ using ElementA = cutlass::float_e4m3_t;
191
+ using LayoutA = layout::RowMajor;
192
+ using FragmentA = Array<ElementA, 16>;
193
+
194
+ using ElementB = cutlass::float_e5m2_t;
195
+ using LayoutB = layout::ColumnMajor;
196
+ using FragmentB = Array<ElementB, 8>;
197
+
198
+ using ElementC = float;
199
+ using LayoutC = layout::RowMajor;
200
+ using FragmentC = Array<float, 4>;
201
+
202
+ using Operator = Operator_;
203
+ using ArchTag = arch::Sm89;
204
+
205
+ CUTLASS_HOST_DEVICE
206
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
207
+ FragmentC const &c) const {
208
+
209
+ #if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED)
210
+
211
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
212
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
213
+ float const *C = reinterpret_cast<float const *>(&c);
214
+ float *D = reinterpret_cast<float *>(&d);
215
+
216
+ asm(
217
+ "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e5m2.f32 "
218
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
219
+ : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
220
+ :
221
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
222
+ "r"(B[0]), "r"(B[1]),
223
+ "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
224
+ );
225
+
226
+ #else
227
+
228
+ CUTLASS_UNUSED(d);
229
+ CUTLASS_UNUSED(a);
230
+ CUTLASS_UNUSED(b);
231
+ CUTLASS_UNUSED(c);
232
+ CUTLASS_NOT_IMPLEMENTED();
233
+
234
+ #endif
235
+ }
236
+ };
237
+
238
+ /// Matrix multiply-add operation - F32 = fe5m2 * fe4m3 + F32
239
+ template <typename Operator_>
240
+ struct Mma<
241
+ gemm::GemmShape<16, 8, 32>,
242
+ 32,
243
+ cutlass::float_e5m2_t,
244
+ layout::RowMajor,
245
+ cutlass::float_e4m3_t,
246
+ layout::ColumnMajor,
247
+ float,
248
+ layout::RowMajor,
249
+ Operator_> {
250
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
251
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
252
+ "Invalid operator for SM89 FP8 instruction");
253
+
254
+ using Shape = gemm::GemmShape<16, 8, 32>;
255
+
256
+ using ElementA = cutlass::float_e5m2_t;
257
+ using LayoutA = layout::RowMajor;
258
+ using FragmentA = Array<ElementA, 16>;
259
+
260
+ using ElementB = cutlass::float_e4m3_t;
261
+ using LayoutB = layout::ColumnMajor;
262
+ using FragmentB = Array<ElementB, 8>;
263
+
264
+ using ElementC = float;
265
+ using LayoutC = layout::RowMajor;
266
+ using FragmentC = Array<float, 4>;
267
+
268
+ using Operator = Operator_;
269
+ using ArchTag = arch::Sm89;
270
+
271
+ CUTLASS_HOST_DEVICE
272
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
273
+ FragmentC const &c) const {
274
+
275
+ #if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED)
276
+
277
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
278
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
279
+ float const *C = reinterpret_cast<float const *>(&c);
280
+ float *D = reinterpret_cast<float *>(&d);
281
+
282
+ asm(
283
+ "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e4m3.f32 "
284
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
285
+ : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
286
+ :
287
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
288
+ "r"(B[0]), "r"(B[1]),
289
+ "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
290
+ );
291
+
292
+ #else
293
+
294
+ CUTLASS_UNUSED(d);
295
+ CUTLASS_UNUSED(a);
296
+ CUTLASS_UNUSED(b);
297
+ CUTLASS_UNUSED(c);
298
+ CUTLASS_NOT_IMPLEMENTED();
299
+
300
+ #endif
301
+ }
302
+ };
303
+
304
+ /// Matrix multiply-add operation - F32 = fe5m2 * fe5m2 + F32
305
+ template <typename Operator_>
306
+ struct Mma<
307
+ gemm::GemmShape<16, 8, 32>,
308
+ 32,
309
+ cutlass::float_e5m2_t,
310
+ layout::RowMajor,
311
+ cutlass::float_e5m2_t,
312
+ layout::ColumnMajor,
313
+ float,
314
+ layout::RowMajor,
315
+ Operator_> {
316
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
317
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
318
+ "Invalid operator for SM89 FP8 instruction");
319
+
320
+ using Shape = gemm::GemmShape<16, 8, 32>;
321
+
322
+ using ElementA = cutlass::float_e5m2_t;
323
+ using LayoutA = layout::RowMajor;
324
+ using FragmentA = Array<ElementA, 16>;
325
+
326
+ using ElementB = cutlass::float_e5m2_t;
327
+ using LayoutB = layout::ColumnMajor;
328
+ using FragmentB = Array<ElementB, 8>;
329
+
330
+ using ElementC = float;
331
+ using LayoutC = layout::RowMajor;
332
+ using FragmentC = Array<float, 4>;
333
+
334
+ using Operator = Operator_;
335
+ using ArchTag = arch::Sm89;
336
+
337
+ CUTLASS_HOST_DEVICE
338
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
339
+ FragmentC const &c) const {
340
+
341
+ #if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED)
342
+
343
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
344
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
345
+ float const *C = reinterpret_cast<float const *>(&c);
346
+ float *D = reinterpret_cast<float *>(&d);
347
+
348
+ asm(
349
+ "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e5m2.f32 "
350
+ "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
351
+ : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
352
+ :
353
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
354
+ "r"(B[0]), "r"(B[1]),
355
+ "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
356
+ );
357
+
358
+ #else
359
+
360
+ CUTLASS_UNUSED(d);
361
+ CUTLASS_UNUSED(a);
362
+ CUTLASS_UNUSED(b);
363
+ CUTLASS_UNUSED(c);
364
+ CUTLASS_NOT_IMPLEMENTED();
365
+
366
+ #endif
367
+ }
368
+ };
369
+
370
+ ////////////////////////////////////////////////////////////////////////////////
371
+ //
372
+ // Matrix Multiply 16832 - Float {E4M3, E5M2}, FP16 accumulation
373
+ //
374
+ ////////////////////////////////////////////////////////////////////////////////
375
+
376
+ /// Matrix multiply-add operation - F16 = fe4m3 * fe4m3 + F16
377
+ template <typename Operator_>
378
+ struct Mma<
379
+ gemm::GemmShape<16, 8, 32>,
380
+ 32,
381
+ cutlass::float_e4m3_t,
382
+ layout::RowMajor,
383
+ cutlass::float_e4m3_t,
384
+ layout::ColumnMajor,
385
+ cutlass::half_t,
386
+ layout::RowMajor,
387
+ Operator_> {
388
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
389
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
390
+ "Invalid operator for SM89 FP8 instruction");
391
+
392
+ using Shape = gemm::GemmShape<16, 8, 32>;
393
+
394
+ using ElementA = cutlass::float_e4m3_t;
395
+ using LayoutA = layout::RowMajor;
396
+ using FragmentA = Array<ElementA, 16>;
397
+
398
+ using ElementB = cutlass::float_e4m3_t;
399
+ using LayoutB = layout::ColumnMajor;
400
+ using FragmentB = Array<ElementB, 8>;
401
+
402
+ using ElementC = cutlass::half_t;
403
+ using LayoutC = layout::RowMajor;
404
+ using FragmentC = Array<cutlass::half_t, 4>;
405
+
406
+ using Operator = Operator_;
407
+ using ArchTag = arch::Sm89;
408
+
409
+ CUTLASS_HOST_DEVICE
410
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
411
+ FragmentC const &c) const {
412
+
413
+ #if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED)
414
+
415
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
416
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
417
+ uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
418
+ uint32_t *D = reinterpret_cast<uint32_t *>(&d);
419
+
420
+ asm(
421
+ "mma.sync.aligned.m16n8k32.row.col.f16.e4m3.e4m3.f16 "
422
+ "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
423
+ : "=r"(D[0]), "=r"(D[1])
424
+ :
425
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
426
+ "r"(B[0]), "r"(B[1]),
427
+ "r"(C[0]), "r"(C[1])
428
+ );
429
+
430
+ #else
431
+
432
+ CUTLASS_UNUSED(d);
433
+ CUTLASS_UNUSED(a);
434
+ CUTLASS_UNUSED(b);
435
+ CUTLASS_UNUSED(c);
436
+ CUTLASS_NOT_IMPLEMENTED();
437
+
438
+ #endif
439
+ }
440
+ };
441
+
442
+ /// Matrix multiply-add operation - F16 = fe4m3 * fe5m2 + F16
443
+ template <typename Operator_>
444
+ struct Mma<
445
+ gemm::GemmShape<16, 8, 32>,
446
+ 32,
447
+ cutlass::float_e4m3_t,
448
+ layout::RowMajor,
449
+ cutlass::float_e5m2_t,
450
+ layout::ColumnMajor,
451
+ cutlass::half_t,
452
+ layout::RowMajor,
453
+ Operator_> {
454
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
455
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
456
+ "Invalid operator for SM89 FP8 instruction");
457
+
458
+ using Shape = gemm::GemmShape<16, 8, 32>;
459
+
460
+ using ElementA = cutlass::float_e4m3_t;
461
+ using LayoutA = layout::RowMajor;
462
+ using FragmentA = Array<ElementA, 16>;
463
+
464
+ using ElementB = cutlass::float_e5m2_t;
465
+ using LayoutB = layout::ColumnMajor;
466
+ using FragmentB = Array<ElementB, 8>;
467
+
468
+ using ElementC = cutlass::half_t;
469
+ using LayoutC = layout::RowMajor;
470
+ using FragmentC = Array<cutlass::half_t, 4>;
471
+
472
+ using Operator = Operator_;
473
+ using ArchTag = arch::Sm89;
474
+
475
+ CUTLASS_HOST_DEVICE
476
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
477
+ FragmentC const &c) const {
478
+
479
+ #if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED)
480
+
481
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
482
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
483
+ uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
484
+ uint32_t *D = reinterpret_cast<uint32_t *>(&d);
485
+
486
+ asm(
487
+ "mma.sync.aligned.m16n8k32.row.col.f16.e4m3.e5m2.f16 "
488
+ "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
489
+ : "=r"(D[0]), "=r"(D[1])
490
+ :
491
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
492
+ "r"(B[0]), "r"(B[1]),
493
+ "r"(C[0]), "r"(C[1])
494
+ );
495
+
496
+ #else
497
+
498
+ CUTLASS_UNUSED(d);
499
+ CUTLASS_UNUSED(a);
500
+ CUTLASS_UNUSED(b);
501
+ CUTLASS_UNUSED(c);
502
+ CUTLASS_NOT_IMPLEMENTED();
503
+
504
+ #endif
505
+ }
506
+ };
507
+
508
+ /// Matrix multiply-add operation - F16 = fe5m2 * fe4m3 + F16
509
+ template <typename Operator_>
510
+ struct Mma<
511
+ gemm::GemmShape<16, 8, 32>,
512
+ 32,
513
+ cutlass::float_e5m2_t,
514
+ layout::RowMajor,
515
+ cutlass::float_e4m3_t,
516
+ layout::ColumnMajor,
517
+ cutlass::half_t,
518
+ layout::RowMajor,
519
+ Operator_> {
520
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
521
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
522
+ "Invalid operator for SM89 FP8 instruction");
523
+
524
+ using Shape = gemm::GemmShape<16, 8, 32>;
525
+
526
+ using ElementA = cutlass::float_e5m2_t;
527
+ using LayoutA = layout::RowMajor;
528
+ using FragmentA = Array<ElementA, 16>;
529
+
530
+ using ElementB = cutlass::float_e4m3_t;
531
+ using LayoutB = layout::ColumnMajor;
532
+ using FragmentB = Array<ElementB, 8>;
533
+
534
+ using ElementC = cutlass::half_t;
535
+ using LayoutC = layout::RowMajor;
536
+ using FragmentC = Array<cutlass::half_t, 4>;
537
+
538
+ using Operator = Operator_;
539
+ using ArchTag = arch::Sm89;
540
+
541
+ CUTLASS_HOST_DEVICE
542
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
543
+ FragmentC const &c) const {
544
+
545
+ #if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED)
546
+
547
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
548
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
549
+ uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
550
+ uint32_t *D = reinterpret_cast<uint32_t *>(&d);
551
+
552
+ asm(
553
+ "mma.sync.aligned.m16n8k32.row.col.f16.e5m2.e4m3.f16 "
554
+ "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
555
+ : "=r"(D[0]), "=r"(D[1])
556
+ :
557
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
558
+ "r"(B[0]), "r"(B[1]),
559
+ "r"(C[0]), "r"(C[1])
560
+ );
561
+
562
+ #else
563
+
564
+ CUTLASS_UNUSED(d);
565
+ CUTLASS_UNUSED(a);
566
+ CUTLASS_UNUSED(b);
567
+ CUTLASS_UNUSED(c);
568
+ CUTLASS_NOT_IMPLEMENTED();
569
+
570
+ #endif
571
+ }
572
+ };
573
+
574
+ /// Matrix multiply-add operation - F16 = fe5m2 * fe5m2 + F16
575
+ template <typename Operator_>
576
+ struct Mma<
577
+ gemm::GemmShape<16, 8, 32>,
578
+ 32,
579
+ cutlass::float_e5m2_t,
580
+ layout::RowMajor,
581
+ cutlass::float_e5m2_t,
582
+ layout::ColumnMajor,
583
+ cutlass::half_t,
584
+ layout::RowMajor,
585
+ Operator_> {
586
+ static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
587
+ platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
588
+ "Invalid operator for SM89 FP8 instruction");
589
+
590
+ using Shape = gemm::GemmShape<16, 8, 32>;
591
+
592
+ using ElementA = cutlass::float_e5m2_t;
593
+ using LayoutA = layout::RowMajor;
594
+ using FragmentA = Array<ElementA, 16>;
595
+
596
+ using ElementB = cutlass::float_e5m2_t;
597
+ using LayoutB = layout::ColumnMajor;
598
+ using FragmentB = Array<ElementB, 8>;
599
+
600
+ using ElementC = cutlass::half_t;
601
+ using LayoutC = layout::RowMajor;
602
+ using FragmentC = Array<cutlass::half_t, 4>;
603
+
604
+ using Operator = Operator_;
605
+ using ArchTag = arch::Sm89;
606
+
607
+ CUTLASS_HOST_DEVICE
608
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
609
+ FragmentC const &c) const {
610
+
611
+ #if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED)
612
+
613
+ uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
614
+ uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
615
+ uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
616
+ uint32_t *D = reinterpret_cast<uint32_t *>(&d);
617
+
618
+ asm(
619
+ "mma.sync.aligned.m16n8k32.row.col.f16.e5m2.e5m2.f16 "
620
+ "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
621
+ : "=r"(D[0]), "=r"(D[1])
622
+ :
623
+ "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
624
+ "r"(B[0]), "r"(B[1]),
625
+ "r"(C[0]), "r"(C[1])
626
+ );
627
+
628
+ #else
629
+
630
+ CUTLASS_UNUSED(d);
631
+ CUTLASS_UNUSED(a);
632
+ CUTLASS_UNUSED(b);
633
+ CUTLASS_UNUSED(c);
634
+ CUTLASS_NOT_IMPLEMENTED();
635
+
636
+ #endif
637
+ }
638
+ };
639
+
640
+ } // namespace arch
641
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm90.h ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+ #include "cutlass/cutlass.h"
37
+ #include CUDA_STD_HEADER(cassert)
38
+
39
+ #include "mma.h"
40
+ #include "cutlass/layout/matrix.h"
41
+ #include "cutlass/numeric_types.h"
42
+ #include "cutlass/arch/config.h"
43
+
44
+ ////////////////////////////////////////////////////////////////////////////////
45
+
46
+ namespace cutlass {
47
+ namespace arch {
48
+
49
+ ////////////////////////////////////////////////////////////////////////////////
50
+ /// Matrix Multiply-Add 16x8x4 fp64
51
+ ////////////////////////////////////////////////////////////////////////////////
52
+
53
+ /// Matrix multiply-add operation: F64 = F64 * F64 + F64
54
+ template <>
55
+ struct Mma<
56
+ gemm::GemmShape<16,8,4>,
57
+ 32,
58
+ double,
59
+ layout::RowMajor,
60
+ double,
61
+ layout::ColumnMajor,
62
+ double,
63
+ layout::RowMajor,
64
+ OpMultiplyAdd> {
65
+
66
+ using Shape = gemm::GemmShape<16,8,4>;
67
+
68
+ using ElementA = double;
69
+ using LayoutA = layout::RowMajor;
70
+ using FragmentA = Array<double, 2>;
71
+
72
+ using ElementB = double;
73
+ using LayoutB = layout::ColumnMajor;
74
+ using FragmentB = Array<double, 1>;
75
+
76
+ using ElementC = double;
77
+ using LayoutC = layout::RowMajor;
78
+ using FragmentC = Array<double, 4>;
79
+
80
+ using Operator = OpMultiplyAdd;
81
+
82
+ using ArchTag = arch::Sm90;
83
+
84
+ CUTLASS_HOST_DEVICE
85
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
86
+ FragmentC const &c) const {
87
+
88
+ #if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
89
+
90
+ double const *A = reinterpret_cast<double const *>(&a);
91
+ double const *B = reinterpret_cast<double const *>(&b);
92
+
93
+ double const *C = reinterpret_cast<double const *>(&c);
94
+ double *D = reinterpret_cast<double *>(&d);
95
+
96
+ asm volatile("mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn {%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n"
97
+ : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
98
+ : "d"(A[0]), "d"(A[1]),
99
+ "d"(B[0]),
100
+ "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
101
+
102
+ #else
103
+ CUTLASS_UNUSED(d);
104
+ CUTLASS_UNUSED(a);
105
+ CUTLASS_UNUSED(b);
106
+ CUTLASS_UNUSED(c);
107
+ CUTLASS_NOT_IMPLEMENTED();
108
+ #endif
109
+ }
110
+ };
111
+
112
+ ////////////////////////////////////////////////////////////////////////////////
113
+ /// Matrix Multiply-Add 16x8x8 fp64
114
+ ////////////////////////////////////////////////////////////////////////////////
115
+
116
+ /// Matrix multiply-add operation: F64 = F64 * F64 + F64
117
+ template <>
118
+ struct Mma<
119
+ gemm::GemmShape<16,8,8>,
120
+ 32,
121
+ double,
122
+ layout::RowMajor,
123
+ double,
124
+ layout::ColumnMajor,
125
+ double,
126
+ layout::RowMajor,
127
+ OpMultiplyAdd> {
128
+
129
+ using Shape = gemm::GemmShape<16,8,8>;
130
+
131
+ using ElementA = double;
132
+ using LayoutA = layout::RowMajor;
133
+ using FragmentA = Array<double, 4>;
134
+
135
+ using ElementB = double;
136
+ using LayoutB = layout::ColumnMajor;
137
+ using FragmentB = Array<double, 2>;
138
+
139
+ using ElementC = double;
140
+ using LayoutC = layout::RowMajor;
141
+ using FragmentC = Array<double, 4>;
142
+
143
+ using Operator = OpMultiplyAdd;
144
+
145
+ using ArchTag = arch::Sm90;
146
+
147
+ CUTLASS_HOST_DEVICE
148
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
149
+ FragmentC const &c) const {
150
+
151
+ #if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
152
+
153
+ double const *A = reinterpret_cast<double const *>(&a);
154
+ double const *B = reinterpret_cast<double const *>(&b);
155
+
156
+ double const *C = reinterpret_cast<double const *>(&c);
157
+ double *D = reinterpret_cast<double *>(&d);
158
+
159
+ asm volatile("mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
160
+ : "=d"(D[0]), "=d"(d[1]), "=d"(d[2]), "=d"(d[3])
161
+ : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]),
162
+ "d"(B[0]), "d"(B[1]),
163
+ "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
164
+
165
+ #else
166
+
167
+ CUTLASS_UNUSED(d);
168
+ CUTLASS_UNUSED(a);
169
+ CUTLASS_UNUSED(b);
170
+ CUTLASS_UNUSED(c);
171
+ CUTLASS_NOT_IMPLEMENTED();
172
+ #endif
173
+ }
174
+ };
175
+
176
+ ////////////////////////////////////////////////////////////////////////////////
177
+ /// Matrix Multiply-Add 16x8x16 fp64
178
+ ////////////////////////////////////////////////////////////////////////////////
179
+
180
+ /// Matrix multiply-add operation: F64 = F64 * F64 + F64
181
+ template <>
182
+ struct Mma<
183
+ gemm::GemmShape<16,8,16>,
184
+ 32,
185
+ double,
186
+ layout::RowMajor,
187
+ double,
188
+ layout::ColumnMajor,
189
+ double,
190
+ layout::RowMajor,
191
+ OpMultiplyAdd> {
192
+
193
+ using Shape = gemm::GemmShape<16,8,16>;
194
+
195
+ using ElementA = double;
196
+ using LayoutA = layout::RowMajor;
197
+ using FragmentA = Array<double, 8>;
198
+
199
+ using ElementB = double;
200
+ using LayoutB = layout::ColumnMajor;
201
+ using FragmentB = Array<double, 4>;
202
+
203
+ using ElementC = double;
204
+ using LayoutC = layout::RowMajor;
205
+ using FragmentC = Array<double, 4>;
206
+
207
+ using Operator = OpMultiplyAdd;
208
+
209
+ using ArchTag = arch::Sm90;
210
+
211
+ CUTLASS_HOST_DEVICE
212
+ void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
213
+ FragmentC const &c) const {
214
+
215
+ #if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED)
216
+
217
+ double const *A = reinterpret_cast<double const *>(&a);
218
+ double const *B = reinterpret_cast<double const *>(&b);
219
+
220
+ double const *C = reinterpret_cast<double const *>(&c);
221
+ double *D = reinterpret_cast<double *>(&d);
222
+
223
+ asm volatile("mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7, %8, %9, %10, %11}, {%12, %13, %14, %15}, {%16, %17, %18, %19};\n"
224
+ : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3])
225
+ : "d"(A[0]), "d"(A[2]), "d"(A[2]), "d"(A[3]), "d"(A[4]), "d"(A[5]), "d"(A[6]), "d"(A[7]),
226
+ "d"(B[0]), "d"(B[1]), "d"(B[2]), "d"(B[3]),
227
+ "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3]));
228
+
229
+ #else
230
+ CUTLASS_NOT_IMPLEMENTED();
231
+ #endif
232
+ }
233
+ };
234
+
235
+ /////////////////////////////////////////////////////////////////////////////////////////////////
236
+
237
+ } // namespace arch
238
+ } // namespace cutlass
239
+
240
+ /////////////////////////////////////////////////////////////////////////////////////////////////
241
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm80.h ADDED
@@ -0,0 +1,1234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Sparse matrix multiply accumulate for SM80
34
+ */
35
+
36
+ #pragma once
37
+ #include "cutlass/cutlass.h"
38
+ #include CUDA_STD_HEADER(cassert)
39
+
40
+ #include "mma.h"
41
+ #include "cutlass/layout/matrix.h"
42
+ #include "cutlass/numeric_types.h"
43
+
44
+ /////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))
47
+
48
+ #define CUTLASS_ARCH_SPARSE_MMA_SM80_SUPPORTED 1
49
+
50
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
51
+ #define CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED
52
+ #endif
53
+
54
+ #endif
55
+
56
+ /////////////////////////////////////////////////////////////////////////////////////////////////
57
+
58
+ namespace cutlass {
59
+ namespace arch {
60
+
61
+ /////////////////////////////////////////////////////////////////////////////////////////////////
62
+
63
+ ////////////////////////////////////////////////////////////////////////////////
64
+ //
65
+ // Sparse Matrix Multiply 16832
66
+ //
67
+ ////////////////////////////////////////////////////////////////////////////////
68
+
69
/// Matrix multiply-add operation: F16 = F16 * F16 + F16
///
/// Sparse tensor-op MMA of shape m16n8k32 for SM80, issued via the PTX
/// `mma.sp` instruction. Operand A is stored in compressed form (kSparse = 2
/// logical elements per stored element, per the CUTLASS sparse-MMA
/// convention); the sparsity metadata travels separately in `E`.
template <>
struct SparseMma<
  gemm::GemmShape<16, 8, 32>,
  32,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  half_t,
  layout::RowMajor,
  OpMultiplyAdd,
  SPFormatType::Thread
> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 8>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 8>;

  using ElementC = half_t;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<half_t, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Number of valid values of the `id2` selector (0 and 1 here).
  static int const kMaxID2 = 2;

  /// Computes multiply-add: d = a * b + c.
  ///
  /// `E`   - packed sparsity metadata consumed by mma.sp.
  /// `id2` - selects which portion of the metadata word the instruction uses;
  ///         encoded as the trailing 0x0 / 0x1 immediate of the instruction.
  ///         Values other than 0 or 1 are invalid (assert).
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c, uint32_t const &E, int const id2) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // View the fragments as the 32-bit registers the PTX operands require
    // (two half_t values per register).
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
    uint32_t const *C = reinterpret_cast<uint32_t const *>(&c);
    uint32_t *D = reinterpret_cast<uint32_t *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant; older
// toolchains fall back to plain `mma.sp`.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
          "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
    }
    else if (id2 == 1) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
          "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n"
          : "=r"(D[0]), "=r"(D[1])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
    }
    else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
          "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
    }
    else if (id2 == 1) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, "
          "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n"
          : "=r"(D[0]), "=r"(D[1])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E));
    }
    else {
      assert(0);
    }
#endif

#else
    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
172
+
173
+ ////////////////////////////////////////////////////////////////////////////////
174
+
175
/// Matrix multiply-add operation: F32 = F16 * F16 + F32
///
/// Sparse tensor-op MMA of shape m16n8k32 for SM80 with FP32 accumulation,
/// issued via the PTX `mma.sp` instruction. Operand A is stored compressed
/// (kSparse = 2); sparsity metadata is passed separately in `E`.
template <>
struct SparseMma<
  gemm::GemmShape<16, 8, 32>,
  32,
  half_t,
  layout::RowMajor,
  half_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  OpMultiplyAdd,
  SPFormatType::Thread
> {

  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = half_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<half_t, 8>;

  using ElementB = half_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<half_t, 8>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Number of valid values of the `id2` selector (0 and 1 here).
  static int const kMaxID2 = 2;

  /// Computes multiply-add: d = a * b + c.
  ///
  /// `E`   - packed sparsity metadata consumed by mma.sp.
  /// `id2` - metadata selector, encoded as the trailing 0x0 / 0x1 immediate;
  ///         values other than 0 or 1 are invalid (assert).
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c, uint32_t const &E, int const id2) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // A/B are viewed as 32-bit registers (two half_t per register);
    // C/D are FP32 and bind directly to "f" operands.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
          "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
            "r"(E));
    }
    else if (id2 == 1) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
          "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
            "r"(E));
    }
    else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
          "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
            "r"(E));
    }
    else if (id2 == 1) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
          "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
            "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]),
            "r"(E));
    }
    else {
      assert(0);
    }

#endif

#else
    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
283
+
284
+ ////////////////////////////////////////////////////////////////////////////////
285
+ //
286
+ // Sparse Matrix Multiply 16832 - Float BF16, FP32 accumulation
287
+ //
288
+ ////////////////////////////////////////////////////////////////////////////////
289
+
290
/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32
///
/// Sparse tensor-op MMA of shape m16n8k32 for SM80 with BF16 inputs and FP32
/// accumulation (PTX `mma.sp`). Operand A is stored compressed (kSparse = 2);
/// sparsity metadata is passed separately in `E`.
template <>
struct SparseMma<gemm::GemmShape<16, 8, 32>, 32, bfloat16_t, layout::RowMajor,
                 bfloat16_t, layout::ColumnMajor, float, layout::RowMajor,
                 OpMultiplyAdd, SPFormatType::Thread> {
  using Shape = gemm::GemmShape<16, 8, 32>;

  using ElementA = bfloat16_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<bfloat16_t, 8>;

  using ElementB = bfloat16_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<bfloat16_t, 8>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Number of valid values of the `id2` selector (0 and 1 here).
  static int const kMaxID2 = 2;

  /// Computes multiply-add: d = a * b + c. `id2` selects the metadata half
  /// (trailing 0x0 / 0x1 immediate); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c, uint32_t const &E, int const id2) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // A/B viewed as 32-bit registers (two bf16 per register); C/D are FP32.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else if (id2 == 1) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else if (id2 == 1) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#endif

#else

    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
379
+
380
+ ////////////////////////////////////////////////////////////////////////////////
381
+ //
382
+ // Sparse Matrix Multiply 16816 - Float TF32
383
+ //
384
+ ////////////////////////////////////////////////////////////////////////////////
385
+
386
/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32
///
/// Sparse tensor-op MMA of shape m16n8k16 for SM80 with TF32 inputs and FP32
/// accumulation (PTX `mma.sp`). Operand A is stored compressed (kSparse = 2).
/// Note TF32 uses 4 metadata bits per element (kMetaSizeInBits = 4), unlike
/// the 16-bit-input variants.
template <>
struct SparseMma<gemm::GemmShape<16, 8, 16>, 32, tfloat32_t, layout::RowMajor,
                 tfloat32_t, layout::ColumnMajor, float, layout::RowMajor,
                 OpMultiplyAdd, SPFormatType::Thread> {
  using Shape = gemm::GemmShape<16, 8, 16>;

  using ElementA = tfloat32_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<tfloat32_t, 4>;

  using ElementB = tfloat32_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<tfloat32_t, 4>;

  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<float, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAdd;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 4;

  // Number of valid values of the `id2` selector (0 and 1 here).
  static int const kMaxID2 = 2;

  /// Computes multiply-add: d = a * b + c. `id2` selects the metadata half
  /// (trailing 0x0 / 0x1 immediate); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
                  FragmentC const &c, uint32_t const &E, int const id2) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // Each tf32 value occupies one 32-bit register; C/D are FP32.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else if (id2 == 1) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else if (id2 == 1) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 "
          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#endif

#else

    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
475
+
476
+ ////////////////////////////////////////////////////////////////////////////////
477
+ //
478
+ // Sparse Matrix Multiply 16864 - S8 input, S32 accumulation - SATURATE
479
+ //
480
+ ////////////////////////////////////////////////////////////////////////////////
481
+
482
/// Matrix multiply-add operation: S32 = S8 * S8 + S32
///
/// Sparse tensor-op MMA of shape m16n8k64 for SM80 with signed 8-bit inputs,
/// S32 accumulation and saturating output (`satfinite`), issued via PTX
/// `mma.sp`. Operand A is stored compressed (kSparse = 2). Only id2 == 0 is
/// valid (kMaxID2 = 1).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  int8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  using Shape = gemm::GemmShape<16,8,64>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 16>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Only id2 == 0 is supported for the 8-bit integer shape.
  static int const kMaxID2 = 1;

  /// Computes multiply-add with saturation: d = a * b + c.
  /// `E` carries the sparsity metadata; any id2 != 0 asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // A/B viewed as 32-bit registers (four int8 per register).
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#endif

#else
    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
573
+
574
/// Matrix multiply-add operation: S32 = S8 * U8 + S32
///
/// Sparse tensor-op MMA of shape m16n8k64 for SM80 mixing signed A with
/// unsigned B, S32 accumulation and saturating output (`satfinite`), issued
/// via PTX `mma.sp`. Operand A is stored compressed (kSparse = 2). Only
/// id2 == 0 is valid (kMaxID2 = 1).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  int8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  using Shape = gemm::GemmShape<16,8,64>;

  using ElementA = int8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<int8_t, 16>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Only id2 == 0 is supported for the 8-bit integer shape.
  static int const kMaxID2 = 1;

  /// Computes multiply-add with saturation: d = a * b + c.
  /// `E` carries the sparsity metadata; any id2 != 0 asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // A/B viewed as 32-bit registers (four 8-bit values per register).
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#endif

#else

    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
666
+
667
/// Matrix multiply-add operation: S32 = U8 * S8 + S32
///
/// Sparse tensor-op MMA of shape m16n8k64 for SM80 mixing unsigned A with
/// signed B, S32 accumulation and saturating output (`satfinite`), issued
/// via PTX `mma.sp`. Operand A is stored compressed (kSparse = 2). Only
/// id2 == 0 is valid (kMaxID2 = 1).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  uint8_t,
  layout::RowMajor,
  int8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  using Shape = gemm::GemmShape<16,8,64>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 16>;

  using ElementB = int8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<int8_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Only id2 == 0 is supported for the 8-bit integer shape.
  static int const kMaxID2 = 1;

  /// Computes multiply-add with saturation: d = a * b + c.
  /// `E` carries the sparsity metadata; any id2 != 0 asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // A/B viewed as 32-bit registers (four 8-bit values per register).
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#endif

#else
    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
758
+
759
/// Matrix multiply-add operation: S32 = U8 * U8 + S32
///
/// Sparse tensor-op MMA of shape m16n8k64 for SM80 with unsigned 8-bit
/// inputs, S32 accumulation and saturating output (`satfinite`), issued via
/// PTX `mma.sp`. Operand A is stored compressed (kSparse = 2). Only
/// id2 == 0 is valid (kMaxID2 = 1).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  uint8_t,
  layout::RowMajor,
  uint8_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  using Shape = gemm::GemmShape<16,8,64>;

  using ElementA = uint8_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<uint8_t, 16>;

  using ElementB = uint8_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<uint8_t, 16>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Only id2 == 0 is supported for the 8-bit integer shape.
  static int const kMaxID2 = 1;

  /// Computes multiply-add with saturation: d = a * b + c.
  /// `E` carries the sparsity metadata; any id2 != 0 asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // A/B viewed as 32-bit registers (four 8-bit values per register).
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#endif

#else
    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
850
+
851
+ ////////////////////////////////////////////////////////////////////////////////
852
+ //
853
+ // Sparse Matrix Multiply 168128 - S4 input, S32 accumulation - SATURATE
854
+ //
855
+ ////////////////////////////////////////////////////////////////////////////////
856
+
857
/// Matrix multiply-add operation: S32 = S4 * S4 + S32
///
/// Sparse tensor-op MMA of shape m16n8k128 for SM80 with signed 4-bit
/// inputs, S32 accumulation and saturating output (`satfinite`), issued via
/// PTX `mma.sp`. Operand A is stored compressed (kSparse = 2). Only
/// id2 == 0 is valid (kMaxID2 = 1).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,128>,
  32,
  cutlass::int4b_t,
  layout::RowMajor,
  cutlass::int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  using Shape = gemm::GemmShape<16,8,128>;

  using ElementA = cutlass::int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::int4b_t, 32>;

  using ElementB = cutlass::int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::int4b_t, 32>;

  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  // Per-thread 32-bit word holding the sparsity metadata for this MMA.
  using FragmentE = uint32_t;

  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  // Compression factor of operand A.
  static int const kSparse = 2;

  // Bits of metadata per logical element of A.
  static int const kMetaSizeInBits = 2;

  // Only id2 == 0 is supported for the 4-bit integer shape.
  static int const kMaxID2 = 1;

  /// Computes multiply-add with saturation: d = a * b + c.
  /// `E` carries the sparsity metadata; any id2 != 0 asserts.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // A/B viewed as 32-bit registers (eight 4-bit values per register).
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

// CUDA 12.5+ provides the `mma.sp::ordered_metadata` variant.
#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
#endif

#else

    // Sparse MMA not available on this architecture/toolchain: hard error.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
949
+
950
/// Matrix multiply-add operation: S32 = S4 * U4 + S32
///
/// Sm80 structured-sparse tensor-op specialization for a 16x8x128 tile
/// computed cooperatively by a warp of 32 threads. Operand A is stored
/// compressed by a factor of kSparse along K; metadata register E selects
/// the kept A elements (see PTX `mma.sp` documentation).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,128>,
  32,
  cutlass::int4b_t,
  layout::RowMajor,
  cutlass::uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16,8,128>;

  // Operand A: signed 4-bit integers, row-major; fragment holds only the
  // kept (non-pruned) values.
  using ElementA = cutlass::int4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::int4b_t, 32>;

  // Operand B: unsigned 4-bit integers, column-major (dense).
  using ElementB = cutlass::uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::uint4b_t, 32>;

  // Accumulator: 32-bit signed integers, four per thread.
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  /// Sparsity metadata carried in one 32-bit register per thread
  using FragmentE = uint32_t;

  /// Saturating multiply-add (maps to the ".satfinite" PTX qualifier below)
  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Compression factor of operand A along the K dimension
  static int const kSparse = 2;

  /// Width in bits of each metadata element held in E
  static int const kMetaSizeInBits = 2;

  /// Number of valid values for the id2 argument of operator()
  static int const kMaxID2 = 1;

  /// Computes multiply-add: d = a * b + c (saturating), steered by metadata E.
  /// Only id2 == 0 is supported here (kMaxID2 == 1); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // View the fragments as the packed 32-bit registers the PTX operands expect.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

  // CUDA 12.5+ exposes the mma.sp::ordered_metadata variant; older
  // toolkits fall back to plain mma.sp with identical operands.
  #if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
  #else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
  #endif

#else
    // Sparse MMA not available for this architecture/toolkit combination.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
1042
+
1043
/// Matrix multiply-add operation: S32 = U4 * S4 + S32
///
/// Sm80 structured-sparse tensor-op specialization for a 16x8x128 tile
/// computed cooperatively by a warp of 32 threads. Operand A is stored
/// compressed by a factor of kSparse along K; metadata register E selects
/// the kept A elements (see PTX `mma.sp` documentation).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,128>,
  32,
  cutlass::uint4b_t,
  layout::RowMajor,
  cutlass::int4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16,8,128>;

  // Operand A: unsigned 4-bit integers, row-major; fragment holds only the
  // kept (non-pruned) values.
  using ElementA = cutlass::uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::uint4b_t, 32>;

  // Operand B: signed 4-bit integers, column-major (dense).
  using ElementB = cutlass::int4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::int4b_t, 32>;

  // Accumulator: 32-bit signed integers, four per thread.
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  /// Sparsity metadata carried in one 32-bit register per thread
  using FragmentE = uint32_t;

  /// Saturating multiply-add (maps to the ".satfinite" PTX qualifier below)
  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Compression factor of operand A along the K dimension
  static int const kSparse = 2;

  /// Width in bits of each metadata element held in E
  static int const kMetaSizeInBits = 2;

  /// Number of valid values for the id2 argument of operator()
  static int const kMaxID2 = 1;

  /// Computes multiply-add: d = a * b + c (saturating), steered by metadata E.
  /// Only id2 == 0 is supported here (kMaxID2 == 1); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // View the fragments as the packed 32-bit registers the PTX operands expect.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

  // CUDA 12.5+ exposes the mma.sp::ordered_metadata variant; older
  // toolkits fall back to plain mma.sp with identical operands.
  #if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
  #else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
  #endif

#else
    // Sparse MMA not available for this architecture/toolkit combination.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
1135
+
1136
/// Matrix multiply-add operation: S32 = U4 * U4 + S32
///
/// Sm80 structured-sparse tensor-op specialization for a 16x8x128 tile
/// computed cooperatively by a warp of 32 threads. Operand A is stored
/// compressed by a factor of kSparse along K; metadata register E selects
/// the kept A elements (see PTX `mma.sp` documentation).
template <>
struct SparseMma<
  gemm::GemmShape<16,8,128>,
  32,
  cutlass::uint4b_t,
  layout::RowMajor,
  cutlass::uint4b_t,
  layout::ColumnMajor,
  int,
  layout::RowMajor,
  OpMultiplyAddSaturate,
  SPFormatType::Thread> {

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16,8,128>;

  // Operand A: unsigned 4-bit integers, row-major; fragment holds only the
  // kept (non-pruned) values.
  using ElementA = cutlass::uint4b_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<cutlass::uint4b_t, 32>;

  // Operand B: unsigned 4-bit integers, column-major (dense).
  using ElementB = cutlass::uint4b_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<cutlass::uint4b_t, 32>;

  // Accumulator: 32-bit signed integers, four per thread.
  using ElementC = int;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<int, 4>;

  /// Sparsity metadata carried in one 32-bit register per thread
  using FragmentE = uint32_t;

  /// Saturating multiply-add (maps to the ".satfinite" PTX qualifier below)
  using Operator = OpMultiplyAddSaturate;
  using ArchTag = arch::Sm80;

  /// Compression factor of operand A along the K dimension
  static int const kSparse = 2;

  /// Width in bits of each metadata element held in E
  static int const kMetaSizeInBits = 2;

  /// Number of valid values for the id2 argument of operator()
  static int const kMaxID2 = 1;

  /// Computes multiply-add: d = a * b + c (saturating), steered by metadata E.
  /// Only id2 == 0 is supported here (kMaxID2 == 1); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED)

    // View the fragments as the packed 32-bit registers the PTX operands expect.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    int const *C = reinterpret_cast<int const *>(&c);
    int *D = reinterpret_cast<int *>(&d);

  // CUDA 12.5+ exposes the mma.sp::ordered_metadata variant; older
  // toolkits fall back to plain mma.sp with identical operands.
  #if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5))
    if (id2 == 0) {
      asm volatile(
          "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
  #else
    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E));
    } else {
      assert(0);
    }
  #endif

#else
    // Sparse MMA not available for this architecture/toolkit combination.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
1228
+
1229
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1230
+
1231
+ } // namespace arch
1232
+ } // namespace cutlass
1233
+
1234
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm89.h ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Sparse matrix multiply accumulate for SM89
34
+ */
35
+
36
+ #pragma once
37
+ #include "cutlass/cutlass.h"
38
+ #include CUDA_STD_HEADER(cassert)
39
+
40
+ #include "mma.h"
41
+ #include "cutlass/layout/matrix.h"
42
+ #include "cutlass/numeric_types.h"
43
+
44
+ /////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ #if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)
47
+ # define CUTLASS_ARCH_SPARSE_MMA_F32_SM89_SUPPORTED
48
+ #endif
49
+
50
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
51
+ # if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_SUPPORTED)
52
+ # define CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED
53
+ # endif
54
+ #endif
55
+
56
+ /////////////////////////////////////////////////////////////////////////////////////////////////
57
+
58
+ namespace cutlass {
59
+ namespace arch {
60
+
61
+ /////////////////////////////////////////////////////////////////////////////////////////////////
62
+
63
/// Matrix multiply-add operation: F32 = fe4m3 * fe4m3 + F32
///
/// SM89 FP8 structured-sparse tensor-op specialization: one 16x8x64 tile
/// per warp of 32 threads. Operand A is stored compressed by a factor of
/// kSparse along K; metadata register E selects the kept A elements.
template <typename Operator_>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  cutlass::float_e4m3_t,
  layout::RowMajor,
  cutlass::float_e4m3_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  Operator_,
  SPFormatType::Thread> {

  // The SM89 FP8 instruction supports only the plain and fast-accumulation
  // multiply-add operators.
  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
                "Invalid operator for SM89 FP8 instruction");

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16,8,64>;

  // Operand A: e4m3 FP8, row-major; fragment holds only the kept values.
  using ElementA = cutlass::float_e4m3_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<ElementA, 16>;

  // Operand B: e4m3 FP8, column-major (dense).
  using ElementB = cutlass::float_e4m3_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<ElementB, 16>;

  // Accumulator: single-precision float, four per thread.
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<ElementC, 4>;

  /// Sparsity metadata carried in one 32-bit register per thread
  using FragmentE = uint32_t;

  using Operator = Operator_;
  using ArchTag = arch::Sm89;

  /// Compression factor of operand A along the K dimension
  static int const kSparse = 2;

  /// Width in bits of each metadata element held in E
  static int const kMetaSizeInBits = 2;

  /// Number of valid values for the id2 argument of operator()
  static int const kMaxID2 = 1;

  /// Computes multiply-add: d = a * b + c, steered by metadata E.
  /// Only id2 == 0 is supported (kMaxID2 == 1); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED)

    // View the FP8 fragments as the packed 32-bit registers the PTX expects;
    // the float accumulators bind directly via "f" constraints.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    }
    else {
      assert(0);
    }
#else
    // FP8 sparse MMA not available for this architecture/toolkit combination.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
145
+
146
+ /////////////////////////////////////////////////////////////////////////////////////////////////
147
+
148
/// Matrix multiply-add operation: F32 = fe4m3 * fe5m2 + F32
///
/// SM89 FP8 structured-sparse tensor-op specialization: one 16x8x64 tile
/// per warp of 32 threads. Operand A is stored compressed by a factor of
/// kSparse along K; metadata register E selects the kept A elements.
template <typename Operator_>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  cutlass::float_e4m3_t,
  layout::RowMajor,
  cutlass::float_e5m2_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  Operator_,
  SPFormatType::Thread> {

  // The SM89 FP8 instruction supports only the plain and fast-accumulation
  // multiply-add operators.
  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
                "Invalid operator for SM89 FP8 instruction");

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16,8,64>;

  // Operand A: e4m3 FP8, row-major; fragment holds only the kept values.
  using ElementA = cutlass::float_e4m3_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<ElementA, 16>;

  // Operand B: e5m2 FP8, column-major (dense).
  using ElementB = cutlass::float_e5m2_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<ElementB, 16>;

  // Accumulator: single-precision float, four per thread.
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<ElementC, 4>;

  /// Sparsity metadata carried in one 32-bit register per thread
  using FragmentE = uint32_t;

  using Operator = Operator_;
  using ArchTag = arch::Sm89;

  /// Compression factor of operand A along the K dimension
  static int const kSparse = 2;

  /// Width in bits of each metadata element held in E
  static int const kMetaSizeInBits = 2;

  /// Number of valid values for the id2 argument of operator()
  static int const kMaxID2 = 1;

  /// Computes multiply-add: d = a * b + c, steered by metadata E.
  /// Only id2 == 0 is supported (kMaxID2 == 1); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED)

    // View the FP8 fragments as the packed 32-bit registers the PTX expects;
    // the float accumulators bind directly via "f" constraints.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    }
    else {
      assert(0);
    }
#else
    // FP8 sparse MMA not available for this architecture/toolkit combination.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
230
+
231
+ /////////////////////////////////////////////////////////////////////////////////////////////////
232
+
233
/// Matrix multiply-add operation: F32 = fe5m2 * fe4m3 + F32
///
/// SM89 FP8 structured-sparse tensor-op specialization: one 16x8x64 tile
/// per warp of 32 threads. Operand A is stored compressed by a factor of
/// kSparse along K; metadata register E selects the kept A elements.
template <typename Operator_>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  cutlass::float_e5m2_t,
  layout::RowMajor,
  cutlass::float_e4m3_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  Operator_,
  SPFormatType::Thread> {

  // The SM89 FP8 instruction supports only the plain and fast-accumulation
  // multiply-add operators.
  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
                "Invalid operator for SM89 FP8 instruction");

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16,8,64>;

  // Operand A: e5m2 FP8, row-major; fragment holds only the kept values.
  using ElementA = cutlass::float_e5m2_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<ElementA, 16>;

  // Operand B: e4m3 FP8, column-major (dense).
  using ElementB = cutlass::float_e4m3_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<ElementB, 16>;

  // Accumulator: single-precision float, four per thread.
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<ElementC, 4>;

  /// Sparsity metadata carried in one 32-bit register per thread
  using FragmentE = uint32_t;

  using Operator = Operator_;
  using ArchTag = arch::Sm89;

  /// Compression factor of operand A along the K dimension
  static int const kSparse = 2;

  /// Width in bits of each metadata element held in E
  static int const kMetaSizeInBits = 2;

  /// Number of valid values for the id2 argument of operator()
  static int const kMaxID2 = 1;

  /// Computes multiply-add: d = a * b + c, steered by metadata E.
  /// Only id2 == 0 is supported (kMaxID2 == 1); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED)

    // View the FP8 fragments as the packed 32-bit registers the PTX expects;
    // the float accumulators bind directly via "f" constraints.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    }
    else {
      assert(0);
    }
#else
    // FP8 sparse MMA not available for this architecture/toolkit combination.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
315
+
316
+ /////////////////////////////////////////////////////////////////////////////////////////////////
317
+
318
/// Matrix multiply-add operation: F32 = fe5m2 * fe5m2 + F32
///
/// SM89 FP8 structured-sparse tensor-op specialization: one 16x8x64 tile
/// per warp of 32 threads. Operand A is stored compressed by a factor of
/// kSparse along K; metadata register E selects the kept A elements.
template <typename Operator_>
struct SparseMma<
  gemm::GemmShape<16,8,64>,
  32,
  cutlass::float_e5m2_t,
  layout::RowMajor,
  cutlass::float_e5m2_t,
  layout::ColumnMajor,
  float,
  layout::RowMajor,
  Operator_,
  SPFormatType::Thread> {

  // The SM89 FP8 instruction supports only the plain and fast-accumulation
  // multiply-add operators.
  static_assert(platform::is_same<Operator_, OpMultiplyAdd>::value ||
                platform::is_same<Operator_, OpMultiplyAddFastAccum>::value,
                "Invalid operator for SM89 FP8 instruction");

  /// Instruction shape (M-by-N-by-K)
  using Shape = gemm::GemmShape<16,8,64>;

  // Operand A: e5m2 FP8, row-major; fragment holds only the kept values.
  using ElementA = cutlass::float_e5m2_t;
  using LayoutA = layout::RowMajor;
  using FragmentA = Array<ElementA, 16>;

  // Operand B: e5m2 FP8, column-major (dense).
  using ElementB = cutlass::float_e5m2_t;
  using LayoutB = layout::ColumnMajor;
  using FragmentB = Array<ElementB, 16>;

  // Accumulator: single-precision float, four per thread.
  using ElementC = float;
  using LayoutC = layout::RowMajor;
  using FragmentC = Array<ElementC, 4>;

  /// Sparsity metadata carried in one 32-bit register per thread
  using FragmentE = uint32_t;

  using Operator = Operator_;
  using ArchTag = arch::Sm89;

  /// Compression factor of operand A along the K dimension
  static int const kSparse = 2;

  /// Width in bits of each metadata element held in E
  static int const kMetaSizeInBits = 2;

  /// Number of valid values for the id2 argument of operator()
  static int const kMaxID2 = 1;

  /// Computes multiply-add: d = a * b + c, steered by metadata E.
  /// Only id2 == 0 is supported (kMaxID2 == 1); other values assert.
  CUTLASS_HOST_DEVICE
  void operator()(
    FragmentC &d,
    FragmentA const &a,
    FragmentB const &b,
    FragmentC const &c,
    uint32_t const &E,
    int const id2
  ) const {

#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED)

    // View the FP8 fragments as the packed 32-bit registers the PTX expects;
    // the float accumulators bind directly via "f" constraints.
    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);

    float const *C = reinterpret_cast<float const *>(&c);
    float *D = reinterpret_cast<float *>(&d);

    if (id2 == 0) {
      asm volatile(
          "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, "
          "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n"
          : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
          : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
            "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E));
    }
    else {
      assert(0);
    }
#else
    // FP8 sparse MMA not available for this architecture/toolkit combination.
    CUTLASS_UNUSED(a);
    CUTLASS_UNUSED(b);
    CUTLASS_UNUSED(c);
    CUTLASS_UNUSED(d);
    assert(0);
#endif
  }
};
400
+
401
+ /////////////////////////////////////////////////////////////////////////////////////////////////
402
+
403
+ } // namespace arch
404
+ } // namespace cutlass
405
+
406
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/reg_reconfig.h ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief PTX for CTA Reconfiguration
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #if defined(__CUDACC_RTC__)
40
+ #include <cuda/std/cstdint>
41
+ #else
42
+ #include <cstdint>
43
+ #endif
44
+
45
+ #ifndef CUDA_CTA_RECONFIG_ACTIVATED
46
+ #if defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 12 && ( \
47
+ (__CUDA_ARCH__ == 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)) \
48
+ || (__CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL)) \
49
+ || (__CUDA_ARCH__ == 1010 && defined(__CUDA_ARCH_FEAT_SM101_ALL)) \
50
+ || (__CUDA_ARCH__ == 1030 && defined(__CUDA_ARCH_FEAT_SM103_ALL)) \
51
+ || (__CUDA_ARCH__ == 1200 && defined(__CUDA_ARCH_FEAT_SM120_ALL)) \
52
+ || (__CUDA_ARCH__ == 1210 && defined(__CUDA_ARCH_FEAT_SM121_ALL)) \
53
+ )
54
+ #define CUDA_CTA_RECONFIG_ACTIVATED 1
55
+ #endif
56
+
57
+ #if defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 12 && ( \
58
+ (__CUDA_ARCH__ == 1000 && CUDA_ARCH_FAMILY(1000)) \
59
+ || (__CUDA_ARCH__ == 1010 && CUDA_ARCH_FAMILY(1010)) \
60
+ || (__CUDA_ARCH__ == 1030 && CUDA_ARCH_FAMILY(1030)) \
61
+ || (__CUDA_ARCH__ == 1200 && CUDA_ARCH_FAMILY(1200)) \
62
+ || (__CUDA_ARCH__ == 1210 && CUDA_ARCH_CONDITIONAL_OR_FAMILY(1210)) \
63
+ )
64
+ #define CUDA_CTA_RECONFIG_ACTIVATED 1
65
+ #endif
66
+
67
+ #endif
68
+
69
+ namespace cutlass {
70
+ namespace arch {
71
+
72
+ template<uint32_t RegCount>
73
+ CUTLASS_DEVICE
74
+ void warpgroup_reg_alloc(){
75
+ #if CUDA_CTA_RECONFIG_ACTIVATED
76
+ asm volatile( "setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
77
+ #endif
78
+ }
79
+
80
+ template<uint32_t RegCount>
81
+ CUTLASS_DEVICE
82
+ void warpgroup_reg_dealloc(){
83
+ #if CUDA_CTA_RECONFIG_ACTIVATED
84
+ asm volatile( "setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount) );
85
+ #endif
86
+ }
87
+
88
+ } // namespace arch
89
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd.h ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Templates exposing SIMD operators
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/arch/array.h"
38
+ #include "cutlass/arch/numeric_types.h"
39
+
40
+ namespace cutlass {
41
+ namespace arch {
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ //
46
+ // Element-wise operators
47
+ //
48
+
49
+ CUTLASS_HOST_DEVICE
50
+ template <typename T, int N>
51
+ Array<T, N> operator*(Array<T, N> const &a, Array<T, N> const &b) {
52
+ Array<T, N> d;
53
+ CUTLASS_PRAGMA_UNROLL
54
+ for (int i = 0; i < N; ++i) {
55
+ d[i] = a[i] * b[i];
56
+ }
57
+ return d;
58
+ }
59
+
60
+ CUTLASS_HOST_DEVICE
61
+ template <typename T, int N>
62
+ Array<T, N> operator+(Array<T, N> const &a, Array<T, N> const &b) {
63
+ Array<T, N> d;
64
+ CUTLASS_PRAGMA_UNROLL
65
+ for (int i = 0; i < N; ++i) {
66
+ d[i] = a[i] + b[i];
67
+ }
68
+ return d;
69
+ }
70
+
71
+ CUTLASS_HOST_DEVICE
72
+ template <typename T, int N>
73
+ Array<T, N> operator-(Array<T, N> const &a, Array<T, N> const &b) {
74
+ Array<T, N> d;
75
+ CUTLASS_PRAGMA_UNROLL
76
+ for (int i = 0; i < N; ++i) {
77
+ d[i] = a[i] - b[i];
78
+ }
79
+ return d;
80
+ }
81
+
82
+ /////////////////////////////////////////////////////////////////////////////////////////////////
83
+
84
+ //
85
+ // Multiply-accumulate operators
86
+ //
87
+
88
+ CUTLASS_HOST_DEVICE
89
+ template <typename T, int N>
90
+ Array<T, N> mac(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) {
91
+ Array<T, N> d;
92
+ CUTLASS_PRAGMA_UNROLL
93
+ for (int i = 0; i < N; ++i) {
94
+ d[i] = a[i] * b[i] + c[i];
95
+ }
96
+ return d;
97
+ }
98
+
99
+ /////////////////////////////////////////////////////////////////////////////////////////////////
100
+
101
+ //
102
+ // Dot product operator
103
+ //
104
+
105
+ CUTLASS_HOST_DEVICE
106
+ template <typename Element, typename Accumulator, int N>
107
+ Accumulator dot(Array<T, N> const &a, Array<T, N> const &b, Accumulator accum) {
108
+ CUTLASS_PRAGMA_UNROLL
109
+ for (int i = 0; i < N; ++i) {
110
+ accum += a[i] * b[i];
111
+ }
112
+ return accum;
113
+ }
114
+
115
+ /////////////////////////////////////////////////////////////////////////////////////////////////
116
+
117
+ } // namespace arch
118
+ } // namespace cutlass
119
+
120
+ /////////////////////////////////////////////////////////////////////////////////////////////////
121
+
122
+ #include "simd_sm60.h"
123
+ #include "simd_sm61.h"
124
+
125
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm60.h ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Templates exposing SIMD operators for SM60
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "simd.h"
38
+
39
+ namespace cutlass {
40
+ namespace arch {
41
+
42
+ /////////////////////////////////////////////////////////////////////////////////////////////////
43
+
44
+ //
45
+ // Element-wise operators - specialized for half_t x 2
46
+ //
47
+
48
+ CUTLASS_HOST_DEVICE
49
+ template <>
50
+ Array<half_t, 2> operator*(Array<half_t, 2> const &a, Array<half_t, 2> const &b) {
51
+ Array<half_t, 2> d;
52
+
53
+ return d;
54
+ }
55
+
56
+ CUTLASS_HOST_DEVICE
57
+ template <>
58
+ Array<half_t, 2> operator+(AArray<half_t, 2> const &a, Array<half_t, 2> const &b) {
59
+ Array<half_t, 2> d;
60
+
61
+ return d;
62
+ }
63
+
64
+ CUTLASS_HOST_DEVICE
65
+ template <>
66
+ Array<half_t, 2> operator-(Array<half_t, 2> const &a, Array<half_t, 2> const &b) {
67
+ Array<T, N> d;
68
+
69
+ return d;
70
+ }
71
+
72
+ /////////////////////////////////////////////////////////////////////////////////////////////////
73
+
74
+ /// Multiply-accumulate operators - specialized for half_t x 2
75
+ CUTLASS_HOST_DEVICE
76
+ template <>
77
+ Array<half_t, 2> mac(Array<half_t, 2> const &a, Array<half_t, 2> const &b, Array<half_t, 2> const &c) {
78
+ Array<half_t, 2> d;
79
+
80
+ return d;
81
+ }
82
+
83
+ /////////////////////////////////////////////////////////////////////////////////////////////////
84
+
85
+ /// Dot product operator - specialized for half_t <- (half_t * half_t) x 2 + half_t
86
+ CUTLASS_HOST_DEVICE
87
+ template <>
88
+ half_t dot(Array<half_t, 2> const &a, Array<half_t, 2> const &b, half_t accum) {
89
+
90
+ return accum;
91
+ }
92
+
93
+ /// Dot product operator - specialized for float <- (half_t * half_t) x 2 + float
94
+ CUTLASS_HOST_DEVICE
95
+ template <>
96
+ float dot(Array<half_t, 2> const &a, Array<half_t, 2> const &b, float accum) {
97
+
98
+ return accum;
99
+ }
100
+
101
+ /////////////////////////////////////////////////////////////////////////////////////////////////
102
+
103
+ } // namespace arch
104
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm61.h ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Templates exposing SIMD operators for SM61
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "simd.h"
38
+
39
+ namespace cutlass {
40
+ namespace arch {
41
+
42
+ /////////////////////////////////////////////////////////////////////////////////////////////////
43
+
44
+ /// Dot product operator - specialized for int32_t <- (int8_t * int8_t) x 4 + int32_t
45
+ CUTLASS_HOST_DEVICE
46
+ template <>
47
+ int32_t dot(Array<int8_t, 4> const &a, Array<int8_t, 4> const &b, int32_t accum) {
48
+
49
+ return accum;
50
+ }
51
+
52
+ /// Dot product operator - specialized for int32_t <- (uint8_t * int8_t) x 4 + int32_t
53
+ CUTLASS_HOST_DEVICE
54
+ template <>
55
+ int32_t dot(Array<uint8_t, 4> const &a, Array<int8_t, 4> const &b, int32_t accum) {
56
+
57
+ return accum;
58
+ }
59
+
60
+ /// Dot product operator - specialized for int32_t <- (int8_t * uint8_t) x 4 + int32_t
61
+ CUTLASS_HOST_DEVICE
62
+ template <>
63
+ int32_t dot(Array<int8_t, 4> const &a, Array<uint8_t, 4> const &b, int32_t accum) {
64
+
65
+ return accum;
66
+ }
67
+
68
+ /// Dot product operator - specialized for int32_t <- (uint8_t * uint8_t) x 4 + int32_t
69
+ CUTLASS_HOST_DEVICE
70
+ template <>
71
+ int32_t dot(Array<uint8_t, 4> const &a, Array<uint8_t, 4> const &b, int32_t accum) {
72
+
73
+ return accum;
74
+ }
75
+
76
+ /////////////////////////////////////////////////////////////////////////////////////////////////
77
+
78
+ /// Dot product operator - specialized for int32_t <- (int16_t * int8_t) x 2 + int32_t
79
+ CUTLASS_HOST_DEVICE
80
+ template <>
81
+ int32_t dot(Array<int16_t, 2> const &a, Array<int8_t, 2> const &b, int32_t accum) {
82
+
83
+ return accum;
84
+ }
85
+
86
+ /// Dot product operator - specialized for int32_t <- (uint16_t * int8_t) x 2 + int32_t
87
+ CUTLASS_HOST_DEVICE
88
+ template <>
89
+ int32_t dot(Array<uint16_t, 2> const &a, Array<int8_t, 2> const &b, int32_t accum) {
90
+
91
+ return accum;
92
+ }
93
+
94
+ /// Dot product operator - specialized for int32_t <- (int16_t * int8_t) x 2 + int32_t
95
+ CUTLASS_HOST_DEVICE
96
+ template <>
97
+ int32_t dot(Array<int16_t, 2> const &a, Array<uint8_t, 2> const &b, int32_t accum) {
98
+
99
+ return accum;
100
+ }
101
+
102
+ /// Dot product operator - specialized for int32_t <- (uint16_t * int8_t) x 2 + int32_t
103
+ CUTLASS_HOST_DEVICE
104
+ template <>
105
+ int32_t dot(Array<uint16_t, 2> const &a, Array<uint8_t, 2> const &b, int32_t accum) {
106
+
107
+ return accum;
108
+ }
109
+
110
+ /////////////////////////////////////////////////////////////////////////////////////////////////
111
+
112
+ /// Dot product operator - specialized for int32_t <- (int16_t * int16_t) x 2 + int32_t
113
+ CUTLASS_HOST_DEVICE
114
+ template <>
115
+ int32_t dot(Array<int16_t, 2> const &a, Array<int16_t, 2> const &b, int32_t accum) {
116
+
117
+ return accum;
118
+ }
119
+
120
+ /// Dot product operator - specialized for int32_t <- (uint16_t * int16_t) x 2 + int32_t
121
+ CUTLASS_HOST_DEVICE
122
+ template <>
123
+ int32_t dot(Array<uint16_t, 2> const &a, Array<int16_t, 2> const &b, int32_t accum) {
124
+
125
+ return accum;
126
+ }
127
+
128
+ /// Dot product operator - specialized for int32_t <- (int16_t * int16_t) x 2 + int32_t
129
+ CUTLASS_HOST_DEVICE
130
+ template <>
131
+ int32_t dot(Array<int16_t, 2> const &a, Array<uint16_t, 2> const &b, int32_t accum) {
132
+
133
+ return accum;
134
+ }
135
+
136
+ /// Dot product operator - specialized for int32_t <- (uint16_t * int16_t) x 2 + int32_t
137
+ CUTLASS_HOST_DEVICE
138
+ template <>
139
+ int32_t dot(Array<uint16_t, 2> const &a, Array<uint16_t, 2> const &b, int32_t accum) {
140
+
141
+ return accum;
142
+ }
143
+
144
+ /////////////////////////////////////////////////////////////////////////////////////////////////
145
+
146
+ } // namespace arch
147
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/synclog.hpp ADDED
@@ -0,0 +1,1271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Synchronization event logging for race condition debugging.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/detail/helper_macros.hpp"
38
+ #include "cutlass/cutlass.h"
39
+ #if defined(__CUDACC_RTC__)
40
+ #include CUDA_STD_HEADER(cstdint)
41
+ #else
42
+ #include <cstdint>
43
+ #endif
44
+
45
+ #if !defined(__CUDACC_RTC__)
46
+ #include <mutex>
47
+ #include <vector>
48
+ #endif
49
+
50
+ namespace cutlass {
51
+ namespace arch {
52
+
53
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
54
+
55
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
56
+
57
+ constexpr uint32_t synclog_cap = 1 << 26;
58
+
59
+ inline std::mutex synclog_mutex;
60
+ inline std::vector<uint32_t*> synclog_buf_list;
61
+ #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
62
+ CUTLASS_DEVICE uint32_t* synclog_buf;
63
+ #endif
64
+
65
/// Atomically reserves `n` uint32_t slots in the device-side sync log and
/// returns a pointer to the reserved region, or nullptr when logging is
/// unavailable (no buffer installed) or the buffer is full.
CUTLASS_DEVICE
uint32_t* synclog_alloc(uint32_t n) {
  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
  uint32_t* buf = synclog_buf;
  if (buf == nullptr) return nullptr;  // synclog_setup() has not published a buffer
  // buf[0] is the shared write cursor; claim n slots past it.
  uint32_t last = atomicAdd(&buf[0], n);
  if (last + n < synclog_cap) return buf + last + 1;  // +1 skips the cursor word itself
  // Buffer full: undo the reservation (unary minus on uint32_t wraps, i.e. subtracts n).
  // NOTE(review): rollback only fires once `last` itself passed the cap — presumably
  // to avoid racing rollbacks re-opening space mid-overflow; confirm intent.
  if (last >= synclog_cap) atomicAdd(&buf[0], -n);
  #endif
  return nullptr;
}
76
+
77
+ CUTLASS_DEVICE
78
+ void synclog_emit_prefix(uint32_t* to, uint32_t header, uint32_t line) {
79
+ #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
80
+ uint64_t time64;
81
+ asm volatile (
82
+ "mov.u64 %0, %%globaltimer;\n"
83
+ : "=l"(time64) :
84
+ );
85
+ to[0] = header;
86
+ to[1] = line;
87
+ to[2] = time64;
88
+ to[3] = time64 >> 32;
89
+ to[4] = threadIdx.x;
90
+ to[5] = threadIdx.y;
91
+ to[6] = threadIdx.z;
92
+ to[7] = blockIdx.x;
93
+ to[8] = blockIdx.y;
94
+ to[9] = blockIdx.z;
95
+ #endif
96
+ }
97
+
98
+ constexpr uint32_t synclog_header_none = 0;
99
+ constexpr uint32_t synclog_length_prefix = 1 + 1 + 2 + 3 + 3;
100
+
101
+ constexpr bool synclog_enable_syncthreads = true;
102
+ constexpr uint32_t synclog_header_syncthreads = 1;
103
+ constexpr uint32_t synclog_length_syncthreads = synclog_length_prefix + 0;
104
+
105
+ constexpr bool synclog_enable_syncwarp = true;
106
+ constexpr uint32_t synclog_header_syncwarp = 2;
107
+ constexpr uint32_t synclog_length_syncwarp = synclog_length_prefix + 0;
108
+
109
+ constexpr bool synclog_enable_named_barrier_arrive_and_wait = true;
110
+ constexpr uint32_t synclog_header_named_barrier_arrive_and_wait = 3;
111
+ constexpr uint32_t synclog_length_named_barrier_arrive_and_wait = synclog_length_prefix + 2;
112
+
113
+ constexpr bool synclog_enable_named_barrier_arrive = true;
114
+ constexpr uint32_t synclog_header_named_barrier_arrive = 4;
115
+ constexpr uint32_t synclog_length_named_barrier_arrive = synclog_length_prefix + 2;
116
+
117
+ constexpr bool synclog_enable_cluster_barrier_init = true;
118
+ constexpr uint32_t synclog_header_cluster_barrier_init = 5;
119
+ constexpr uint32_t synclog_length_cluster_barrier_init = synclog_length_prefix + 2;
120
+
121
+ constexpr bool synclog_enable_cluster_barrier_wait = true;
122
+ constexpr uint32_t synclog_header_cluster_barrier_wait = 6;
123
+ constexpr uint32_t synclog_length_cluster_barrier_wait = synclog_length_prefix + 2;
124
+ constexpr bool synclog_enable_cluster_barrier_test_wait = true;
125
+ constexpr uint32_t synclog_header_cluster_barrier_test_wait = 7;
126
+ constexpr uint32_t synclog_length_cluster_barrier_test_wait = synclog_length_prefix + 3;
127
+ constexpr bool synclog_enable_cluster_barrier_try_wait = true;
128
+ constexpr uint32_t synclog_header_cluster_barrier_try_wait = 8;
129
+ constexpr uint32_t synclog_length_cluster_barrier_try_wait = synclog_length_prefix + 2;
130
+ constexpr bool synclog_enable_cluster_barrier_arrive_cluster = true;
131
+ constexpr uint32_t synclog_header_cluster_barrier_arrive_cluster = 9;
132
+ constexpr uint32_t synclog_length_cluster_barrier_arrive_cluster = synclog_length_prefix + 3;
133
+ constexpr bool synclog_enable_cluster_barrier_arrive = true;
134
+ constexpr uint32_t synclog_header_cluster_barrier_arrive = 10;
135
+ constexpr uint32_t synclog_length_cluster_barrier_arrive = synclog_length_prefix + 1;
136
+ constexpr bool synclog_enable_cluster_barrier_invalidate = true;
137
+ constexpr uint32_t synclog_header_cluster_barrier_invalidate = 11;
138
+ constexpr uint32_t synclog_length_cluster_barrier_invalidate = synclog_length_prefix + 1;
139
+ constexpr bool synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx = true;
140
+ constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx = 12;
141
+ constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx = synclog_length_prefix + 2;
142
+ constexpr bool synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster = true;
143
+ constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster = 13;
144
+ constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster = synclog_length_prefix + 4;
145
+ constexpr bool synclog_enable_cluster_transaction_barrier_expect_transaction = true;
146
+ constexpr uint32_t synclog_header_cluster_transaction_barrier_expect_transaction = 14;
147
+ constexpr uint32_t synclog_length_cluster_transaction_barrier_expect_transaction = synclog_length_prefix + 2;
148
+ constexpr bool synclog_enable_cluster_transaction_barrier_complete_transaction = true;
149
+ constexpr uint32_t synclog_header_cluster_transaction_barrier_complete_transaction = 15;
150
+ constexpr uint32_t synclog_length_cluster_transaction_barrier_complete_transaction = synclog_length_prefix + 4;
151
+ constexpr bool synclog_enable_fence_barrier_init = true;
152
+ constexpr uint32_t synclog_header_fence_barrier_init = 16;
153
+ constexpr uint32_t synclog_length_fence_barrier_init = synclog_length_prefix + 0;
154
+
155
+ constexpr bool synclog_enable_fence_view_async_shared = true;
156
+ constexpr uint32_t synclog_header_fence_view_async_shared = 17;
157
+ constexpr uint32_t synclog_length_fence_view_async_shared = synclog_length_prefix + 0;
158
+
159
+ constexpr bool synclog_enable_cp_async_wait = true;
160
+ constexpr uint32_t synclog_header_cp_async_wait = 18;
161
+ constexpr uint32_t synclog_length_cp_async_wait = synclog_length_prefix + 1;
162
+
163
+ constexpr bool synclog_enable_cp_async_wait_all = true;
164
+ constexpr uint32_t synclog_header_cp_async_wait_all = 19;
165
+ constexpr uint32_t synclog_length_cp_async_wait_all = synclog_length_prefix + 0;
166
+
167
+ constexpr bool synclog_enable_cp_async_fence = true;
168
+ constexpr uint32_t synclog_header_cp_async_fence = 20;
169
+ constexpr uint32_t synclog_length_cp_async_fence = synclog_length_prefix + 0;
170
+
171
+ constexpr bool synclog_enable_cp_async_nan = true;
172
+ constexpr uint32_t synclog_header_cp_async_nan = 21;
173
+ constexpr uint32_t synclog_length_cp_async_nan = synclog_length_prefix + 4;
174
+
175
+ constexpr bool synclog_enable_cp_async_zfill = true;
176
+ constexpr uint32_t synclog_header_cp_async_zfill = 22;
177
+ constexpr uint32_t synclog_length_cp_async_zfill = synclog_length_prefix + 5;
178
+
179
+ constexpr bool synclog_enable_cp_async = true;
180
+ constexpr uint32_t synclog_header_cp_async = 23;
181
+ constexpr uint32_t synclog_length_cp_async = synclog_length_prefix + 5;
182
+
183
+ constexpr bool synclog_enable_tma_load = true;
184
+ constexpr uint32_t synclog_header_tma_load = 24;
185
+ constexpr uint32_t synclog_length_tma_load = synclog_length_prefix + 4;
186
+
187
+ constexpr bool synclog_enable_tma_store = true;
188
+ constexpr uint32_t synclog_header_tma_store = 25;
189
+ constexpr uint32_t synclog_length_tma_store = synclog_length_prefix + 3;
190
+
191
+ constexpr bool synclog_enable_tma_store_arrive = true;
192
+ constexpr uint32_t synclog_header_tma_store_arrive = 26;
193
+ constexpr uint32_t synclog_length_tma_store_arrive = synclog_length_prefix + 0;
194
+
195
+ constexpr bool synclog_enable_tma_store_wait = true;
196
+ constexpr uint32_t synclog_header_tma_store_wait = 27;
197
+ constexpr uint32_t synclog_length_tma_store_wait = synclog_length_prefix + 1;
198
+
199
+ constexpr bool synclog_enable_warpgroup_arrive = true;
200
+ constexpr uint32_t synclog_header_warpgroup_arrive = 28;
201
+ constexpr uint32_t synclog_length_warpgroup_arrive = synclog_length_prefix + 0;
202
+
203
+ constexpr bool synclog_enable_warpgroup_wait = true;
204
+ constexpr uint32_t synclog_header_warpgroup_wait = 29;
205
+ constexpr uint32_t synclog_length_warpgroup_wait = synclog_length_prefix + 1;
206
+
207
+ constexpr bool synclog_enable_warpgroup_commit_batch = true;
208
+ constexpr uint32_t synclog_header_warpgroup_commit_batch = 30;
209
+ constexpr uint32_t synclog_length_warpgroup_commit_batch = synclog_length_prefix + 0;
210
+
211
+ constexpr bool synclog_enable_wgmma_reg_smem = true;
212
+ constexpr uint32_t synclog_header_wgmma_reg_smem = 31;
213
+ constexpr uint32_t synclog_length_wgmma_reg_smem = synclog_length_prefix + 2;
214
+
215
+ constexpr bool synclog_enable_wgmma_smem_smem = true;
216
+ constexpr uint32_t synclog_header_wgmma_smem_smem = 32;
217
+ constexpr uint32_t synclog_length_wgmma_smem_smem = synclog_length_prefix + 4;
218
+
219
+ constexpr bool synclog_enable_cpasync_barrier_arrive = true;
220
+ constexpr uint32_t synclog_header_cpasync_barrier_arrive = 33;
221
+ constexpr uint32_t synclog_length_cpasync_barrier_arrive = synclog_length_prefix + 1;
222
+ CUTLASS_DEVICE
223
+ bool synclog_condition_emit() {
224
+ #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
225
+ return threadIdx.x % NumThreadsPerWarp == 0 && threadIdx.y == 0 && threadIdx.z == 0 &&
226
+ blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0;
227
+ #else
228
+ return 0;
229
+ #endif
230
+ }
231
+
232
/// Gate for log printing: true only for thread (0,0,0) of block (0,0,0),
/// so exactly one thread walks and prints the buffer.
CUTLASS_DEVICE
bool synclog_condition_print() {
  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
  return threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 &&
    blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0;
  #else
  return false;
  #endif
}
241
+
242
+ CUTLASS_DEVICE
243
+ void synclog_print_prefix(char const* header, uint32_t at) {
244
+ #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
245
+ uint32_t line = synclog_buf[at + 1];
246
+ uint32_t timeLo = synclog_buf[at + 2];
247
+ uint32_t timeHi = synclog_buf[at + 3];
248
+ uint32_t threadIdxX = synclog_buf[at + 4];
249
+ uint32_t threadIdxY = synclog_buf[at + 5];
250
+ uint32_t threadIdxZ = synclog_buf[at + 6];
251
+ uint32_t blockIdxX = synclog_buf[at + 7];
252
+ uint32_t blockIdxY = synclog_buf[at + 8];
253
+ uint32_t blockIdxZ = synclog_buf[at + 9];
254
+ printf(
255
+ "%s line=%u time=%lu thread=%u,%u,%u block=%u,%u,%u ",
256
+ header, line,
257
+ (uint64_t)timeHi << 32 | timeLo,
258
+ threadIdxX, threadIdxY, threadIdxZ,
259
+ blockIdxX, blockIdxY, blockIdxZ
260
+ );
261
+ #endif
262
+ }
263
+
264
/// Prints the shared-memory address encoded in the low word of a wgmma
/// descriptor. Bits [0,14) of `lo` hold the address >> 4; the shift is undone
/// before printing. `hi` is currently unused.
CUTLASS_DEVICE
void synclog_print_wgmma_desc(char const* str, uint32_t lo, uint32_t hi, char const* sep) {
  CUTLASS_UNUSED(hi);
  uint32_t smem_int_ptr = (lo & ((1 << 14) - 1)) << 4;
  printf("%s_smem_int_ptr=%u%s", str, smem_int_ptr, sep);
}
270
+
271
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
272
+
273
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
274
+
275
/// Host-side initialization: on first call, allocates one log buffer per CUDA
/// device; on every call, zeroes each buffer and publishes its address to the
/// device symbol `synclog_buf`. Any CUDA failure prints a message and
/// terminates. Thread-safe via synclog_mutex. No-op unless
/// CUTLASS_ENABLE_SYNCLOG is defined and a CUDA compiler is in use.
inline void synclog_setup() {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
  std::scoped_lock lock(synclog_mutex);
  auto fail = [] () {
    fprintf(stderr, "synclog_setup() failed\n");
    std::terminate();
  };
  // Remember the caller's active device so it can be restored at the end.
  int orig_device = 0;
  if (cudaGetDevice(&orig_device) != cudaSuccess) {
    fail();
  }
  int device_count = 0;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
    fail();
  }
  // Allocate buffers only once; subsequent calls reuse and reset them.
  if (synclog_buf_list.size() == 0) {
    for (int device = 0; device < device_count; device++) {
      uint32_t* buf = 0;
      if (cudaSetDevice(device) != cudaSuccess ||
        cudaMalloc(&buf, synclog_cap * sizeof(uint32_t)) != cudaSuccess) {
        fail();
      }
      synclog_buf_list.push_back(buf);
    }
  }
  // Zero each buffer (clears the write cursor at word 0) and install its
  // address into the device-side pointer.
  for (int device = 0; device < device_count; device++) {
    uint32_t* buf = synclog_buf_list.at(device);
    if (cudaSetDevice(device) != cudaSuccess ||
      cudaMemset(buf, 0, synclog_cap * sizeof(uint32_t)) != cudaSuccess ||
      cudaMemcpyToSymbol(synclog_buf, &buf, sizeof(buf)) != cudaSuccess) {
      fail();
    }
  }
  if (cudaSetDevice(orig_device) != cudaSuccess) {
    fail();
  }
  #endif
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
315
+
316
/// Logs a __syncthreads() event at source line `line` (no payload beyond the
/// common prefix). Silently drops the record if emission is disabled, this
/// thread is not an emitter, or the buffer is full.
CUTLASS_DEVICE
void synclog_emit_syncthreads(uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_syncthreads) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_syncthreads);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_syncthreads, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
328
+
329
/// Logs a __syncwarp() event at source line `line` (no payload beyond the
/// common prefix).
CUTLASS_DEVICE
void synclog_emit_syncwarp(uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_syncwarp) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_syncwarp);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_syncwarp, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
341
+
342
/// Logs an arrive-and-wait on a named barrier; payload: participating thread
/// count and barrier id.
CUTLASS_DEVICE
void synclog_emit_named_barrier_arrive_and_wait(
  uint32_t line,
  uint32_t num_threads,
  uint32_t barrier_id) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_named_barrier_arrive_and_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive_and_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_named_barrier_arrive_and_wait, line);
  to[synclog_length_prefix + 0] = num_threads;
  to[synclog_length_prefix + 1] = barrier_id;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(num_threads);
  CUTLASS_UNUSED(barrier_id);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
361
+
362
/// Logs an arrive (without wait) on a named barrier; payload: participating
/// thread count and barrier id.
CUTLASS_DEVICE
void synclog_emit_named_barrier_arrive(
  uint32_t line,
  uint32_t num_threads,
  uint32_t barrier_id) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_named_barrier_arrive) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_named_barrier_arrive, line);
  to[synclog_length_prefix + 0] = num_threads;
  to[synclog_length_prefix + 1] = barrier_id;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(num_threads);
  CUTLASS_UNUSED(barrier_id);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
381
+
382
/// Logs initialization of a cluster barrier; payload: barrier shared-memory
/// address and expected arrival count.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_init(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t arrive_count) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_init) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_init);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_init, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = arrive_count;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(arrive_count);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
401
+
402
/// Logs a blocking wait on a cluster barrier; payload: barrier shared-memory
/// address and the phase being waited on.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_wait(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t phase) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_wait, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = phase;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(phase);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
421
+
422
/// Logs a non-blocking test-wait on a cluster barrier; payload: barrier
/// shared-memory address, phase, and the test's predicate result.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_test_wait(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t phase,
  uint32_t pred) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_test_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_test_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_test_wait, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = phase;
  to[synclog_length_prefix + 2] = pred;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(phase);
  CUTLASS_UNUSED(pred);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}
444
+
445
+ CUTLASS_DEVICE
446
+ void synclog_emit_cluster_barrier_try_wait(
447
+ uint32_t line,
448
+ uint32_t smem_addr,
449
+ uint32_t phase) {
450
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
451
+ if constexpr (!synclog_enable_cluster_barrier_try_wait) return;
452
+ if (!synclog_condition_emit()) return;
453
+ uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_try_wait);
454
+ if (to == nullptr) return;
455
+ synclog_emit_prefix(to, synclog_header_cluster_barrier_try_wait, line);
456
+ to[synclog_length_prefix + 0] = smem_addr;
457
+ to[synclog_length_prefix + 1] = phase;
458
+ #else
459
+ CUTLASS_UNUSED(line);
460
+ CUTLASS_UNUSED(smem_addr);
461
+ CUTLASS_UNUSED(phase);
462
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
463
+ }
464
+
465
+ CUTLASS_DEVICE
466
+ void synclog_emit_cluster_barrier_arrive_cluster(
467
+ uint32_t line,
468
+ uint32_t smem_addr,
469
+ uint32_t cta_id,
470
+ uint32_t pred) {
471
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
472
+ if constexpr (!synclog_enable_cluster_barrier_arrive_cluster) return;
473
+ if (!synclog_condition_emit()) return;
474
+ uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive_cluster);
475
+ if (to == nullptr) return;
476
+ synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive_cluster, line);
477
+ to[synclog_length_prefix + 0] = smem_addr;
478
+ to[synclog_length_prefix + 1] = cta_id;
479
+ to[synclog_length_prefix + 2] = pred;
480
+ #else
481
+ CUTLASS_UNUSED(line);
482
+ CUTLASS_UNUSED(smem_addr);
483
+ CUTLASS_UNUSED(cta_id);
484
+ CUTLASS_UNUSED(pred);
485
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
486
+ }
487
+
488
+ CUTLASS_DEVICE
489
+ void synclog_emit_cluster_barrier_arrive(
490
+ uint32_t line,
491
+ uint32_t smem_addr) {
492
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
493
+ if constexpr (!synclog_enable_cluster_barrier_arrive) return;
494
+ if (!synclog_condition_emit()) return;
495
+ uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive);
496
+ if (to == nullptr) return;
497
+ synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive, line);
498
+ to[synclog_length_prefix + 0] = smem_addr;
499
+ #else
500
+ CUTLASS_UNUSED(line);
501
+ CUTLASS_UNUSED(smem_addr);
502
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
503
+ }
504
+
505
+ CUTLASS_DEVICE
506
+ void synclog_emit_cluster_barrier_invalidate(
507
+ uint32_t line,
508
+ uint32_t smem_addr) {
509
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
510
+ if constexpr (!synclog_enable_cluster_barrier_invalidate) return;
511
+ if (!synclog_condition_emit()) return;
512
+ uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_invalidate);
513
+ if (to == nullptr) return;
514
+ synclog_emit_prefix(to, synclog_header_cluster_barrier_invalidate, line);
515
+ to[synclog_length_prefix + 0] = smem_addr;
516
+ #else
517
+ CUTLASS_UNUSED(line);
518
+ CUTLASS_UNUSED(smem_addr);
519
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
520
+ }
521
+
522
+ CUTLASS_DEVICE
523
+ void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx(
524
+ uint32_t line,
525
+ uint32_t smem_addr,
526
+ uint32_t transaction_bytes) {
527
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
528
+ if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) return;
529
+ if (!synclog_condition_emit()) return;
530
+ uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx);
531
+ if (to == nullptr) return;
532
+ synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx, line);
533
+ to[synclog_length_prefix + 0] = smem_addr;
534
+ to[synclog_length_prefix + 1] = transaction_bytes;
535
+ #else
536
+ CUTLASS_UNUSED(line);
537
+ CUTLASS_UNUSED(smem_addr);
538
+ CUTLASS_UNUSED(transaction_bytes);
539
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
540
+ }
541
+
542
+ CUTLASS_DEVICE
543
+ void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx_cluster(
544
+ uint32_t line,
545
+ uint32_t smem_addr,
546
+ uint32_t transaction_bytes,
547
+ uint32_t cta_id,
548
+ uint32_t pred) {
549
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
550
+ if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) return;
551
+ if (!synclog_condition_emit()) return;
552
+ uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster);
553
+ if (to == nullptr) return;
554
+ synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster, line);
555
+ to[synclog_length_prefix + 0] = smem_addr;
556
+ to[synclog_length_prefix + 1] = transaction_bytes;
557
+ to[synclog_length_prefix + 2] = cta_id;
558
+ to[synclog_length_prefix + 3] = pred;
559
+ #else
560
+ CUTLASS_UNUSED(line);
561
+ CUTLASS_UNUSED(smem_addr);
562
+ CUTLASS_UNUSED(transaction_bytes);
563
+ CUTLASS_UNUSED(cta_id);
564
+ CUTLASS_UNUSED(pred);
565
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
566
+ }
567
+
568
+ CUTLASS_DEVICE
569
+ void synclog_emit_cluster_transaction_barrier_expect_transaction(
570
+ uint32_t line,
571
+ uint32_t smem_addr,
572
+ uint32_t transaction_bytes) {
573
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
574
+ if constexpr (!synclog_enable_cluster_transaction_barrier_expect_transaction) return;
575
+ if (!synclog_condition_emit()) return;
576
+ uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_expect_transaction);
577
+ if (to == nullptr) return;
578
+ synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_expect_transaction, line);
579
+ to[synclog_length_prefix + 0] = smem_addr;
580
+ to[synclog_length_prefix + 1] = transaction_bytes;
581
+ #else
582
+ CUTLASS_UNUSED(line);
583
+ CUTLASS_UNUSED(smem_addr);
584
+ CUTLASS_UNUSED(transaction_bytes);
585
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
586
+ }
587
+
588
+ CUTLASS_DEVICE
589
+ void synclog_emit_cluster_transaction_barrier_complete_transaction(
590
+ uint32_t line,
591
+ uint32_t smem_addr,
592
+ uint32_t dst_cta_id,
593
+ uint32_t transaction_bytes,
594
+ uint32_t pred) {
595
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
596
+ if constexpr (!synclog_enable_cluster_transaction_barrier_complete_transaction) return;
597
+ if (!synclog_condition_emit()) return;
598
+ uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_complete_transaction);
599
+ if (to == nullptr) return;
600
+ synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_complete_transaction, line);
601
+ to[synclog_length_prefix + 0] = smem_addr;
602
+ to[synclog_length_prefix + 1] = dst_cta_id;
603
+ to[synclog_length_prefix + 2] = transaction_bytes;
604
+ to[synclog_length_prefix + 3] = pred;
605
+ #else
606
+ CUTLASS_UNUSED(line);
607
+ CUTLASS_UNUSED(smem_addr);
608
+ CUTLASS_UNUSED(dst_cta_id);
609
+ CUTLASS_UNUSED(transaction_bytes);
610
+ CUTLASS_UNUSED(pred);
611
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
612
+ }
613
+
614
+ CUTLASS_DEVICE
615
+ void synclog_emit_fence_barrier_init(uint32_t line) {
616
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
617
+ if constexpr (!synclog_enable_fence_barrier_init) return;
618
+ if (!synclog_condition_emit()) return;
619
+ uint32_t* to = synclog_alloc(synclog_length_fence_barrier_init);
620
+ if (to == nullptr) return;
621
+ synclog_emit_prefix(to, synclog_header_fence_barrier_init, line);
622
+ #else
623
+ CUTLASS_UNUSED(line);
624
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
625
+ }
626
+
627
+ CUTLASS_DEVICE
628
+ void synclog_emit_fence_view_async_shared(uint32_t line) {
629
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
630
+ if constexpr (!synclog_enable_fence_view_async_shared) return;
631
+ if (!synclog_condition_emit()) return;
632
+ uint32_t* to = synclog_alloc(synclog_length_fence_view_async_shared);
633
+ if (to == nullptr) return;
634
+ synclog_emit_prefix(to, synclog_header_fence_view_async_shared, line);
635
+ #else
636
+ CUTLASS_UNUSED(line);
637
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
638
+ }
639
+
640
+ CUTLASS_DEVICE
641
+ void synclog_emit_cp_async_wait(
642
+ uint32_t line,
643
+ uint32_t n) {
644
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
645
+ if constexpr (!synclog_enable_cp_async_wait) return;
646
+ if (!synclog_condition_emit()) return;
647
+ uint32_t* to = synclog_alloc(synclog_length_cp_async_wait);
648
+ if (to == nullptr) return;
649
+ synclog_emit_prefix(to, synclog_header_cp_async_wait, line);
650
+ to[synclog_length_prefix + 0] = n;
651
+ #else
652
+ CUTLASS_UNUSED(line);
653
+ CUTLASS_UNUSED(n);
654
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
655
+ }
656
+
657
+ CUTLASS_DEVICE
658
+ void synclog_emit_cp_async_wait_all(uint32_t line) {
659
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
660
+ if constexpr (!synclog_enable_cp_async_wait_all) return;
661
+ if (!synclog_condition_emit()) return;
662
+ uint32_t* to = synclog_alloc(synclog_length_cp_async_wait_all);
663
+ if (to == nullptr) return;
664
+ synclog_emit_prefix(to, synclog_header_cp_async_wait_all, line);
665
+ #else
666
+ CUTLASS_UNUSED(line);
667
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
668
+ }
669
+
670
+ CUTLASS_DEVICE
671
+ void synclog_emit_cp_async_fence(uint32_t line) {
672
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
673
+ if constexpr (!synclog_enable_cp_async_fence) return;
674
+ if (!synclog_condition_emit()) return;
675
+ uint32_t* to = synclog_alloc(synclog_length_cp_async_fence);
676
+ if (to == nullptr) return;
677
+ synclog_emit_prefix(to, synclog_header_cp_async_fence, line);
678
+ #else
679
+ CUTLASS_UNUSED(line);
680
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
681
+ }
682
+
683
+ CUTLASS_DEVICE
684
+ void synclog_emit_cp_async_nan(
685
+ uint32_t line,
686
+ uint32_t smem_addr,
687
+ const void* gmem_ptr,
688
+ uint32_t pred) {
689
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
690
+ if constexpr (!synclog_enable_cp_async_nan) return;
691
+ if (!synclog_condition_emit()) return;
692
+ uint32_t* to = synclog_alloc(synclog_length_cp_async_nan);
693
+ if (to == nullptr) return;
694
+ synclog_emit_prefix(to, synclog_header_cp_async_nan, line);
695
+ to[synclog_length_prefix + 0] = smem_addr;
696
+ to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
697
+ to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
698
+ to[synclog_length_prefix + 3] = pred;
699
+ #else
700
+ CUTLASS_UNUSED(line);
701
+ CUTLASS_UNUSED(smem_addr);
702
+ CUTLASS_UNUSED(gmem_ptr);
703
+ CUTLASS_UNUSED(pred);
704
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
705
+ }
706
+
707
+ CUTLASS_DEVICE
708
+ void synclog_emit_cp_async_zfill(
709
+ uint32_t line,
710
+ uint32_t smem_addr,
711
+ const void* gmem_ptr,
712
+ uint32_t pred,
713
+ uint32_t size) {
714
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
715
+ if constexpr (!synclog_enable_cp_async_zfill) return;
716
+ if (!synclog_condition_emit()) return;
717
+ uint32_t* to = synclog_alloc(synclog_length_cp_async_zfill);
718
+ if (to == nullptr) return;
719
+ synclog_emit_prefix(to, synclog_header_cp_async_zfill, line);
720
+ to[synclog_length_prefix + 0] = smem_addr;
721
+ to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
722
+ to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
723
+ to[synclog_length_prefix + 3] = pred;
724
+ to[synclog_length_prefix + 4] = size;
725
+ #else
726
+ CUTLASS_UNUSED(line);
727
+ CUTLASS_UNUSED(smem_addr);
728
+ CUTLASS_UNUSED(gmem_ptr);
729
+ CUTLASS_UNUSED(pred);
730
+ CUTLASS_UNUSED(size);
731
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
732
+ }
733
+
734
+ CUTLASS_DEVICE
735
+ void synclog_emit_cp_async(
736
+ uint32_t line,
737
+ uint32_t smem_addr,
738
+ const void* gmem_ptr,
739
+ uint32_t pred,
740
+ uint32_t size) {
741
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
742
+ if constexpr (!synclog_enable_cp_async) return;
743
+ if (!synclog_condition_emit()) return;
744
+ uint32_t* to = synclog_alloc(synclog_length_cp_async);
745
+ if (to == nullptr) return;
746
+ synclog_emit_prefix(to, synclog_header_cp_async, line);
747
+ to[synclog_length_prefix + 0] = smem_addr;
748
+ to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
749
+ to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
750
+ to[synclog_length_prefix + 3] = pred;
751
+ to[synclog_length_prefix + 4] = size;
752
+ #else
753
+ CUTLASS_UNUSED(line);
754
+ CUTLASS_UNUSED(smem_addr);
755
+ CUTLASS_UNUSED(gmem_ptr);
756
+ CUTLASS_UNUSED(pred);
757
+ CUTLASS_UNUSED(size);
758
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
759
+ }
760
+
761
+ CUTLASS_DEVICE
762
+ void synclog_emit_tma_load(
763
+ uint32_t line,
764
+ uint64_t gmem_int_desc,
765
+ uint32_t smem_int_mbar,
766
+ uint32_t smem_int_ptr) {
767
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
768
+ if constexpr (!synclog_enable_tma_load) return;
769
+ if (!synclog_condition_emit()) return;
770
+ uint32_t* to = synclog_alloc(synclog_length_tma_load);
771
+ if (to == nullptr) return;
772
+ synclog_emit_prefix(to, synclog_header_tma_load, line);
773
+ to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc);
774
+ to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32);
775
+ to[synclog_length_prefix + 2] = smem_int_mbar;
776
+ to[synclog_length_prefix + 3] = smem_int_ptr;
777
+ #else
778
+ CUTLASS_UNUSED(line);
779
+ CUTLASS_UNUSED(gmem_int_desc);
780
+ CUTLASS_UNUSED(smem_int_mbar);
781
+ CUTLASS_UNUSED(smem_int_ptr);
782
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
783
+ }
784
+
785
+ CUTLASS_DEVICE
786
+ void synclog_emit_tma_store(
787
+ uint32_t line,
788
+ uint64_t gmem_int_desc,
789
+ uint32_t smem_int_ptr) {
790
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
791
+ if constexpr (!synclog_enable_tma_store) return;
792
+ if (!synclog_condition_emit()) return;
793
+ uint32_t* to = synclog_alloc(synclog_length_tma_store);
794
+ if (to == nullptr) return;
795
+ synclog_emit_prefix(to, synclog_header_tma_store, line);
796
+ to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc);
797
+ to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32);
798
+ to[synclog_length_prefix + 2] = smem_int_ptr;
799
+ #else
800
+ CUTLASS_UNUSED(line);
801
+ CUTLASS_UNUSED(gmem_int_desc);
802
+ CUTLASS_UNUSED(smem_int_ptr);
803
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
804
+ }
805
+
806
+ CUTLASS_DEVICE
807
+ void synclog_emit_tma_store_arrive(uint32_t line) {
808
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
809
+ if constexpr (!synclog_enable_tma_store_arrive) return;
810
+ if (!synclog_condition_emit()) return;
811
+ uint32_t* to = synclog_alloc(synclog_length_tma_store_arrive);
812
+ if (to == nullptr) return;
813
+ synclog_emit_prefix(to, synclog_header_tma_store_arrive, line);
814
+ #else
815
+ CUTLASS_UNUSED(line);
816
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
817
+ }
818
+
819
+ CUTLASS_DEVICE
820
+ void synclog_emit_tma_store_wait(
821
+ uint32_t line,
822
+ uint32_t count) {
823
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
824
+ if constexpr (!synclog_enable_tma_store_wait) return;
825
+ if (!synclog_condition_emit()) return;
826
+ uint32_t* to = synclog_alloc(synclog_length_tma_store_wait);
827
+ if (to == nullptr) return;
828
+ synclog_emit_prefix(to, synclog_header_tma_store_wait, line);
829
+ to[synclog_length_prefix + 0] = count;
830
+ #else
831
+ CUTLASS_UNUSED(line);
832
+ CUTLASS_UNUSED(count);
833
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
834
+ }
835
+
836
+ CUTLASS_DEVICE
837
+ void synclog_emit_warpgroup_arrive(
838
+ uint32_t line) {
839
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
840
+ if constexpr (!synclog_enable_warpgroup_arrive) return;
841
+ if (!synclog_condition_emit()) return;
842
+ uint32_t* to = synclog_alloc(synclog_length_warpgroup_arrive);
843
+ if (to == nullptr) return;
844
+ synclog_emit_prefix(to, synclog_header_warpgroup_arrive, line);
845
+ #else
846
+ CUTLASS_UNUSED(line);
847
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
848
+ }
849
+
850
+ CUTLASS_DEVICE
851
+ void synclog_emit_warpgroup_wait(
852
+ uint32_t line,
853
+ uint32_t n) {
854
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
855
+ if constexpr (!synclog_enable_warpgroup_wait) return;
856
+ if (!synclog_condition_emit()) return;
857
+ uint32_t* to = synclog_alloc(synclog_length_warpgroup_wait);
858
+ if (to == nullptr) return;
859
+ synclog_emit_prefix(to, synclog_header_warpgroup_wait, line);
860
+ to[synclog_length_prefix + 0] = n;
861
+ #else
862
+ CUTLASS_UNUSED(line);
863
+ CUTLASS_UNUSED(n);
864
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
865
+ }
866
+
867
+ CUTLASS_DEVICE
868
+ void synclog_emit_warpgroup_commit_batch(
869
+ uint32_t line) {
870
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
871
+ if constexpr (!synclog_enable_warpgroup_commit_batch) return;
872
+ if (!synclog_condition_emit()) return;
873
+ uint32_t* to = synclog_alloc(synclog_length_warpgroup_commit_batch);
874
+ if (to == nullptr) return;
875
+ synclog_emit_prefix(to, synclog_header_warpgroup_commit_batch, line);
876
+ #else
877
+ CUTLASS_UNUSED(line);
878
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
879
+ }
880
+
881
+ CUTLASS_DEVICE
882
+ void synclog_emit_wgmma_reg_smem(
883
+ uint32_t line,
884
+ uint64_t desc_b) {
885
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
886
+ if constexpr (!synclog_enable_wgmma_reg_smem) return;
887
+ if (!synclog_condition_emit()) return;
888
+ uint32_t* to = synclog_alloc(synclog_length_wgmma_reg_smem);
889
+ if (to == nullptr) return;
890
+ synclog_emit_prefix(to, synclog_header_wgmma_reg_smem, line);
891
+ to[synclog_length_prefix + 0] = desc_b;
892
+ to[synclog_length_prefix + 1] = desc_b >> 32;
893
+ #else
894
+ CUTLASS_UNUSED(line);
895
+ CUTLASS_UNUSED(desc_b);
896
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
897
+ }
898
+
899
+ CUTLASS_DEVICE
900
+ void synclog_emit_wgmma_smem_smem(
901
+ uint32_t line,
902
+ uint64_t desc_a,
903
+ uint64_t desc_b) {
904
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
905
+ if constexpr (!synclog_enable_wgmma_smem_smem) return;
906
+ if (!synclog_condition_emit()) return;
907
+ uint32_t* to = synclog_alloc(synclog_length_wgmma_smem_smem);
908
+ if (to == nullptr) return;
909
+ synclog_emit_prefix(to, synclog_header_wgmma_smem_smem, line);
910
+ to[synclog_length_prefix + 0] = desc_a;
911
+ to[synclog_length_prefix + 1] = desc_a >> 32;
912
+ to[synclog_length_prefix + 2] = desc_b;
913
+ to[synclog_length_prefix + 3] = desc_b >> 32;
914
+ #else
915
+ CUTLASS_UNUSED(line);
916
+ CUTLASS_UNUSED(desc_a);
917
+ CUTLASS_UNUSED(desc_b);
918
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
919
+ }
920
+
921
+ CUTLASS_DEVICE
922
+ void synclog_emit_cpasync_barrier_arrive(
923
+ uint32_t line,
924
+ uint32_t smem_addr) {
925
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
926
+ if constexpr (!synclog_enable_cpasync_barrier_arrive) return;
927
+ if (!synclog_condition_emit()) return;
928
+ uint32_t* to = synclog_alloc(synclog_length_cpasync_barrier_arrive);
929
+ if (to == nullptr) return;
930
+ synclog_emit_prefix(to, synclog_header_cpasync_barrier_arrive, line);
931
+ to[synclog_length_prefix + 0] = smem_addr;
932
+ #else
933
+ CUTLASS_UNUSED(line);
934
+ CUTLASS_UNUSED(smem_addr);
935
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
936
+ }
937
+
938
+ #if !defined(CUTLASS_ENABLE_SYNCLOG)
939
+ CUTLASS_DEVICE
940
+ #elif defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
941
+ static __attribute__((__noinline__)) __device__
942
+ #else
943
+ static __attribute__((__noinline__))
944
+ #endif
945
+ void synclog_print() {
946
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
947
+ #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
948
+ if (synclog_buf == nullptr || !synclog_condition_print()) {
949
+ return;
950
+ }
951
+ printf("synclog start\n");
952
+ for (uint32_t at = 1; at < synclog_cap; ) {
953
+ uint32_t header = synclog_buf[at];
954
+ if (header == synclog_header_none) {
955
+ break;
956
+ }
957
+ printf("synclog at %u: ", at);
958
+ if constexpr (synclog_enable_syncthreads) {
959
+ if (header == synclog_header_syncthreads) {
960
+ synclog_print_prefix("syncthreads", at);
961
+ at += synclog_length_syncthreads;
962
+ printf("\n");
963
+ continue;
964
+ }
965
+ }
966
+ if constexpr (synclog_enable_syncwarp) {
967
+ if (header == synclog_header_syncwarp) {
968
+ synclog_print_prefix("syncwarp", at);
969
+ at += synclog_length_syncwarp;
970
+ printf("\n");
971
+ continue;
972
+ }
973
+ }
974
+ if constexpr (synclog_enable_named_barrier_arrive_and_wait) {
975
+ if (header == synclog_header_named_barrier_arrive_and_wait) {
976
+ synclog_print_prefix("named_barrier_arrive_and_wait", at);
977
+ at += synclog_length_named_barrier_arrive_and_wait;
978
+ printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
979
+ continue;
980
+ }
981
+ }
982
+ if constexpr (synclog_enable_named_barrier_arrive) {
983
+ if (header == synclog_header_named_barrier_arrive) {
984
+ synclog_print_prefix("named_barrier_arrive", at);
985
+ at += synclog_length_named_barrier_arrive;
986
+ printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
987
+ continue;
988
+ }
989
+ }
990
+ if constexpr (synclog_enable_cluster_barrier_init) {
991
+ if (header == synclog_header_cluster_barrier_init) {
992
+ synclog_print_prefix("cluster_barrier_init", at);
993
+ at += synclog_length_cluster_barrier_init;
994
+ printf("smem_addr=%u arrive_count=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
995
+ continue;
996
+ }
997
+ }
998
+ if constexpr (synclog_enable_cluster_barrier_wait) {
999
+ if (header == synclog_header_cluster_barrier_wait) {
1000
+ synclog_print_prefix("cluster_barrier_wait", at);
1001
+ at += synclog_length_cluster_barrier_wait;
1002
+ printf("smem_addr=%u phase=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
1003
+ continue;
1004
+ }
1005
+ }
1006
+ if constexpr (synclog_enable_cluster_barrier_test_wait) {
1007
+ if (header == synclog_header_cluster_barrier_test_wait) {
1008
+ synclog_print_prefix("cluster_barrier_test_wait", at);
1009
+ at += synclog_length_cluster_barrier_test_wait;
1010
+ printf("smem_addr=%u phase=%u pred=%u\n", synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
1011
+ continue;
1012
+ }
1013
+ }
1014
+ if constexpr (synclog_enable_cluster_barrier_try_wait) {
1015
+ if (header == synclog_header_cluster_barrier_try_wait) {
1016
+ synclog_print_prefix("cluster_barrier_try_wait", at);
1017
+ at += synclog_length_cluster_barrier_try_wait;
1018
+ printf("smem_addr=%u phase=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
1019
+ continue;
1020
+ }
1021
+ }
1022
+ if constexpr (synclog_enable_cluster_barrier_arrive_cluster) {
1023
+ if (header == synclog_header_cluster_barrier_arrive_cluster) {
1024
+ synclog_print_prefix("cluster_barrier_arrive_cluster", at);
1025
+ at += synclog_length_cluster_barrier_arrive_cluster;
1026
+ printf("smem_addr=%u cta_id=%u pred=%u\n", synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
1027
+ continue;
1028
+ }
1029
+ }
1030
+ if constexpr (synclog_enable_cluster_barrier_arrive) {
1031
+ if (header == synclog_header_cluster_barrier_arrive) {
1032
+ synclog_print_prefix("cluster_barrier_arrive", at);
1033
+ at += synclog_length_cluster_barrier_arrive;
1034
+ printf("smem_addr=%u\n", synclog_buf[at-1]);
1035
+ continue;
1036
+ }
1037
+ }
1038
+ if constexpr (synclog_enable_cluster_barrier_invalidate) {
1039
+ if (header == synclog_header_cluster_barrier_invalidate) {
1040
+ synclog_print_prefix("cluster_barrier_invalidate", at);
1041
+ at += synclog_length_cluster_barrier_invalidate;
1042
+ printf("smem_addr=%u\n", synclog_buf[at-1]);
1043
+ continue;
1044
+ }
1045
+ }
1046
+ if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) {
1047
+ if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx) {
1048
+ synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx", at);
1049
+ at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx;
1050
+ printf("smem_addr=%u transaction_bytes=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
1051
+ continue;
1052
+ }
1053
+ }
1054
+ if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
1055
+ if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
1056
+ synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx_cluster", at);
1057
+ at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster;
1058
+ printf("smem_addr=%u transaction_bytes=%u cta_id=%u pred=%u\n", synclog_buf[at-4], synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
1059
+ continue;
1060
+ }
1061
+ }
1062
+ if constexpr (synclog_enable_cluster_transaction_barrier_expect_transaction) {
1063
+ if (header == synclog_header_cluster_transaction_barrier_expect_transaction) {
1064
+ synclog_print_prefix("cluster_transaction_barrier_expect_transaction", at);
1065
+ at += synclog_length_cluster_transaction_barrier_expect_transaction;
1066
+ printf("smem_addr=%u transaction_bytes=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
1067
+ continue;
1068
+ }
1069
+ }
1070
+ if constexpr (synclog_enable_cluster_transaction_barrier_complete_transaction) {
1071
+ if (header == synclog_header_cluster_transaction_barrier_complete_transaction) {
1072
+ synclog_print_prefix("cluster_transaction_barrier_complete_transaction", at);
1073
+ at += synclog_length_cluster_transaction_barrier_complete_transaction;
1074
+ printf("smem_addr=%u dst_cta_id=%u transaction_bytes=%u pred=%u\n", synclog_buf[at-4], synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
1075
+ continue;
1076
+ }
1077
+ }
1078
+ if constexpr (synclog_enable_fence_barrier_init) {
1079
+ if (header == synclog_header_fence_barrier_init) {
1080
+ synclog_print_prefix("fence_barrier_init", at);
1081
+ at += synclog_length_fence_barrier_init;
1082
+ printf("\n");
1083
+ continue;
1084
+ }
1085
+ }
1086
+ if constexpr (synclog_enable_fence_view_async_shared) {
1087
+ if (header == synclog_header_fence_view_async_shared) {
1088
+ synclog_print_prefix("fence_view_async_shared", at);
1089
+ at += synclog_length_fence_view_async_shared;
1090
+ printf("\n");
1091
+ continue;
1092
+ }
1093
+ }
1094
+ if constexpr (synclog_enable_cp_async_wait) {
1095
+ if (header == synclog_header_cp_async_wait) {
1096
+ synclog_print_prefix("cp_async_wait", at);
1097
+ at += synclog_length_cp_async_wait;
1098
+ printf("n=%u\n", synclog_buf[at-1]);
1099
+ continue;
1100
+ }
1101
+ }
1102
+ if constexpr (synclog_enable_cp_async_wait_all) {
1103
+ if (header == synclog_header_cp_async_wait_all) {
1104
+ synclog_print_prefix("cp_async_wait_all", at);
1105
+ at += synclog_length_cp_async_wait_all;
1106
+ printf("\n");
1107
+ continue;
1108
+ }
1109
+ }
1110
+ if constexpr (synclog_enable_cp_async_fence) {
1111
+ if (header == synclog_header_cp_async_fence) {
1112
+ synclog_print_prefix("cp_async_fence", at);
1113
+ at += synclog_length_cp_async_fence;
1114
+ printf("\n");
1115
+ continue;
1116
+ }
1117
+ }
1118
+ if constexpr (synclog_enable_cp_async_nan) {
1119
+ if (header == synclog_header_cp_async_nan) {
1120
+ synclog_print_prefix("cp_async_nan", at);
1121
+ at += synclog_length_cp_async_nan;
1122
+ uint64_t gmem_addr = synclog_buf[at-3];
1123
+ gmem_addr += (uint64_t)synclog_buf[at-2] << 32;
1124
+ printf("smem_addr=%u gmem_addr=%llu pred=%u\n", synclog_buf[at-4], gmem_addr, synclog_buf[at-1]);
1125
+ continue;
1126
+ }
1127
+ }
1128
+ if constexpr (synclog_enable_cp_async_zfill) {
1129
+ if (header == synclog_header_cp_async_zfill) {
1130
+ synclog_print_prefix("cp_async_zfill", at);
1131
+ at += synclog_length_cp_async_zfill;
1132
+ uint64_t gmem_addr = synclog_buf[at-4];
1133
+ gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
1134
+ printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
1135
+ continue;
1136
+ }
1137
+ }
1138
+ if constexpr (synclog_enable_cp_async) {
1139
+ if (header == synclog_header_cp_async) {
1140
+ synclog_print_prefix("cp_async", at);
1141
+ at += synclog_length_cp_async;
1142
+ uint64_t gmem_addr = synclog_buf[at-4];
1143
+ gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
1144
+ printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
1145
+ continue;
1146
+ }
1147
+ }
1148
+ if constexpr (synclog_enable_tma_load) {
1149
+ if (header == synclog_header_tma_load) {
1150
+ synclog_print_prefix("tma_load", at);
1151
+ at += synclog_length_tma_load;
1152
+ uint64_t gmem_int_desc = synclog_buf[at-4];
1153
+ gmem_int_desc += (uint64_t)synclog_buf[at-3] << 32;
1154
+ printf("gmem_int_desc=%llu smem_int_mbar=%u smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-2], synclog_buf[at-1]);
1155
+ continue;
1156
+ }
1157
+ }
1158
+ if constexpr (synclog_enable_tma_store) {
1159
+ if (header == synclog_header_tma_store) {
1160
+ synclog_print_prefix("tma_store", at);
1161
+ at += synclog_length_tma_store;
1162
+ uint64_t gmem_int_desc = synclog_buf[at-3];
1163
+ gmem_int_desc += (uint64_t)synclog_buf[at-2] << 32;
1164
+ printf("gmem_int_desc=%llu smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-1]);
1165
+ continue;
1166
+ }
1167
+ }
1168
+ if constexpr (synclog_enable_tma_store_arrive) {
1169
+ if (header == synclog_header_tma_store_arrive) {
1170
+ synclog_print_prefix("tma_store_arrive", at);
1171
+ at += synclog_length_tma_store_arrive;
1172
+ printf("\n");
1173
+ continue;
1174
+ }
1175
+ }
1176
+ if constexpr (synclog_enable_tma_store_wait) {
1177
+ if (header == synclog_header_tma_store_wait) {
1178
+ synclog_print_prefix("tma_store_wait", at);
1179
+ at += synclog_length_tma_store_wait;
1180
+ printf("count=%u\n", synclog_buf[at-1]);
1181
+ continue;
1182
+ }
1183
+ }
1184
+ if constexpr (synclog_enable_warpgroup_arrive) {
1185
+ if (header == synclog_header_warpgroup_arrive) {
1186
+ synclog_print_prefix("warpgroup_arrive", at);
1187
+ at += synclog_length_warpgroup_arrive;
1188
+ printf("\n");
1189
+ continue;
1190
+ }
1191
+ }
1192
+ if constexpr (synclog_enable_warpgroup_wait) {
1193
+ if (header == synclog_header_warpgroup_wait) {
1194
+ synclog_print_prefix("warpgroup_wait", at);
1195
+ at += synclog_length_warpgroup_wait;
1196
+ printf("n=%u\n", synclog_buf[at-1]);
1197
+ continue;
1198
+ }
1199
+ }
1200
+ if constexpr (synclog_enable_warpgroup_commit_batch) {
1201
+ if (header == synclog_header_warpgroup_commit_batch) {
1202
+ synclog_print_prefix("warpgroup_commit_batch", at);
1203
+ at += synclog_length_warpgroup_commit_batch;
1204
+ printf("\n");
1205
+ continue;
1206
+ }
1207
+ }
1208
+ if constexpr (synclog_enable_wgmma_reg_smem) {
1209
+ if (header == synclog_header_wgmma_reg_smem) {
1210
+ synclog_print_prefix("wgmma_reg_smem", at);
1211
+ at += synclog_length_wgmma_reg_smem;
1212
+ synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
1213
+ printf("\n");
1214
+ continue;
1215
+ }
1216
+ }
1217
+ if constexpr (synclog_enable_wgmma_smem_smem) {
1218
+ if (header == synclog_header_wgmma_smem_smem) {
1219
+ synclog_print_prefix("wgmma_smem_smem", at);
1220
+ at += synclog_length_wgmma_smem_smem;
1221
+ synclog_print_wgmma_desc("desc_a", synclog_buf[at-4], synclog_buf[at-3], " ");
1222
+ synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
1223
+ printf("\n");
1224
+ continue;
1225
+ }
1226
+ }
1227
+ if constexpr (synclog_enable_cpasync_barrier_arrive) {
1228
+ if (header == synclog_header_cpasync_barrier_arrive) {
1229
+ synclog_print_prefix("cpasync_barrier_arrive", at);
1230
+ at += synclog_length_cpasync_barrier_arrive;
1231
+ printf("smem_addr=%u\n", synclog_buf[at-1]);
1232
+ continue;
1233
+ }
1234
+ }
1235
+ asm volatile ("brkpt;\n" ::);
1236
+ }
1237
+ if (synclog_buf[0] >= synclog_cap) {
1238
+ printf(
1239
+ "synclog was truncated (exceeded capacity of %lu bytes)\n",
1240
+ (synclog_cap - 1) * sizeof(uint32_t)
1241
+ );
1242
+ }
1243
+ printf("synclog end\n");
1244
+ #endif
1245
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
1246
+ }
1247
+
1248
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
1249
+
1250
+
1251
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
1252
+ #undef __syncthreads
1253
+ #define __syncthreads() do {\
1254
+ cutlass::arch::synclog_emit_syncthreads(__LINE__);\
1255
+ __syncthreads();\
1256
+ } while (0)
1257
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
1258
+
1259
+ #if defined(CUTLASS_ENABLE_SYNCLOG)
1260
+ #undef __syncwarp
1261
+ #define __syncwarp(...) do {\
1262
+ cutlass::arch::synclog_emit_syncwarp(__LINE__);\
1263
+ __syncwarp(__VA_ARGS__);\
1264
+ } while (0)
1265
+ #endif // defined(CUTLASS_ENABLE_SYNCLOG)
1266
+
1267
+
1268
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
1269
+
1270
+ } // namespace arch
1271
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma.h ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Templates exposing architecture support for warp matrix multiply-add (WMMA) operations
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #if (__CUDACC_VER_MAJOR__ >= 9)
38
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700))
39
+ #define CUTLASS_ARCH_WMMA_ENABLED
40
+ #define CUTLASS_ARCH_WMMA_SM70_ENABLED
41
+ #endif
42
+ #endif
43
+
44
+ #if (__CUDACC_VER_MAJOR__ >= 10)
45
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 720))
46
+ #define CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED
47
+ #define CUTLASS_ARCH_WMMA_SM72_ENABLED
48
+ #endif
49
+ #endif
50
+
51
+ #if (__CUDACC_VER_MAJOR__ >= 10)
52
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 750))
53
+ #define CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED
54
+ #define CUTLASS_ARCH_WMMA_SM75_ENABLED
55
+ #endif
56
+ #endif
57
+
58
+ #if defined(CUTLASS_ARCH_WMMA_ENABLED)
59
+
60
+ #include <mma.h>
61
+ #include "cutlass/arch/mma.h"
62
+ #include "cutlass/array.h"
63
+ #include "cutlass/numeric_types.h"
64
+ #include "cutlass/gemm/gemm.h"
65
+
66
+
67
+ /////////////////////////////////////////////////////////////////////////////////////////////////
68
+
69
+ namespace cutlass {
70
+ namespace arch {
71
+
72
+ ////////////////////////////////////////////////////////////////////////////////////////////////
73
+ /// Statically maps cutlass data types => nvcuda::wmma data types
74
+ /////////////////////////////////////////////////////////////////////////////////////////////////
75
+ template <typename Type_>
76
+ struct CutlassToWmmaDataType{
77
+ using Type = Type_;
78
+ };
79
+
80
+ /// Statically maps cutlass::half_t => __half
81
+ template<>
82
+ struct CutlassToWmmaDataType<cutlass::half_t> {
83
+ using Type = __half;
84
+ };
85
+
86
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
87
+ template<>
88
+ struct CutlassToWmmaDataType<cutlass::bfloat16_t> {
89
+ using Type = __nv_bfloat16;
90
+ };
91
+ #endif
92
+
93
+ /// Statically maps int8_t => char
94
+ template<>
95
+ struct CutlassToWmmaDataType<int8_t> {
96
+ using Type = signed char;
97
+ };
98
+
99
+ /// Statically maps uint8_t => char
100
+ template<>
101
+ struct CutlassToWmmaDataType<uint8_t> {
102
+ using Type = unsigned char;
103
+ };
104
+
105
+ /// Statically maps int32_t => int
106
+ template<>
107
+ struct CutlassToWmmaDataType<int32_t> {
108
+ using Type = int;
109
+ };
110
+
111
+ #if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED)
112
+ /// Statically maps cutlass::int4b_t => experimental::precision::s4
113
+ template<>
114
+ struct CutlassToWmmaDataType<cutlass::int4b_t> {
115
+ using Type = nvcuda::wmma::experimental::precision::s4;
116
+ };
117
+
118
+ /// Statically maps cutlass::uint4b_t => experimental::precision::s4
119
+ template<>
120
+ struct CutlassToWmmaDataType<cutlass::uint4b_t> {
121
+ using Type = nvcuda::wmma::experimental::precision::u4;
122
+ };
123
+
124
+ /// Statically maps cutlass::uint1b_t => experimental::precision::b1
125
+ template<>
126
+ struct CutlassToWmmaDataType<cutlass::uint1b_t> {
127
+ using Type = nvcuda::wmma::experimental::precision::b1;
128
+ };
129
+ #endif
130
+
131
+ ////////////////////////////////////////////////////////////////////////////////////////////////
132
+ /// Statically maps cutlass::layout => nvcuda::wmma layout tags
133
+ ////////////////////////////////////////////////////////////////////////////////////////////////
134
+ template <typename Layout_>
135
+ struct CutlassToWmmaLayout {
136
+ };
137
+
138
+ /// Statically maps cutlass::layout::RowMajor => nvcuda::wmma::row_major layout tags
139
+ template <>
140
+ struct CutlassToWmmaLayout<cutlass::layout::RowMajor> {
141
+ using Layout = nvcuda::wmma::row_major;
142
+ static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_row_major;
143
+ };
144
+
145
+ ////////////////////////////////////////////////////////////////////////////////////////////////
146
+ /// Statically maps cutlass::layout::RowMajor => nvcuda::wmma::row_major layout tags
147
+ ////////////////////////////////////////////////////////////////////////////////////////////////
148
+ template <>
149
+ struct CutlassToWmmaLayout<cutlass::layout::ColumnMajor> {
150
+ using Layout = nvcuda::wmma::col_major;
151
+ static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_col_major;
152
+ };
153
+ ////////////////////////////////////////////////////////////////////////////////////////////////
154
+
155
+ ////////////////////////////////////////////////////////////////////////////////////////////////
156
+ /// Statically maps nvcuda::wmma data types => cutlass data types
157
+ /////////////////////////////////////////////////////////////////////////////////////////////////
158
+ template <typename Type_>
159
+ struct WmmaToCutlassDataType{
160
+ using Type = Type_;
161
+ };
162
+
163
+ /// Statically maps __half => cutlass::half_t
164
+ template<>
165
+ struct WmmaToCutlassDataType<__half> {
166
+ using Type = cutlass::half_t;
167
+ };
168
+
169
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
170
+ template<>
171
+ struct WmmaToCutlassDataType<__nv_bfloat16> {
172
+ using Type = cutlass::bfloat16_t;
173
+ };
174
+ #endif
175
+
176
+ ////////////////////////////////////////////////////////////////////////////////////////////////
177
+
178
+ /////////////////////////////////////////////////////////////////////////////////////////////////
179
+ // WMMA template structure defines nvcuda::wmma::fragments and static assertion chaeks
180
+ // for a specific template parameterized data type (Element[A|B|C]), layout (Layout[A|B|C]),
181
+ // and native wmma size (Shape)
182
+ /////////////////////////////////////////////////////////////////////////////////////////////////
183
+ template <
184
+ typename Shape_, ///< Size of the matrix product (concept: GemmShape)
185
+ typename ElementA_, ///< Data type of A elements
186
+ typename LayoutA_, ///< Layout of A matrix (concept: MatrixLayout)
187
+ typename ElementB_, ///< Data type of B elements
188
+ typename LayoutB_, ///< Layout of B matrix (concept: MatrixLayout)
189
+ typename ElementC_, ///< Element type of C matrix
190
+ typename LayoutC_, /// Layout of C matrix (concept: MatrixLayout)
191
+ typename Operator_ = cutlass::arch::OpMultiplyAdd ///< Inner product operator (multiply-add, xor.popc)
192
+ >
193
+ struct Wmma;
194
+ /////////////////////////////////////////////////////////////////////////////////////////////////
195
+
196
+ } // namespace arch
197
+ } // namespace cutlass
198
+
199
+ /////////////////////////////////////////////////////////////////////////////////////////////////
200
+
201
+ //
202
+ // Specializations for each compute capability
203
+ //
204
+ #ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED
205
+ #include "cutlass/arch/wmma_sm70.h"
206
+ #endif
207
+
208
+ #ifdef CUTLASS_ARCH_WMMA_SM72_ENABLED
209
+ #include "cutlass/arch/wmma_sm72.h"
210
+ #endif
211
+
212
+ #ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED
213
+ #include "cutlass/arch/wmma_sm75.h"
214
+ #endif
215
+
216
+ /////////////////////////////////////////////////////////////////////////////////////////////////
217
+
218
+ #endif //CUTLASS_ARCH_WMMA_ENABLED
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm70.h ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+ #include "cutlass/cutlass.h"
37
+ #include CUDA_STD_HEADER(cassert)
38
+ #include "cutlass/layout/matrix.h"
39
+
40
+ ////////////////////////////////////////////////////////////////////////////////
41
+ namespace cutlass {
42
+ namespace arch {
43
+
44
+
45
+ ////////////////////////////////////////////////////////////////////////////////
46
+ //
47
+ // WMMA template structure defines nvcuda::wmma::fragments and static assert for
48
+ // wmma native instruction sizes supported for half
49
+ //
50
+ ////////////////////////////////////////////////////////////////////////////////
51
+ template <
52
+ typename Shape_,
53
+ typename LayoutA_,
54
+ typename LayoutB_,
55
+ typename ElementC_,
56
+ typename LayoutC_>
57
+ struct Wmma<
58
+ Shape_, ///< Size of the matrix product (concept: GemmShape)
59
+ cutlass::half_t, ///< ElementA
60
+ LayoutA_, ///< LayoutA
61
+ cutlass::half_t, ///< ElementB
62
+ LayoutB_, ///< LayoutB
63
+ ElementC_, ///< ElementC
64
+ LayoutC_, ///< LayoutC
65
+ cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc)
66
+ > {
67
+
68
+ #if defined(CUTLASS_ARCH_WMMA_SM70_ENABLED)
69
+ using Shape = Shape_;
70
+ using ElementA = cutlass::half_t;
71
+ using LayoutA = LayoutA_;
72
+ using ElementB = cutlass::half_t;
73
+ using LayoutB = LayoutB_;
74
+ using ElementC = ElementC_;
75
+ using LayoutC = LayoutC_;
76
+ using Operator = cutlass::arch::OpMultiplyAdd;
77
+ using ArchTag = arch::Sm70;
78
+
79
+ // check supported wmma shape for the given multiplicand data types
80
+ static_assert(
81
+ platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
82
+ platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
83
+ platform::is_same<cutlass::gemm::GemmShape<32, 8, 16>, Shape>::value,
84
+ "Supported list of wmma operator shape for f16 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
85
+
86
+ // check supported wmma output data type for the given multiplicand data types
87
+ static_assert(
88
+ platform::is_same<cutlass::half_t, ElementC>::value || platform::is_same<float, ElementC>::value,
89
+ "Supported of wmma output data type for f16 multiplicands are: f16 and f32");
90
+
91
+ // Wmma Fragment
92
+ using FragmentA = nvcuda::wmma::fragment<
93
+ nvcuda::wmma::matrix_a,
94
+ Shape::kM,
95
+ Shape::kN,
96
+ Shape::kK,
97
+ typename CutlassToWmmaDataType<ElementA>::Type,
98
+ typename CutlassToWmmaLayout<LayoutA>::Layout>;
99
+
100
+ using FragmentB = nvcuda::wmma::fragment<
101
+ nvcuda::wmma::matrix_b,
102
+ Shape::kM,
103
+ Shape::kN,
104
+ Shape::kK,
105
+ typename CutlassToWmmaDataType<ElementB>::Type,
106
+ typename CutlassToWmmaLayout<LayoutB>::Layout>;
107
+
108
+ using FragmentC = nvcuda::wmma::fragment<
109
+ nvcuda::wmma::accumulator,
110
+ Shape::kM,
111
+ Shape::kN,
112
+ Shape::kK,
113
+ typename CutlassToWmmaDataType<ElementC>::Type>;
114
+
115
+ /// Performs a nvcuda::wmma matrix multiply-accumulate operation
116
+ CUTLASS_DEVICE
117
+ void operator()(
118
+ FragmentC &D,
119
+ FragmentA const &A,
120
+ FragmentB const &B,
121
+ FragmentC const &C) const {
122
+
123
+ nvcuda::wmma::mma_sync(D, A, B, C);
124
+ }
125
+ #else
126
+ static_assert(false, "wmma.mma.sync for floating point multiplicands is available only for SM70 and beyond");
127
+ #endif
128
+
129
+ };
130
+
131
+ } // namespace arch
132
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm72.h ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+ #include "cutlass/cutlass.h"
37
+ #include CUDA_STD_HEADER(cassert)
38
+ #include "cutlass/layout/matrix.h"
39
+
40
+ ////////////////////////////////////////////////////////////////////////////////
41
+ namespace cutlass {
42
+ namespace arch {
43
+
44
+ ////////////////////////////////////////////////////////////////////////////////
45
+ //
46
+ // WMMA template structure defines nvcuda::wmma::fragments and static assert for
47
+ // wmma native instruction sizes supported for int8_t
48
+ //
49
+ ////////////////////////////////////////////////////////////////////////////////
50
+ template <
51
+ typename Shape_,
52
+ typename LayoutA_,
53
+ typename LayoutB_,
54
+ typename LayoutC_>
55
+ struct Wmma<
56
+ Shape_, ///< Size of the matrix product (concept: GemmShape)
57
+ int8_t, ///< ElementA
58
+ LayoutA_, ///< LayoutA
59
+ int8_t, ///< ElementB
60
+ LayoutB_, ///< LayoutB
61
+ int32_t, ///< ElementC
62
+ LayoutC_, ///< LayoutC
63
+ cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc)
64
+ > {
65
+ #if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED)
66
+ using Shape = Shape_;
67
+ using ElementA = int8_t;
68
+ using LayoutA = LayoutA_;
69
+ using ElementB = int8_t;
70
+ using LayoutB = LayoutB_;
71
+ using ElementC = int32_t;
72
+ using LayoutC = LayoutC_;
73
+ using Operator = cutlass::arch::OpMultiplyAdd;
74
+ using ArchTag = arch::Sm72;
75
+
76
+ // check supported wmma shape for the given multiplicand data types
77
+ static_assert(
78
+ platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
79
+ platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
80
+ platform::is_same<cutlass::gemm::GemmShape<32, 8, 16>, Shape>::value,
81
+ "Supported list of wmma operator shape for s8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
82
+
83
+
84
+ // Wmma Fragment
85
+ using FragmentA = nvcuda::wmma::fragment<
86
+ nvcuda::wmma::matrix_a,
87
+ Shape::kM,
88
+ Shape::kN,
89
+ Shape::kK,
90
+ typename CutlassToWmmaDataType<ElementA>::Type,
91
+ typename CutlassToWmmaLayout<LayoutA>::Layout>;
92
+
93
+ using FragmentB = nvcuda::wmma::fragment<
94
+ nvcuda::wmma::matrix_b,
95
+ Shape::kM,
96
+ Shape::kN,
97
+ Shape::kK,
98
+ typename CutlassToWmmaDataType<ElementB>::Type,
99
+ typename CutlassToWmmaLayout<LayoutB>::Layout>;
100
+
101
+ using FragmentC = nvcuda::wmma::fragment<
102
+ nvcuda::wmma::accumulator,
103
+ Shape::kM,
104
+ Shape::kN,
105
+ Shape::kK,
106
+ typename CutlassToWmmaDataType<ElementC>::Type>;
107
+
108
+ /// Performs a nvcuda::wmma matrix multiply-accumulate operation
109
+ CUTLASS_DEVICE
110
+ void operator()(
111
+ FragmentC &D,
112
+ FragmentA const &A,
113
+ FragmentB const &B,
114
+ FragmentC const &C) const {
115
+
116
+ nvcuda::wmma::mma_sync(D, A, B, C);
117
+ }
118
+
119
+ #else
120
+ static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM72 and beyond");
121
+ #endif
122
+
123
+ };
124
+
125
+ ////////////////////////////////////////////////////////////////////////////////
126
+ //
127
+ // WMMA template structure defines nvcuda::wmma::fragments and static assert for
128
+ // wmma native instruction sizes supported for uint8_t
129
+ //
130
+ ////////////////////////////////////////////////////////////////////////////////
131
+ template <
132
+ typename Shape_,
133
+ typename LayoutA_,
134
+ typename LayoutB_,
135
+ typename LayoutC_>
136
+ struct Wmma<
137
+ Shape_, ///< Size of the matrix product (concept: GemmShape)
138
+ uint8_t, ///< ElementA
139
+ LayoutA_, ///< LayoutA
140
+ uint8_t, ///< ElementB
141
+ LayoutB_, ///< LayoutB
142
+ int32_t, ///< ElementC
143
+ LayoutC_, ///< LayoutC
144
+ cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc)
145
+ > {
146
+ #if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED)
147
+ using Shape = Shape_;
148
+ using ElementA = uint8_t;
149
+ using LayoutA = LayoutA_;
150
+ using ElementB = uint8_t;
151
+ using LayoutB = LayoutB_;
152
+ using ElementC = int32_t;
153
+ using LayoutC = LayoutC_;
154
+ using Operator = cutlass::arch::OpMultiplyAdd;
155
+ using ArchTag = arch::Sm72;
156
+
157
+ // check supported wmma shape for the given multiplicand data types
158
+ static_assert(
159
+ platform::is_same<cutlass::gemm::GemmShape<16, 16, 16>, Shape>::value ||
160
+ platform::is_same<cutlass::gemm::GemmShape< 8, 32, 16>, Shape>::value ||
161
+ platform::is_same<cutlass::gemm::GemmShape<32, 8, 16>, Shape>::value,
162
+ "Supported list of wmma operator shape for u8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16");
163
+
164
+ // Wmma Fragment
165
+ using FragmentA = nvcuda::wmma::fragment<
166
+ nvcuda::wmma::matrix_a,
167
+ Shape::kM,
168
+ Shape::kN,
169
+ Shape::kK,
170
+ typename CutlassToWmmaDataType<ElementA>::Type,
171
+ typename CutlassToWmmaLayout<LayoutA>::Layout>;
172
+
173
+ using FragmentB = nvcuda::wmma::fragment<
174
+ nvcuda::wmma::matrix_b,
175
+ Shape::kM,
176
+ Shape::kN,
177
+ Shape::kK,
178
+ typename CutlassToWmmaDataType<ElementB>::Type,
179
+ typename CutlassToWmmaLayout<LayoutB>::Layout>;
180
+
181
+ using FragmentC = nvcuda::wmma::fragment<
182
+ nvcuda::wmma::accumulator,
183
+ Shape::kM,
184
+ Shape::kN,
185
+ Shape::kK,
186
+ typename CutlassToWmmaDataType<ElementC>::Type>;
187
+
188
+ /// Performs a nvcuda::wmma matrix multiply-accumulate operation
189
+ CUTLASS_DEVICE
190
+ void operator()(
191
+ FragmentC &D,
192
+ FragmentA const &A,
193
+ FragmentB const &B,
194
+ FragmentC const &C) const {
195
+
196
+ nvcuda::wmma::mma_sync(D, A, B, C);
197
+ }
198
+
199
+ #else
200
+ static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM72 and beyond");
201
+ #endif
202
+
203
+ };
204
+
205
+ } // namespace arch
206
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm75.h ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Matrix multiply
33
+ */
34
+
35
+ #pragma once
36
+ #include "cutlass/cutlass.h"
37
+ #include CUDA_STD_HEADER(cassert)
38
+ #include "cutlass/layout/matrix.h"
39
+
40
+ ////////////////////////////////////////////////////////////////////////////////
41
+ namespace cutlass {
42
+ namespace arch {
43
+
44
+ ////////////////////////////////////////////////////////////////////////////////
45
+ //
46
+ // WMMA template structure defines nvcuda::wmma::fragments and static assert for
47
+ // wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4).
48
+ //
49
+ ////////////////////////////////////////////////////////////////////////////////
50
+ template <
51
+ typename Shape_,
52
+ typename LayoutA_,
53
+ typename LayoutB_,
54
+ typename LayoutC_>
55
+ struct Wmma<
56
+ Shape_, ///< Size of the matrix product (concept: GemmShape)
57
+ cutlass::int4b_t, ///< ElementA
58
+ LayoutA_, ///< LayoutA
59
+ cutlass::int4b_t, ///< ElementB
60
+ LayoutB_, ///< LayoutB
61
+ int32_t, ///< ElementC
62
+ LayoutC_, ///< LayoutC
63
+ cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc)
64
+ > {
65
+ #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
66
+ using Shape = Shape_;
67
+ using ElementA = cutlass::int4b_t;
68
+ using LayoutA = LayoutA_;
69
+ using ElementB = cutlass::int4b_t;
70
+ using LayoutB = LayoutB_;
71
+ using ElementC = int32_t;
72
+ using LayoutC = LayoutC_;
73
+ using Operator = cutlass::arch::OpMultiplyAdd;
74
+ using ArchTag = arch::Sm75;
75
+
76
+ // check supported wmma shape for the given multiplicand data types
77
+ static_assert(
78
+ platform::is_same<cutlass::gemm::GemmShape<8, 8, 32>, Shape>::value,
79
+ "Supported list of wmma operator shape for s8 multiplicands is: 8x8x32");
80
+
81
+
82
+ // Wmma Fragment
83
+ using FragmentA = nvcuda::wmma::fragment<
84
+ nvcuda::wmma::matrix_a,
85
+ Shape::kM,
86
+ Shape::kN,
87
+ Shape::kK,
88
+ typename CutlassToWmmaDataType<ElementA>::Type,
89
+ typename CutlassToWmmaLayout<LayoutA>::Layout>;
90
+
91
+ using FragmentB = nvcuda::wmma::fragment<
92
+ nvcuda::wmma::matrix_b,
93
+ Shape::kM,
94
+ Shape::kN,
95
+ Shape::kK,
96
+ typename CutlassToWmmaDataType<ElementB>::Type,
97
+ typename CutlassToWmmaLayout<LayoutB>::Layout>;
98
+
99
+ using FragmentC = nvcuda::wmma::fragment<
100
+ nvcuda::wmma::accumulator,
101
+ Shape::kM,
102
+ Shape::kN,
103
+ Shape::kK,
104
+ typename CutlassToWmmaDataType<ElementC>::Type>;
105
+
106
+ /// Performs a nvcuda::wmma matrix multiply-accumulate operation
107
+ CUTLASS_DEVICE
108
+ void operator()(
109
+ FragmentC &D,
110
+ FragmentA const &A,
111
+ FragmentB const &B,
112
+ FragmentC const &C) const {
113
+ nvcuda::wmma::mma_sync(D, A, B, C);
114
+
115
+ }
116
+
117
+ #else
118
+ static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond");
119
+ #endif
120
+
121
+ };
122
+
123
+ ////////////////////////////////////////////////////////////////////////////////
124
+ //
125
+ // WMMA template structure defines nvcuda::wmma::fragments and static assert for
126
+ // wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1).
127
+ //
128
+ ////////////////////////////////////////////////////////////////////////////////
129
+ template <
130
+ typename Shape_,
131
+ typename LayoutA_,
132
+ typename LayoutB_,
133
+ typename LayoutC_>
134
+ struct Wmma<
135
+ Shape_, ///< Size of the matrix product (concept: GemmShape)
136
+ cutlass::uint1b_t, ///< ElementA
137
+ LayoutA_, ///< LayoutA
138
+ cutlass::uint1b_t, ///< ElementB
139
+ LayoutB_, ///< LayoutB
140
+ int32_t, ///< ElementC
141
+ LayoutC_, ///< LayoutC
142
+ cutlass::arch::OpXorPopc ///< Operator (multiply-add, xor.popc)
143
+ > {
144
+ #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
145
+ using Shape = Shape_;
146
+ using ElementA = cutlass::uint1b_t;
147
+ using LayoutA = LayoutA_;
148
+ using ElementB = cutlass::uint1b_t;
149
+ using LayoutB = LayoutB_;
150
+ using ElementC = int32_t;
151
+ using LayoutC = LayoutC_;
152
+ using Operator = cutlass::arch::OpXorPopc;
153
+ using ArchTag = arch::Sm75;
154
+
155
+ // check supported wmma shape for the given multiplicand data types
156
+ static_assert(
157
+ platform::is_same<cutlass::gemm::GemmShape<8, 8, 128>, Shape>::value,
158
+ "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128");
159
+
160
+
161
+ // Wmma Fragment
162
+ using FragmentA = nvcuda::wmma::fragment<
163
+ nvcuda::wmma::matrix_a,
164
+ Shape::kM,
165
+ Shape::kN,
166
+ Shape::kK,
167
+ typename CutlassToWmmaDataType<ElementA>::Type,
168
+ typename CutlassToWmmaLayout<LayoutA>::Layout>;
169
+
170
+ using FragmentB = nvcuda::wmma::fragment<
171
+ nvcuda::wmma::matrix_b,
172
+ Shape::kM,
173
+ Shape::kN,
174
+ Shape::kK,
175
+ typename CutlassToWmmaDataType<ElementB>::Type,
176
+ typename CutlassToWmmaLayout<LayoutB>::Layout>;
177
+
178
+ using FragmentC = nvcuda::wmma::fragment<
179
+ nvcuda::wmma::accumulator,
180
+ Shape::kM,
181
+ Shape::kN,
182
+ Shape::kK,
183
+ typename CutlassToWmmaDataType<ElementC>::Type>;
184
+
185
+ /// Performs a nvcuda::wmma matrix multiply-accumulate operation
186
+ CUTLASS_DEVICE
187
+ void operator()(
188
+ FragmentC &D,
189
+ FragmentA const &A,
190
+ FragmentB const &B,
191
+ FragmentC const &C) const {
192
+ nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
193
+ nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
194
+ }
195
+
196
+ #else
197
+ static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond");
198
+ #endif
199
+
200
+ };
201
+
202
+ } // namespace arch
203
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array.h ADDED
@@ -0,0 +1,2860 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
33
+ and is safe to use in a union.
34
+ */
35
+
36
+ #pragma once
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/functional.h"
39
+ #include "cutlass/numeric_types.h"
40
+ #include "cutlass/platform/platform.h"
41
+ namespace cutlass {
42
+
43
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ /// Statically sized array for any data type
46
+ template <
47
+ typename T,
48
+ int N,
49
+ bool RegisterSized = sizeof_bits<T>::value >= 32
50
+ >
51
+ struct Array;
52
+
53
+ namespace detail {
54
+
55
+ template<class T>
56
+ struct is_Array : platform::false_type {};
57
+
58
+ template <
59
+ typename T,
60
+ int N,
61
+ bool RegisterSized
62
+ >
63
+ struct is_Array<Array<T, N, RegisterSized> > : platform::true_type {};
64
+
65
+ template<typename T>
66
+ constexpr bool is_Array_v = is_Array<T>::value;
67
+
68
+ } // namespace detail
69
+
70
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
71
+
72
+ /// Defines the size of an Array<> in bits
73
+ template <typename T, int N, bool RegisterSized>
74
+ struct sizeof_bits<Array<T, N, RegisterSized> > {
75
+ static constexpr int value = sizeof(Array<T, N, RegisterSized>) * 8;
76
+ };
77
+
78
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
79
+
80
/// Returns true if the argument is a power of 2
CUTLASS_HOST_DEVICE
constexpr bool ispow2(unsigned x) {
  // A power of two has exactly one bit set, so clearing its lowest set bit
  // yields zero. Zero itself is not a power of two.
  return (x != 0u) && ((x & (x - 1u)) == 0u);
}
85
+
86
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
87
+
88
/// Returns the largest power of two not greater than the argument.
CUTLASS_HOST_DEVICE
constexpr unsigned floor_pow_2(unsigned x) {
  // Iteratively clear the lowest set bit until only the highest remains.
  // floor_pow_2(0) == 0, matching the original recursive formulation.
  unsigned highest = x;
  while (highest & (highest - 1u)) {
    highest &= highest - 1u;
  }
  return highest;
}
93
+
94
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
95
+
96
+ /// Statically sized array for any data type
97
+ template <
98
+ typename T,
99
+ int N
100
+ >
101
+ struct Array<T, N, true> {
102
+
103
+ /// Storage type
104
+ using Storage = T;
105
+
106
+ /// Element type
107
+ using Element = T;
108
+
109
+ /// Number of storage elements
110
+ //static std::size_t const kStorageElements = N;
111
+ static constexpr size_t kStorageElements = N;
112
+
113
+ /// Number of logical elements
114
+ static constexpr size_t kElements = N;
115
+
116
+ //
117
+ // C++ standard members
118
+ //
119
+
120
+ typedef T value_type;
121
+ typedef size_t size_type;
122
+ typedef ptrdiff_t difference_type;
123
+ typedef value_type &reference;
124
+ typedef value_type const & const_reference;
125
+ typedef value_type *pointer;
126
+ typedef value_type const * const_pointer;
127
+
128
+ //
129
+ // Iterators
130
+ //
131
+
132
+ /// Bidirectional iterator over elements
133
+ class iterator {
134
+
135
+ /// Pointer to object
136
+ T *ptr_;
137
+
138
+ public:
139
+
140
+ CUTLASS_HOST_DEVICE
141
+ iterator(): ptr_(nullptr) { }
142
+
143
+ CUTLASS_HOST_DEVICE
144
+ iterator(T *_ptr): ptr_(_ptr) { }
145
+
146
+ CUTLASS_HOST_DEVICE
147
+ iterator &operator++() {
148
+ ++ptr_;
149
+ return *this;
150
+ }
151
+
152
+ CUTLASS_HOST_DEVICE
153
+ iterator &operator--() {
154
+ --ptr_;
155
+ return *this;
156
+ }
157
+
158
+ CUTLASS_HOST_DEVICE
159
+ iterator operator++(int) {
160
+ iterator ret(*this);
161
+ ++ptr_;
162
+ return ret;
163
+ }
164
+
165
+ CUTLASS_HOST_DEVICE
166
+ iterator operator--(int) {
167
+ iterator ret(*this);
168
+ --ptr_;
169
+ return ret;
170
+ }
171
+
172
+ CUTLASS_HOST_DEVICE
173
+ T &operator*() const {
174
+ return *ptr_;
175
+ }
176
+
177
+ CUTLASS_HOST_DEVICE
178
+ bool operator==(iterator const &other) const {
179
+ return ptr_ == other.ptr_;
180
+ }
181
+
182
+ CUTLASS_HOST_DEVICE
183
+ bool operator!=(iterator const &other) const {
184
+ return ptr_ != other.ptr_;
185
+ }
186
+ };
187
+
188
+ /// Bidirectional constant iterator over elements
189
+ class const_iterator {
190
+
191
+ /// Pointer to object
192
+ const T *ptr_;
193
+
194
+ public:
195
+
196
+ CUTLASS_HOST_DEVICE
197
+ const_iterator(): ptr_(nullptr) { }
198
+
199
+ CUTLASS_HOST_DEVICE
200
+ const_iterator(T const *_ptr): ptr_(_ptr) { }
201
+
202
+ CUTLASS_HOST_DEVICE
203
+ const_iterator &operator++() {
204
+ ++ptr_;
205
+ return *this;
206
+ }
207
+
208
+ CUTLASS_HOST_DEVICE
209
+ const_iterator &operator--() {
210
+ --ptr_;
211
+ return *this;
212
+ }
213
+
214
+ CUTLASS_HOST_DEVICE
215
+ const_iterator operator++(int) {
216
+ const_iterator ret(*this);
217
+ ++ptr_;
218
+ return ret;
219
+ }
220
+
221
+ CUTLASS_HOST_DEVICE
222
+ const_iterator operator--(int) {
223
+ const_iterator ret(*this);
224
+ --ptr_;
225
+ return ret;
226
+ }
227
+
228
+ CUTLASS_HOST_DEVICE
229
+ T const &operator*() const {
230
+ return *ptr_;
231
+ }
232
+
233
+ CUTLASS_HOST_DEVICE
234
+ bool operator==(const_iterator const &other) const {
235
+ return ptr_ == other.ptr_;
236
+ }
237
+
238
+ CUTLASS_HOST_DEVICE
239
+ bool operator!=(const_iterator const &other) const {
240
+ return ptr_ != other.ptr_;
241
+ }
242
+ };
243
+
244
+ /// Bidirectional iterator over elements
245
+ class reverse_iterator {
246
+
247
+ /// Pointer to object
248
+ T *ptr_;
249
+
250
+ public:
251
+
252
+ CUTLASS_HOST_DEVICE
253
+ reverse_iterator(): ptr_(nullptr) { }
254
+
255
+ CUTLASS_HOST_DEVICE
256
+ reverse_iterator(T *_ptr): ptr_(_ptr) { }
257
+
258
+ CUTLASS_HOST_DEVICE
259
+ reverse_iterator &operator++() {
260
+ --ptr_;
261
+ return *this;
262
+ }
263
+
264
+ CUTLASS_HOST_DEVICE
265
+ reverse_iterator &operator--() {
266
+ ++ptr_;
267
+ return *this;
268
+ }
269
+
270
+ CUTLASS_HOST_DEVICE
271
+ reverse_iterator operator++(int) {
272
+ iterator ret(*this);
273
+ --ptr_;
274
+ return ret;
275
+ }
276
+
277
+ CUTLASS_HOST_DEVICE
278
+ reverse_iterator operator--(int) {
279
+ iterator ret(*this);
280
+ ++ptr_;
281
+ return ret;
282
+ }
283
+
284
+ CUTLASS_HOST_DEVICE
285
+ T &operator*() const {
286
+ return *(ptr_ - 1);
287
+ }
288
+
289
+ CUTLASS_HOST_DEVICE
290
+ bool operator==(reverse_iterator const &other) const {
291
+ return ptr_ == other.ptr_;
292
+ }
293
+
294
+ CUTLASS_HOST_DEVICE
295
+ bool operator!=(reverse_iterator const &other) const {
296
+ return ptr_ != other.ptr_;
297
+ }
298
+ };
299
+
300
+ /// Bidirectional constant iterator over elements
301
+ class const_reverse_iterator {
302
+
303
+ /// Pointer to object
304
+ T const *ptr_;
305
+
306
+ public:
307
+
308
+ CUTLASS_HOST_DEVICE
309
+ const_reverse_iterator(): ptr_(nullptr) { }
310
+
311
+ CUTLASS_HOST_DEVICE
312
+ const_reverse_iterator(T const *_ptr): ptr_(_ptr) { }
313
+
314
+ CUTLASS_HOST_DEVICE
315
+ const_reverse_iterator &operator++() {
316
+ --ptr_;
317
+ return *this;
318
+ }
319
+
320
+ CUTLASS_HOST_DEVICE
321
+ const_reverse_iterator &operator--() {
322
+ ++ptr_;
323
+ return *this;
324
+ }
325
+
326
+ CUTLASS_HOST_DEVICE
327
+ const_reverse_iterator operator++(int) {
328
+ const_reverse_iterator ret(*this);
329
+ --ptr_;
330
+ return ret;
331
+ }
332
+
333
+ CUTLASS_HOST_DEVICE
334
+ const_reverse_iterator operator--(int) {
335
+ const_reverse_iterator ret(*this);
336
+ ++ptr_;
337
+ return ret;
338
+ }
339
+
340
+ CUTLASS_HOST_DEVICE
341
+ T const &operator*() const {
342
+ return *(ptr_ - 1);
343
+ }
344
+
345
+ CUTLASS_HOST_DEVICE
346
+ bool operator==(const_iterator const &other) const {
347
+ return ptr_ == other.ptr_;
348
+ }
349
+
350
+ CUTLASS_HOST_DEVICE
351
+ bool operator!=(const_iterator const &other) const {
352
+ return ptr_ != other.ptr_;
353
+ }
354
+ };
355
+
356
+ /// Internal storage
357
+ Storage storage[kElements];
358
+
359
+ /// Efficient clear method
360
+ CUTLASS_HOST_DEVICE
361
+ void clear() {
362
+ fill(T(0));
363
+ }
364
+
365
+ CUTLASS_HOST_DEVICE
366
+ reference at(size_type pos) {
367
+ return reinterpret_cast<reference>(storage[pos]);
368
+ }
369
+
370
+ CUTLASS_HOST_DEVICE
371
+ const_reference at(size_type pos) const {
372
+ return reinterpret_cast<const_reference>(storage[pos]);
373
+ }
374
+
375
+ CUTLASS_HOST_DEVICE
376
+ reference operator[](size_type pos) {
377
+ return reinterpret_cast<reference>(storage[pos]);
378
+ }
379
+
380
+ CUTLASS_HOST_DEVICE
381
+ const_reference operator[](size_type pos) const {
382
+ return reinterpret_cast<const_reference>(storage[pos]);
383
+ }
384
+
385
+ CUTLASS_HOST_DEVICE
386
+ reference front() {
387
+ return reinterpret_cast<reference>(storage[0]);
388
+ }
389
+
390
+ CUTLASS_HOST_DEVICE
391
+ const_reference front() const {
392
+ return reinterpret_cast<const_reference>(storage[0]);
393
+ }
394
+
395
+ CUTLASS_HOST_DEVICE
396
+ reference back() {
397
+ return reinterpret_cast<reference>(storage[kStorageElements - 1]);
398
+ }
399
+
400
+ CUTLASS_HOST_DEVICE
401
+ const_reference back() const {
402
+ return reinterpret_cast<const_reference>(storage[kStorageElements - 1]);
403
+ }
404
+
405
+ CUTLASS_HOST_DEVICE
406
+ pointer data() {
407
+ return reinterpret_cast<pointer>(storage);
408
+ }
409
+
410
+ CUTLASS_HOST_DEVICE
411
+ const_pointer data() const {
412
+ return reinterpret_cast<const_pointer>(storage);
413
+ }
414
+
415
+ CUTLASS_HOST_DEVICE
416
+ pointer raw_data() {
417
+ return reinterpret_cast<pointer>(storage);
418
+ }
419
+
420
+ CUTLASS_HOST_DEVICE
421
+ const_pointer raw_data() const {
422
+ return reinterpret_cast<const_pointer>(storage);
423
+ }
424
+
425
+
426
+ CUTLASS_HOST_DEVICE
427
+ constexpr bool empty() const {
428
+ return !kElements;
429
+ }
430
+
431
+ CUTLASS_HOST_DEVICE
432
+ constexpr size_type size() const {
433
+ return kElements;
434
+ }
435
+
436
+ CUTLASS_HOST_DEVICE
437
+ constexpr size_type max_size() const {
438
+ return kElements;
439
+ }
440
+
441
+ CUTLASS_HOST_DEVICE
442
+ void fill(T const &value) {
443
+ CUTLASS_PRAGMA_UNROLL
444
+ for (int i = 0; i < int(kElements); ++i) {
445
+ storage[i] = static_cast<Storage>(value);
446
+ }
447
+ }
448
+
449
+ CUTLASS_HOST_DEVICE
450
+ iterator begin() {
451
+ return iterator(storage);
452
+ }
453
+
454
+ CUTLASS_HOST_DEVICE
455
+ const_iterator begin() const {
456
+ return cbegin();
457
+ }
458
+
459
+ CUTLASS_HOST_DEVICE
460
+ const_iterator cbegin() const {
461
+ return const_iterator(storage);
462
+ }
463
+
464
+ CUTLASS_HOST_DEVICE
465
+ iterator end() {
466
+ return iterator(reinterpret_cast<pointer>(storage + kStorageElements));
467
+ }
468
+
469
+ CUTLASS_HOST_DEVICE
470
+ const_iterator end() const {
471
+ return cend();
472
+ }
473
+
474
+ CUTLASS_HOST_DEVICE
475
+ const_iterator cend() const {
476
+ return const_iterator(reinterpret_cast<const_pointer>(storage + kStorageElements));
477
+ }
478
+
479
+ CUTLASS_HOST_DEVICE
480
+ reverse_iterator rbegin() {
481
+ return reverse_iterator(reinterpret_cast<pointer>(storage + kStorageElements));
482
+ }
483
+
484
+ CUTLASS_HOST_DEVICE
485
+ const_reverse_iterator rbegin() const {
486
+ return crbegin();
487
+ }
488
+
489
+ CUTLASS_HOST_DEVICE
490
+ const_reverse_iterator crbegin() const {
491
+ return const_reverse_iterator(reinterpret_cast<const_pointer>(storage + kStorageElements));
492
+ }
493
+
494
+ CUTLASS_HOST_DEVICE
495
+ reverse_iterator rend() {
496
+ return reverse_iterator(reinterpret_cast<pointer>(storage));
497
+ }
498
+
499
+ CUTLASS_HOST_DEVICE
500
+ const_reverse_iterator rend() const {
501
+ return crend();
502
+ }
503
+
504
+ CUTLASS_HOST_DEVICE
505
+ const_reverse_iterator crend() const {
506
+ return const_reverse_iterator(reinterpret_cast<const_pointer>(storage));
507
+ }
508
+
509
+ //
510
+ // Comparison operators
511
+ //
512
+
513
+ };
514
+
515
+
516
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
517
+ // Factories
518
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
519
+
520
+ template <typename Element>
521
+ CUTLASS_HOST_DEVICE
522
+ Array<Element, 1> make_Array(Element x) {
523
+ return {x};
524
+ }
525
+
526
+ template <typename Element>
527
+ CUTLASS_HOST_DEVICE
528
+ Array<Element, 2> make_Array(Element x, Element y) {
529
+ return {x,y};
530
+ }
531
+
532
+ template <typename Element>
533
+ CUTLASS_HOST_DEVICE
534
+ Array<Element, 3> make_Array(Element x, Element y, Element z) {
535
+ return {x,y,z};
536
+ }
537
+
538
+ template <typename Element>
539
+ CUTLASS_HOST_DEVICE
540
+ Array<Element, 4> make_Array(Element x, Element y, Element z, Element w) {
541
+ return {x,y,z,w};
542
+ }
543
+
544
+
545
+ /////////////////////////////////////////////////////////////////////////////////////////////////
546
+ // functional.h numeric specializations
547
+ /////////////////////////////////////////////////////////////////////////////////////////////////
548
+
549
+ template <typename T, int N>
550
+ struct absolute_value_op< Array<T, N> > {
551
+
552
+ CUTLASS_HOST_DEVICE
553
+ Array<T, N> operator()(Array<T, N> const &lhs) const {
554
+
555
+ Array<T, N> result;
556
+ absolute_value_op<T> scalar_op;
557
+
558
+ CUTLASS_PRAGMA_UNROLL
559
+ for (int i = 0; i < N; ++i) {
560
+ result[i] = scalar_op(lhs[i]);
561
+ }
562
+
563
+ return result;
564
+ }
565
+ };
566
+
567
+ template <typename T, int N>
568
+ struct plus<Array<T, N>> {
569
+ CUTLASS_HOST_DEVICE
570
+ Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
571
+
572
+ Array<T, N> result;
573
+ plus<T> scalar_op;
574
+
575
+ CUTLASS_PRAGMA_UNROLL
576
+ for (int i = 0; i < N; ++i) {
577
+ result[i] = scalar_op(lhs[i], rhs[i]);
578
+ }
579
+
580
+ return result;
581
+ }
582
+
583
+ CUTLASS_HOST_DEVICE
584
+ Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
585
+
586
+ Array<T, N> result;
587
+ plus<T> scalar_op;
588
+
589
+ CUTLASS_PRAGMA_UNROLL
590
+ for (int i = 0; i < N; ++i) {
591
+ result[i] = scalar_op(lhs[i], scalar);
592
+ }
593
+
594
+ return result;
595
+ }
596
+
597
+ CUTLASS_HOST_DEVICE
598
+ Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
599
+
600
+ Array<T, N> result;
601
+ plus<T> scalar_op;
602
+
603
+ CUTLASS_PRAGMA_UNROLL
604
+ for (int i = 0; i < N; ++i) {
605
+ result[i] = scalar_op(scalar, rhs[i]);
606
+ }
607
+
608
+ return result;
609
+ }
610
+ };
611
+ template <typename T, int N>
612
+ struct minus<Array<T, N>> {
613
+
614
+ CUTLASS_HOST_DEVICE
615
+ Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
616
+
617
+ Array<T, N> result;
618
+ minus<T> scalar_op;
619
+
620
+ CUTLASS_PRAGMA_UNROLL
621
+ for (int i = 0; i < N; ++i) {
622
+ result[i] = scalar_op(lhs[i], rhs[i]);
623
+ }
624
+
625
+ return result;
626
+ }
627
+
628
+ CUTLASS_HOST_DEVICE
629
+ Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
630
+
631
+ Array<T, N> result;
632
+ minus<T> scalar_op;
633
+
634
+ CUTLASS_PRAGMA_UNROLL
635
+ for (int i = 0; i < N; ++i) {
636
+ result[i] = scalar_op(lhs[i], scalar);
637
+ }
638
+
639
+ return result;
640
+ }
641
+
642
+ CUTLASS_HOST_DEVICE
643
+ Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
644
+
645
+ Array<T, N> result;
646
+ minus<T> scalar_op;
647
+
648
+ CUTLASS_PRAGMA_UNROLL
649
+ for (int i = 0; i < N; ++i) {
650
+ result[i] = scalar_op(scalar, rhs[i]);
651
+ }
652
+
653
+ return result;
654
+ }
655
+ };
656
+
657
+ template <typename T, int N>
658
+ struct multiplies<Array<T, N>> {
659
+
660
+ CUTLASS_HOST_DEVICE
661
+ Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
662
+
663
+ Array<T, N> result;
664
+ multiplies<T> scalar_op;
665
+
666
+ CUTLASS_PRAGMA_UNROLL
667
+ for (int i = 0; i < N; ++i) {
668
+ result[i] = scalar_op(lhs[i], rhs[i]);
669
+ }
670
+
671
+ return result;
672
+ }
673
+
674
+ CUTLASS_HOST_DEVICE
675
+ Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
676
+
677
+ Array<T, N> result;
678
+ multiplies<T> scalar_op;
679
+
680
+ CUTLASS_PRAGMA_UNROLL
681
+ for (int i = 0; i < N; ++i) {
682
+ result[i] = scalar_op(lhs[i], scalar);
683
+ }
684
+
685
+ return result;
686
+ }
687
+
688
+ CUTLASS_HOST_DEVICE
689
+ Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
690
+
691
+ Array<T, N> result;
692
+ multiplies<T> scalar_op;
693
+
694
+ CUTLASS_PRAGMA_UNROLL
695
+ for (int i = 0; i < N; ++i) {
696
+ result[i] = scalar_op(scalar, rhs[i]);
697
+ }
698
+
699
+ return result;
700
+ }
701
+ };
702
+
703
+ template <typename T, int N, bool PropogateNaN>
704
+ struct maximum_absolute_value_reduction<Array<T, N>, PropogateNaN> {
705
+
706
+ CUTLASS_HOST_DEVICE
707
+ T operator() (T const& scalar, Array<T, N> const& rhs) const {
708
+
709
+ T result = scalar;
710
+ maximum_absolute_value_reduction<T, PropogateNaN> scalar_op;
711
+
712
+ CUTLASS_PRAGMA_UNROLL
713
+ for (int i = 0; i < N; ++i) {
714
+ result = scalar_op(result, rhs[i]);
715
+ }
716
+
717
+ return result;
718
+ }
719
+ };
720
+
721
+ template <typename T, int N>
722
+ struct scale<Array<T, N>> {
723
+ T const scaling_factor_;
724
+
725
+ CUTLASS_HOST_DEVICE
726
+ scale(T scaling_factor) : scaling_factor_(scaling_factor) {
727
+ }
728
+
729
+ CUTLASS_HOST_DEVICE
730
+ Array<T, N> operator()(Array<T, N> const & rhs) const {
731
+ Array<T, N> result;
732
+
733
+ CUTLASS_PRAGMA_UNROLL
734
+ for (int i = 0; i < N; ++i) {
735
+ result[i] = rhs[i] * scaling_factor_;
736
+ }
737
+
738
+ return result;
739
+ }
740
+ };
741
+
742
+ template <typename T, int N>
743
+ struct divides<Array<T, N>> {
744
+
745
+ CUTLASS_HOST_DEVICE
746
+ Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
747
+
748
+ Array<T, N> result;
749
+ divides<T> scalar_op;
750
+
751
+ CUTLASS_PRAGMA_UNROLL
752
+ for (int i = 0; i < N; ++i) {
753
+ result[i] = scalar_op(lhs[i], rhs[i]);
754
+ }
755
+
756
+ return result;
757
+ }
758
+
759
+ CUTLASS_HOST_DEVICE
760
+ Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
761
+
762
+ Array<T, N> result;
763
+ divides<T> scalar_op;
764
+
765
+ CUTLASS_PRAGMA_UNROLL
766
+ for (int i = 0; i < N; ++i) {
767
+ result[i] = scalar_op(lhs[i], scalar);
768
+ }
769
+
770
+ return result;
771
+ }
772
+
773
+ CUTLASS_HOST_DEVICE
774
+ Array<T, N> operator()( T const &scalar, Array<T, N> const &rhs) const {
775
+
776
+ Array<T, N> result;
777
+ divides<T> scalar_op;
778
+
779
+ CUTLASS_PRAGMA_UNROLL
780
+ for (int i = 0; i < N; ++i) {
781
+ result[i] = scalar_op(scalar, rhs[i]);
782
+ }
783
+
784
+ return result;
785
+ }
786
+ };
787
+
788
+ template <typename T, int N>
789
+ struct reciprocal_approximate<Array<T, N>> {
790
+
791
+ CUTLASS_HOST_DEVICE
792
+ Array<T, N> operator()(Array<T, N> const &lhs) const {
793
+
794
+ Array<T, N> result;
795
+ reciprocal_approximate<T> scalar_op;
796
+
797
+ CUTLASS_PRAGMA_UNROLL
798
+ for (int i = 0; i < N; ++i) {
799
+ result[i] = scalar_op(lhs[i]);
800
+ }
801
+
802
+ return result;
803
+ }
804
+ };
805
+
806
+ template <typename T, int N>
807
+ struct reciprocal_approximate_ftz<Array<T, N>> {
808
+
809
+ CUTLASS_HOST_DEVICE
810
+ Array<T, N> operator()(Array<T, N> const &lhs) const {
811
+
812
+ Array<T, N> result;
813
+ reciprocal_approximate_ftz<T> scalar_op;
814
+
815
+ CUTLASS_PRAGMA_UNROLL
816
+ for (int i = 0; i < N; ++i) {
817
+ result[i] = scalar_op(lhs[i]);
818
+ }
819
+
820
+ return result;
821
+ }
822
+ };
823
+
824
+ template <typename T, int N, bool PropagateNaN>
825
+ struct maximum<Array<T, N>, PropagateNaN> {
826
+
827
+ CUTLASS_HOST_DEVICE
828
+ Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
829
+
830
+ Array<T, N> result;
831
+ maximum<T, PropagateNaN> scalar_op;
832
+
833
+ CUTLASS_PRAGMA_UNROLL
834
+ for (int i = 0; i < N; ++i) {
835
+ result[i] = scalar_op(lhs[i], rhs[i]);
836
+ }
837
+
838
+ return result;
839
+ }
840
+
841
+ CUTLASS_HOST_DEVICE
842
+ Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
843
+
844
+ Array<T, N> result;
845
+ maximum<T, PropagateNaN> scalar_op;
846
+
847
+ CUTLASS_PRAGMA_UNROLL
848
+ for (int i = 0; i < N; ++i) {
849
+ result[i] = scalar_op(lhs[i], scalar);
850
+ }
851
+
852
+ return result;
853
+ }
854
+
855
+ CUTLASS_HOST_DEVICE
856
+ Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
857
+
858
+ Array<T, N> result;
859
+ maximum<T, PropagateNaN> scalar_op;
860
+
861
+ CUTLASS_PRAGMA_UNROLL
862
+ for (int i = 0; i < N; ++i) {
863
+ result[i] = scalar_op(scalar, rhs[i]);
864
+ }
865
+
866
+ return result;
867
+ }
868
+ };
869
+
870
+ template <typename T, int N, bool PropagateNaN>
871
+ struct minimum<Array<T, N>, PropagateNaN> {
872
+
873
+ CUTLASS_HOST_DEVICE
874
+ static T scalar_op(T const &lhs, T const &rhs) {
875
+ return (rhs < lhs ? rhs : lhs);
876
+ }
877
+
878
+ CUTLASS_HOST_DEVICE
879
+ Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
880
+
881
+ Array<T, N> result;
882
+ minimum<T, PropagateNaN> scalar_op;
883
+
884
+ CUTLASS_PRAGMA_UNROLL
885
+ for (int i = 0; i < N; ++i) {
886
+ result[i] = scalar_op(lhs[i], rhs[i]);
887
+ }
888
+
889
+ return result;
890
+ }
891
+
892
+ CUTLASS_HOST_DEVICE
893
+ Array<T, N> operator()(Array<T, N> const &lhs, T const &scalar) const {
894
+
895
+ Array<T, N> result;
896
+ minimum<T, PropagateNaN> scalar_op;
897
+
898
+ CUTLASS_PRAGMA_UNROLL
899
+ for (int i = 0; i < N; ++i) {
900
+ result[i] = scalar_op(lhs[i], scalar);
901
+ }
902
+
903
+ return result;
904
+ }
905
+
906
+ CUTLASS_HOST_DEVICE
907
+ Array<T, N> operator()(T const &scalar, Array<T, N> const &rhs) const {
908
+
909
+ Array<T, N> result;
910
+ minimum<T, PropagateNaN> scalar_op;
911
+
912
+ CUTLASS_PRAGMA_UNROLL
913
+ for (int i = 0; i < N; ++i) {
914
+ result[i] = scalar_op(scalar, rhs[i]);
915
+ }
916
+
917
+ return result;
918
+ }
919
+ };
920
+
921
+ template <typename T, int N>
922
+ struct minimum_with_nan_propagation<Array<T, N>> : minimum<Array<T, N>, true>
923
+ {};
924
+
925
+ template <typename T, int N>
926
+ struct negate<Array<T, N>> {
927
+
928
+ CUTLASS_HOST_DEVICE
929
+ Array<T, N> operator()(Array<T, N> const &lhs) const {
930
+
931
+ Array<T, N> result;
932
+ negate<T> scalar_op;
933
+
934
+ CUTLASS_PRAGMA_UNROLL
935
+ for (int i = 0; i < N; ++i) {
936
+ result[i] = scalar_op(lhs[i]);
937
+ }
938
+
939
+ return result;
940
+ }
941
+ };
942
+
943
+ /// Fused multiply-add
944
+ template <typename T, int N>
945
+ struct multiply_add<Array<T, N>, Array<T, N>, Array<T, N>> {
946
+
947
+ CUTLASS_HOST_DEVICE
948
+ Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
949
+
950
+ Array<T, N> result;
951
+ multiply_add<T> scalar_op;
952
+
953
+ CUTLASS_PRAGMA_UNROLL
954
+ for (int i = 0; i < N; ++i) {
955
+ result[i] = scalar_op(a[i], b[i], c[i]);
956
+ }
957
+
958
+ return result;
959
+ }
960
+
961
+ CUTLASS_HOST_DEVICE
962
+ Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
963
+
964
+ Array<T, N> result;
965
+ multiply_add<T> scalar_op;
966
+
967
+ CUTLASS_PRAGMA_UNROLL
968
+ for (int i = 0; i < N; ++i) {
969
+ result[i] = scalar_op(a[i], scalar, c[i]);
970
+ }
971
+
972
+ return result;
973
+ }
974
+
975
+ CUTLASS_HOST_DEVICE
976
+ Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
977
+
978
+ Array<T, N> result;
979
+ multiply_add<T> scalar_op;
980
+
981
+ CUTLASS_PRAGMA_UNROLL
982
+ for (int i = 0; i < N; ++i) {
983
+ result[i] = scalar_op(scalar, b[i], c[i]);
984
+ }
985
+
986
+ return result;
987
+ }
988
+
989
+ CUTLASS_HOST_DEVICE
990
+ Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, T const &scalar) const {
991
+
992
+ Array<T, N> result;
993
+ multiply_add<T> scalar_op;
994
+
995
+ CUTLASS_PRAGMA_UNROLL
996
+ for (int i = 0; i < N; ++i) {
997
+ result[i] = scalar_op(a[i], b[i], scalar);
998
+ }
999
+
1000
+ return result;
1001
+ }
1002
+
1003
+
1004
+ CUTLASS_HOST_DEVICE
1005
+ Array<T, N> operator()(Array<T, N> const &a, T const &scalar_b, T const &scalar_c) const {
1006
+
1007
+ Array<T, N> result;
1008
+ multiply_add<T> scalar_op;
1009
+
1010
+ CUTLASS_PRAGMA_UNROLL
1011
+ for (int i = 0; i < N; ++i) {
1012
+ result[i] = scalar_op(a[i], scalar_b, scalar_c);
1013
+ }
1014
+
1015
+ return result;
1016
+ }
1017
+ };
1018
+
1019
+ /// Fused square-and-plus
1020
+ template <typename T, int N>
1021
+ struct square_and_plus<Array<T, N>> {
1022
+
1023
+ CUTLASS_HOST_DEVICE
1024
+ Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
1025
+ multiply_add<Array<T, N>, Array<T, N>, Array<T, N>> ma_op;
1026
+ return ma_op(rhs, rhs, lhs);
1027
+ }
1028
+
1029
+ CUTLASS_HOST_DEVICE
1030
+ Array<T, N> operator()(Array<T, N> const &lhs, T const &rhs) const {
1031
+ plus<Array<T, N>> plus_op;
1032
+ multiplies<T> multiplies_op;
1033
+ return plus_op(multiplies_op(rhs, rhs), lhs);
1034
+ }
1035
+ };
1036
+
1037
+ /// Inverse-square-root
1038
+ template <typename T, int N>
1039
+ struct inverse_square_root<Array<T, N>> {
1040
+ CUTLASS_HOST_DEVICE
1041
+ Array<T, N> operator()(Array<T, N> const &a) const {
1042
+ Array<T, N> result;
1043
+ inverse_square_root<T> scalar_op;
1044
+
1045
+ CUTLASS_PRAGMA_UNROLL
1046
+ for (int i = 0; i < N; ++i) {
1047
+ result[i] = scalar_op(a[i]);
1048
+ }
1049
+ return result;
1050
+ }
1051
+ };
1052
+
1053
/// Inverse-square-root specialized for half_t: on SM53+ device code, arrays
/// are reinterpreted as packed __half2 vectors so two elements are processed
/// per h2rsqrt instruction; host / older-arch code falls back to the scalar
/// functor.
template <int N>
struct inverse_square_root<Array<half_t, N>> {
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & a) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    // View input and output as packed pairs of halves.
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = h2rsqrt(a_ptr[i]);
    }

    // Odd N: the final element does not form a pair; handle it with the
    // scalar intrinsic.
    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
      __half d_residual = hrsqrt(a_residual_ptr[N - 1]);
      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    // Fallback: elementwise scalar inverse-square-root.
    inverse_square_root<half_t> scalar_op;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = scalar_op(a[i]);
    }

#endif

    return result;
  }
};
1088
+
1089
+ /// Fused multiply-add-relu0
1090
+ template <typename T, int N>
1091
+ struct multiply_add_relu0<Array<T, N>, Array<T, N>, Array<T, N>> {
1092
+
1093
+ CUTLASS_HOST_DEVICE
1094
+ Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
1095
+
1096
+ Array<T, N> result;
1097
+ multiply_add<T> scalar_op;
1098
+ maximum<T> mx;
1099
+
1100
+ CUTLASS_PRAGMA_UNROLL
1101
+ for (int i = 0; i < N; ++i) {
1102
+ result[i] = mx(scalar_op(a[i], b[i], c[i]), T(0));
1103
+ }
1104
+
1105
+ return result;
1106
+ }
1107
+
1108
+ CUTLASS_HOST_DEVICE
1109
+ Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
1110
+
1111
+ Array<T, N> result;
1112
+ multiply_add<T> scalar_op;
1113
+ maximum<T> mx;
1114
+
1115
+ CUTLASS_PRAGMA_UNROLL
1116
+ for (int i = 0; i < N; ++i) {
1117
+ result[i] = mx(scalar_op(a[i], scalar, c[i]), T(0));
1118
+ }
1119
+
1120
+ return result;
1121
+ }
1122
+
1123
+ CUTLASS_HOST_DEVICE
1124
+ Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
1125
+
1126
+ Array<T, N> result;
1127
+ multiply_add<T> scalar_op;
1128
+ maximum<T> mx;
1129
+
1130
+ CUTLASS_PRAGMA_UNROLL
1131
+ for (int i = 0; i < N; ++i) {
1132
+ result[i] = mx(scalar_op(scalar, b[i], c[i]), T(0));
1133
+ }
1134
+
1135
+ return result;
1136
+ }
1137
+ };
1138
+
1139
+
1140
+ template <typename T, int N>
1141
+ struct conjugate<Array<T, N> > {
1142
+ CUTLASS_HOST_DEVICE
1143
+ Array<T, N> operator()(Array<T, N> const &a) const {
1144
+
1145
+ conjugate<T> conj_op;
1146
+
1147
+ Array<T, N> ca;
1148
+ CUTLASS_PRAGMA_UNROLL
1149
+ for (int i = 0; i < N; ++i) {
1150
+ ca[i] = conj_op(a[i]);
1151
+ }
1152
+ return ca;
1153
+ }
1154
+ };
1155
+
1156
+
1157
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1158
+ // functional.h numeric specializations targeting SIMD instructions in device code.
1159
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1160
+
1161
/// Elementwise addition specialized for half_t: on SM53+ device code the
/// arrays are processed two elements at a time with __hadd2; otherwise a
/// plain scalar loop is used. Overloads broadcast a scalar on either side.
template <int N>
struct plus<Array<half_t, N>> {
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    // View operands as packed __half2 pairs.
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hadd2(lhs_ptr[i], rhs_ptr[i]);
    }

    // Odd N: last element handled with the scalar intrinsic.
    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
      __half d_residual = __hadd(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    // Host / pre-SM53 fallback.
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] + rhs[i];
    }
#endif

    return result;
  }

  /// Scalar lhs broadcast: result[i] = lhs + rhs[i]
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    // Replicate the scalar into both lanes of a __half2.
    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hadd2(lhs_pair, rhs_ptr[i]);
    }

    if constexpr (N % 2) {
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
      __half d_residual = __hadd(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs + rhs[i];
    }
#endif

    return result;
  }

  /// Scalar rhs broadcast: result[i] = lhs[i] + rhs
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hadd2(lhs_ptr[i], rhs_pair);
    }

    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
      __half d_residual = __hadd(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] + rhs;
    }
#endif

    return result;
  }
};
1260
+
1261
/// Elementwise subtraction specialized for half_t: packed __hsub2 pairs on
/// SM53+ device code, scalar loop elsewhere. Overloads broadcast a scalar
/// on either side.
template <int N>
struct minus<Array<half_t, N>> {
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    // View operands as packed __half2 pairs.
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hsub2(lhs_ptr[i], rhs_ptr[i]);
    }

    // Odd N: last element handled with the scalar intrinsic.
    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
      __half d_residual = __hsub(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] - rhs[i];
    }
#endif

    return result;
  }

  /// Scalar lhs broadcast: result[i] = lhs - rhs[i]
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hsub2(lhs_pair, rhs_ptr[i]);
    }

    if constexpr (N % 2) {
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
      __half d_residual = __hsub(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs - rhs[i];
    }
#endif

    return result;
  }

  /// Scalar rhs broadcast: result[i] = lhs[i] - rhs
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hsub2(lhs_ptr[i], rhs_pair);
    }

    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
      __half d_residual = __hsub(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] - rhs;
    }
#endif

    return result;
  }
};
1360
+
1361
/// Elementwise multiplication specialized for half_t: packed __hmul2 pairs
/// on SM53+ device code, scalar loop elsewhere. Overloads broadcast a scalar
/// on either side.
template <int N>
struct multiplies<Array<half_t, N>> {
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    // View operands as packed __half2 pairs.
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hmul2(lhs_ptr[i], rhs_ptr[i]);
    }

    // Odd N: last element handled with the scalar intrinsic.
    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
      __half d_residual = __hmul(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] * rhs[i];
    }
#endif

    return result;
  }

  /// Scalar lhs broadcast: result[i] = lhs * rhs[i]
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hmul2(lhs_pair, rhs_ptr[i]);
    }

    if constexpr (N % 2) {
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);

      __half d_residual = __hmul(
        reinterpret_cast<__half const &>(lhs),
        b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs * rhs[i];
    }
#endif

    return result;
  }

  /// Scalar rhs broadcast: result[i] = lhs[i] * rhs
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hmul2(lhs_ptr[i], rhs_pair);
    }

    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);

      __half d_residual = __hmul(
        a_residual_ptr[N - 1],
        reinterpret_cast<__half const &>(rhs));

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] * rhs;
    }
#endif

    return result;
  }
};
1466
+
1467
/// Elementwise division specialized for half_t: packed __h2div pairs on
/// SM53+ device code, scalar loop elsewhere. Overloads broadcast a scalar
/// on either side.
template <int N>
struct divides<Array<half_t, N>> {
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    // View operands as packed __half2 pairs.
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __h2div(lhs_ptr[i], rhs_ptr[i]);
    }

    // Odd N: last element handled with the scalar intrinsic.
    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);

      __half d_residual = __hdiv(
        a_residual_ptr[N - 1],
        b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] / rhs[i];
    }
#endif

    return result;
  }

  /// Scalar lhs broadcast: result[i] = lhs / rhs[i]
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
    __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __h2div(lhs_pair, rhs_ptr[i]);
    }

    if constexpr (N % 2) {
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);

      __half d_residual = __hdiv(
        reinterpret_cast<__half const &>(lhs),
        b_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs / rhs[i];
    }
#endif

    return result;
  }

  /// Scalar rhs broadcast: result[i] = lhs[i] / rhs
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
    __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __h2div(lhs_ptr[i], rhs_pair);
    }

    if constexpr (N % 2) {
      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);

      __half d_residual = __hdiv(
        a_residual_ptr[N - 1],
        reinterpret_cast<__half const &>(rhs));

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = lhs[i] / rhs;
    }
#endif

    return result;
  }
};
1575
+
1576
/// Elementwise negation specialized for half_t: packed __hneg2 pairs on
/// SM53+ device code, scalar loop elsewhere.
template <int N>
struct negate<Array<half_t, N>> {
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(Array<half_t, N> const & lhs) const {
    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    // View input and output as packed __half2 pairs.
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *source_ptr = reinterpret_cast<__half2 const *>(&lhs);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hneg2(source_ptr[i]);
    }

    // Odd N: negate the last element with half_t's own operator.
    if constexpr (N % 2) {
      half_t x = -lhs[N - 1];
      __half lhs_val = reinterpret_cast<__half const &>(x);
      result[N - 1] = reinterpret_cast<half_t const &>(lhs_val);
    }

#else

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = -lhs[i];
    }
#endif

    return result;
  }
};
1608
+
1609
/// Fused multiply-add specialized for half_t: d = a * b + c using packed
/// __hfma2 pairs on SM53+ device code, with the generic scalar functor as
/// fallback. Overloads broadcast a scalar into any one operand position,
/// plus one with scalars in both trailing positions.
template <int N>
struct multiply_add<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {

  /// result[i] = a[i] * b[i] + c[i]
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(
    Array<half_t, N> const &a,
    Array<half_t, N> const &b,
    Array<half_t, N> const &c) const {

    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    // View all operands as packed __half2 pairs.
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_ptr[i]);
    }

    // Odd N: last element handled with the scalar intrinsic.
    if constexpr (N % 2) {

      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);

      __half d_residual = __hfma(
        a_residual_ptr[N - 1],
        b_residual_ptr[N - 1],
        c_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    multiply_add<half_t> op;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = op(a[i], b[i], c[i]);
    }
#endif

    return result;
  }

  /// Scalar a broadcast: result[i] = a * b[i] + c[i]
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(
    half_t const &a,
    Array<half_t, N> const &b,
    Array<half_t, N> const &c) const {

    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    // Replicate the scalar into both lanes.
    __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a));
    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hfma2(a_pair, b_ptr[i], c_ptr[i]);
    }

    if constexpr (N % 2) {

      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
      __half d_residual = __hfma(
        reinterpret_cast<__half const &>(a),
        b_residual_ptr[N - 1],
        c_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    multiply_add<half_t> op;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = op(a, b[i], c[i]);
    }
#endif

    return result;
  }

  /// Scalar b broadcast: result[i] = a[i] * b + c[i]
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(
    Array<half_t, N> const &a,
    half_t const &b,
    Array<half_t, N> const &c) const {

    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
    __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
    __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_ptr[i]);
    }

    if constexpr (N % 2) {

      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
      __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);

      __half d_residual = __hfma(
        a_residual_ptr[N - 1],
        reinterpret_cast<__half const &>(b),
        c_residual_ptr[N - 1]);

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    multiply_add<half_t> op;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = op(a[i], b, c[i]);
    }
#endif

    return result;
  }

  /// Scalar c broadcast: result[i] = a[i] * b[i] + c
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(
    Array<half_t, N> const &a,
    Array<half_t, N> const &b,
    half_t const &c) const {

    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
    __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
    __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_pair);
    }

    if constexpr (N % 2) {

      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
      __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);

      __half d_residual = __hfma(
        a_residual_ptr[N - 1],
        b_residual_ptr[N - 1],
        reinterpret_cast<__half const &>(c));

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    multiply_add<half_t> op;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = op(a[i], b[i], c);
    }
#endif

    return result;
  }

  /// Scalars b and c broadcast: result[i] = a[i] * b + c
  CUTLASS_HOST_DEVICE
  Array<half_t, N> operator()(
    Array<half_t, N> const &a,
    half_t const &b,
    half_t const &c) const {

    Array<half_t, N> result;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
    __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
    __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N / 2; ++i) {
      result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_pair);
    }

    if constexpr (N % 2) {

      __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);

      __half d_residual = __hfma(
        a_residual_ptr[N - 1],
        reinterpret_cast<__half const &>(b),
        reinterpret_cast<__half const &>(c));

      result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
    }

#else

    multiply_add<half_t> op;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      result[i] = op(a[i], b, c);
    }
#endif

    return result;
  }
};
1837
+
1838
+ /// Fused multiply-add-relu0
1839
+ template <int N>
1840
+ struct multiply_add_relu0<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {
1841
+
1842
+ CUTLASS_HOST_DEVICE
1843
+ Array<half_t, N> operator()(
1844
+ Array<half_t, N> const &a,
1845
+ Array<half_t, N> const &b,
1846
+ Array<half_t, N> const &c) const {
1847
+
1848
+ Array<half_t, N> result;
1849
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
1850
+
1851
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
1852
+ __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
1853
+ __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
1854
+ __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
1855
+
1856
+ CUTLASS_PRAGMA_UNROLL
1857
+ for (int i = 0; i < N / 2; ++i) {
1858
+ result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_ptr[i]);
1859
+ }
1860
+
1861
+ if constexpr (N % 2) {
1862
+
1863
+ __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
1864
+ __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
1865
+ __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
1866
+
1867
+ __half d_residual = __hfma_relu(
1868
+ a_residual_ptr[N - 1],
1869
+ b_residual_ptr[N - 1],
1870
+ c_residual_ptr[N - 1]);
1871
+
1872
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
1873
+ }
1874
+
1875
+ #else
1876
+
1877
+ multiply_add<half_t> op;
1878
+ maximum<half_t> mx;
1879
+
1880
+ CUTLASS_PRAGMA_UNROLL
1881
+ for (int i = 0; i < N; ++i) {
1882
+ result[i] = mx(op(a[i], b[i], c[i]), (half_t)0);
1883
+ }
1884
+ #endif
1885
+
1886
+ return result;
1887
+ }
1888
+
1889
+ CUTLASS_HOST_DEVICE
1890
+ Array<half_t, N> operator()(
1891
+ half_t const &a,
1892
+ Array<half_t, N> const &b,
1893
+ Array<half_t, N> const &c) const {
1894
+
1895
+ Array<half_t, N> result;
1896
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
1897
+
1898
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
1899
+ __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a));
1900
+ __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
1901
+ __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
1902
+
1903
+ CUTLASS_PRAGMA_UNROLL
1904
+ for (int i = 0; i < N / 2; ++i) {
1905
+ result_ptr[i] = __hfma2_relu(a_pair, b_ptr[i], c_ptr[i]);
1906
+ }
1907
+
1908
+ if constexpr (N % 2) {
1909
+
1910
+ __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
1911
+ __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
1912
+ __half d_residual = __hfma_relu(
1913
+ reinterpret_cast<__half const &>(a),
1914
+ b_residual_ptr[N - 1],
1915
+ c_residual_ptr[N - 1]);
1916
+
1917
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
1918
+ }
1919
+
1920
+ #else
1921
+
1922
+ multiply_add<half_t> op;
1923
+ maximum<half_t> mx;
1924
+
1925
+ CUTLASS_PRAGMA_UNROLL
1926
+ for (int i = 0; i < N; ++i) {
1927
+ result[i] = mx(op(a, b[i], c[i]), half_t(0));
1928
+ }
1929
+ #endif
1930
+
1931
+ return result;
1932
+ }
1933
+
1934
+ CUTLASS_HOST_DEVICE
1935
+ Array<half_t, N> operator()(
1936
+ Array<half_t, N> const &a,
1937
+ half_t const &b,
1938
+ Array<half_t, N> const &c) const {
1939
+
1940
+ Array<half_t, N> result;
1941
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
1942
+
1943
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
1944
+ __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
1945
+ __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b));
1946
+ __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c);
1947
+
1948
+ CUTLASS_PRAGMA_UNROLL
1949
+ for (int i = 0; i < N / 2; ++i) {
1950
+ result_ptr[i] = __hfma2_relu(a_ptr[i], b_pair, c_ptr[i]);
1951
+ }
1952
+
1953
+ if constexpr (N % 2) {
1954
+
1955
+ __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
1956
+ __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c);
1957
+
1958
+ __half d_residual = __hfma_relu(
1959
+ a_residual_ptr[N - 1],
1960
+ reinterpret_cast<__half const &>(b),
1961
+ c_residual_ptr[N - 1]);
1962
+
1963
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
1964
+ }
1965
+
1966
+ #else
1967
+
1968
+ multiply_add<half_t> op;
1969
+ maximum<half_t> mx;
1970
+
1971
+ CUTLASS_PRAGMA_UNROLL
1972
+ for (int i = 0; i < N; ++i) {
1973
+ result[i] = mx(op(a[i], b, c[i]), half_t(0));
1974
+ }
1975
+ #endif
1976
+
1977
+ return result;
1978
+ }
1979
+
1980
+ CUTLASS_HOST_DEVICE
1981
+ Array<half_t, N> operator()(
1982
+ Array<half_t, N> const &a,
1983
+ Array<half_t, N> const &b,
1984
+ half_t const &c) const {
1985
+
1986
+ Array<half_t, N> result;
1987
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
1988
+
1989
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
1990
+ __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a);
1991
+ __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b);
1992
+ __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c));
1993
+
1994
+ CUTLASS_PRAGMA_UNROLL
1995
+ for (int i = 0; i < N / 2; ++i) {
1996
+ result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_pair);
1997
+ }
1998
+
1999
+ if constexpr (N % 2) {
2000
+
2001
+ __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a);
2002
+ __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b);
2003
+
2004
+ __half d_residual = __hfma_relu(
2005
+ a_residual_ptr[N - 1],
2006
+ b_residual_ptr[N - 1],
2007
+ reinterpret_cast<__half const &>(c));
2008
+
2009
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
2010
+ }
2011
+
2012
+ #else
2013
+
2014
+ multiply_add<half_t> op;
2015
+ maximum<half_t> mx;
2016
+
2017
+ CUTLASS_PRAGMA_UNROLL
2018
+ for (int i = 0; i < N; ++i) {
2019
+ result[i] = mx(op(a[i], b[i], c), half_t(0));
2020
+ }
2021
+ #endif
2022
+
2023
+ return result;
2024
+ }
2025
+ };
2026
+
2027
+ template <int N, bool PropagateNaN>
2028
+ struct minimum<Array<half_t, N>, PropagateNaN> {
2029
+ CUTLASS_HOST_DEVICE
2030
+ Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
2031
+ Array<half_t, N> result;
2032
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2033
+
2034
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
2035
+ __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
2036
+ __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
2037
+
2038
+ CUTLASS_PRAGMA_UNROLL
2039
+ for (int i = 0; i < N / 2; ++i) {
2040
+ result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_ptr[i])
2041
+ : __hmin2(lhs_ptr[i], rhs_ptr[i]);
2042
+ }
2043
+
2044
+ if constexpr (N % 2) {
2045
+ __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
2046
+ __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
2047
+
2048
+ __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1])
2049
+ : __hmin(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
2050
+
2051
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
2052
+ }
2053
+
2054
+ #else
2055
+
2056
+ minimum<half_t,PropagateNaN> mn;
2057
+
2058
+ CUTLASS_PRAGMA_UNROLL
2059
+ for (int i = 0; i < N; ++i) {
2060
+ result[i] = mn(lhs[i],rhs[i]);
2061
+ }
2062
+ #endif
2063
+
2064
+ return result;
2065
+ }
2066
+
2067
+ CUTLASS_HOST_DEVICE
2068
+ Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
2069
+ Array<half_t, N> result;
2070
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2071
+
2072
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
2073
+ __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
2074
+ __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
2075
+
2076
+ CUTLASS_PRAGMA_UNROLL
2077
+ for (int i = 0; i < N / 2; ++i) {
2078
+ result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_pair, rhs_ptr[i])
2079
+ : __hmin2(lhs_pair, rhs_ptr[i]);
2080
+ }
2081
+
2082
+ if constexpr (N % 2) {
2083
+ __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
2084
+
2085
+ __half d_residual = PropagateNaN ? __hmin_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1])
2086
+ : __hmin(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
2087
+
2088
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
2089
+ }
2090
+
2091
+ #else
2092
+
2093
+ minimum<half_t,PropagateNaN> mn;
2094
+
2095
+ CUTLASS_PRAGMA_UNROLL
2096
+ for (int i = 0; i < N; ++i) {
2097
+ result[i] = mn(lhs, rhs[i]);
2098
+ }
2099
+ #endif
2100
+
2101
+ return result;
2102
+ }
2103
+
2104
+ CUTLASS_HOST_DEVICE
2105
+ Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
2106
+ Array<half_t, N> result;
2107
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2108
+
2109
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
2110
+ __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
2111
+ __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
2112
+
2113
+ CUTLASS_PRAGMA_UNROLL
2114
+ for (int i = 0; i < N / 2; ++i) {
2115
+ result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_pair)
2116
+ : __hmin2(lhs_ptr[i], rhs_pair);
2117
+ }
2118
+
2119
+ if constexpr (N % 2) {
2120
+ __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
2121
+
2122
+ __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs))
2123
+ : __hmin(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
2124
+
2125
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
2126
+ }
2127
+
2128
+ #else
2129
+
2130
+ minimum<half_t, PropagateNaN> mn;
2131
+
2132
+ CUTLASS_PRAGMA_UNROLL
2133
+ for (int i = 0; i < N; ++i) {
2134
+ result[i] = mn(lhs[i], rhs);
2135
+ }
2136
+ #endif
2137
+
2138
+ return result;
2139
+ }
2140
+ };
2141
+
2142
+ template <int N, bool PropagateNaN>
2143
+ struct maximum<Array<half_t, N>, PropagateNaN> {
2144
+ CUTLASS_HOST_DEVICE
2145
+ Array<half_t, N> operator()(Array<half_t, N> const & lhs, Array<half_t, N> const &rhs) const {
2146
+ Array<half_t, N> result;
2147
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2148
+
2149
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
2150
+ __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
2151
+ __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
2152
+
2153
+ CUTLASS_PRAGMA_UNROLL
2154
+ for (int i = 0; i < N / 2; ++i) {
2155
+ result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_ptr[i])
2156
+ : __hmax2(lhs_ptr[i], rhs_ptr[i]);
2157
+ }
2158
+
2159
+ if constexpr (N % 2) {
2160
+ __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
2161
+ __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
2162
+
2163
+ __half d_residual = PropagateNaN ? __hmax(a_residual_ptr[N - 1], b_residual_ptr[N - 1])
2164
+ : __hmax_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1]);
2165
+
2166
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
2167
+ }
2168
+
2169
+ #else
2170
+
2171
+ maximum<half_t,PropagateNaN> mx;
2172
+
2173
+ CUTLASS_PRAGMA_UNROLL
2174
+ for (int i = 0; i < N; ++i) {
2175
+ result[i] = mx(lhs[i], rhs[i]);
2176
+ }
2177
+ #endif
2178
+
2179
+ return result;
2180
+ }
2181
+
2182
+ CUTLASS_HOST_DEVICE
2183
+ Array<half_t, N> operator()(half_t const & lhs, Array<half_t, N> const &rhs) const {
2184
+ Array<half_t, N> result;
2185
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2186
+
2187
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
2188
+ __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs));
2189
+ __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs);
2190
+
2191
+ CUTLASS_PRAGMA_UNROLL
2192
+ for (int i = 0; i < N / 2; ++i) {
2193
+ result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_pair, rhs_ptr[i])
2194
+ : __hmax2(lhs_pair, rhs_ptr[i]);
2195
+ }
2196
+
2197
+ if constexpr (N % 2) {
2198
+ __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs);
2199
+
2200
+ __half d_residual = PropagateNaN ? __hmax_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1])
2201
+ : __hmax(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]);
2202
+
2203
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
2204
+ }
2205
+
2206
+ #else
2207
+
2208
+ maximum<half_t,PropagateNaN> mx;
2209
+
2210
+ CUTLASS_PRAGMA_UNROLL
2211
+ for (int i = 0; i < N; ++i) {
2212
+ result[i] = mx(lhs, rhs[i]);
2213
+ }
2214
+ #endif
2215
+
2216
+ return result;
2217
+ }
2218
+
2219
+ CUTLASS_HOST_DEVICE
2220
+ Array<half_t, N> operator()(Array<half_t, N> const & lhs, half_t const &rhs) const {
2221
+ Array<half_t, N> result;
2222
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2223
+
2224
+ __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
2225
+ __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs);
2226
+ __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs));
2227
+
2228
+ CUTLASS_PRAGMA_UNROLL
2229
+ for (int i = 0; i < N / 2; ++i) {
2230
+ result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_pair)
2231
+ : __hmax2(lhs_ptr[i], rhs_pair);
2232
+ }
2233
+
2234
+ if constexpr (N % 2) {
2235
+ __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs);
2236
+
2237
+ __half d_residual = PropagateNaN ? __hmax_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs))
2238
+ : __hmax(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs));
2239
+
2240
+ result[N - 1] = reinterpret_cast<half_t const &>(d_residual);
2241
+ }
2242
+
2243
+ #else
2244
+
2245
+ maximum<half_t,PropagateNaN> mx;
2246
+
2247
+ CUTLASS_PRAGMA_UNROLL
2248
+ for (int i = 0; i < N; ++i) {
2249
+ result[i] = mx(lhs[i], rhs);
2250
+ }
2251
+ #endif
2252
+
2253
+ return result;
2254
+ }
2255
+ };
2256
+
2257
+ /// Fused multiply-add
2258
+ template <int N>
2259
+ struct multiply_add<Array<bfloat16_t, N>, Array<bfloat16_t, N>, Array<bfloat16_t, N>> {
2260
+
2261
+ CUTLASS_HOST_DEVICE
2262
+ Array<bfloat16_t, N> operator()(
2263
+ Array<bfloat16_t, N> const &a,
2264
+ Array<bfloat16_t, N> const &b,
2265
+ Array<bfloat16_t, N> const &c) const {
2266
+
2267
+ Array<bfloat16_t, N> result;
2268
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2269
+
2270
+ unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
2271
+ unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
2272
+ unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
2273
+ unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
2274
+
2275
+ CUTLASS_PRAGMA_UNROLL
2276
+ for (int i = 0; i < N / 2; ++i) {
2277
+ asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
2278
+ : "=r"(result_ptr[i])
2279
+ : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_ptr[i])
2280
+ );
2281
+ }
2282
+
2283
+ if constexpr (N % 2) {
2284
+
2285
+ uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
2286
+ uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
2287
+ uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
2288
+ uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
2289
+
2290
+ asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
2291
+ : "=h"(result_ptr[N - 1])
2292
+ : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1])
2293
+ );
2294
+ }
2295
+
2296
+ #else
2297
+
2298
+ multiply_add<bfloat16_t> op;
2299
+
2300
+ CUTLASS_PRAGMA_UNROLL
2301
+ for (int i = 0; i < N; ++i) {
2302
+ result[i] = op(a[i], b[i], c[i]);
2303
+ }
2304
+ #endif
2305
+
2306
+ return result;
2307
+ }
2308
+
2309
+ CUTLASS_HOST_DEVICE
2310
+ Array<bfloat16_t, N> operator()(
2311
+ bfloat16_t const &a,
2312
+ Array<bfloat16_t, N> const &b,
2313
+ Array<bfloat16_t, N> const &c) const {
2314
+
2315
+ Array<bfloat16_t, N> result;
2316
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2317
+
2318
+ unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
2319
+
2320
+ unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
2321
+ unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
2322
+
2323
+ unsigned a_packed = static_cast<unsigned>(a.raw());
2324
+ a_packed = (a_packed | (a_packed << 16));
2325
+
2326
+ CUTLASS_PRAGMA_UNROLL
2327
+ for (int i = 0; i < N / 2; ++i) {
2328
+ asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
2329
+ : "=r"(result_ptr[i])
2330
+ : "r"(a_packed), "r"(b_ptr[i]), "r"(c_ptr[i])
2331
+ );
2332
+ }
2333
+
2334
+ if constexpr (N % 2) {
2335
+
2336
+ uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
2337
+ uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
2338
+ uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
2339
+ uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
2340
+
2341
+ asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
2342
+ : "=h"(result_ptr[N - 1])
2343
+ : "h"(a_residual_ptr[0]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1])
2344
+ );
2345
+ }
2346
+
2347
+ #else
2348
+
2349
+ multiply_add<bfloat16_t> op;
2350
+
2351
+ CUTLASS_PRAGMA_UNROLL
2352
+ for (int i = 0; i < N; ++i) {
2353
+ result[i] = op(a, b[i], c[i]);
2354
+ }
2355
+ #endif
2356
+
2357
+ return result;
2358
+ }
2359
+
2360
+ CUTLASS_HOST_DEVICE
2361
+ Array<bfloat16_t, N> operator()(
2362
+ Array<bfloat16_t, N> const &a,
2363
+ bfloat16_t const &b,
2364
+ Array<bfloat16_t, N> const &c) const {
2365
+
2366
+ Array<bfloat16_t, N> result;
2367
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2368
+
2369
+ unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
2370
+
2371
+ unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
2372
+ unsigned const *c_ptr = reinterpret_cast<unsigned const *>(&c);
2373
+
2374
+ unsigned b_packed = static_cast<unsigned>(b.raw());
2375
+ b_packed = (b_packed | (b_packed << 16));
2376
+
2377
+ CUTLASS_PRAGMA_UNROLL
2378
+ for (int i = 0; i < N / 2; ++i) {
2379
+ asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
2380
+ : "=r"(result_ptr[i])
2381
+ : "r"(a_ptr[i]), "r"(b_packed), "r"(c_ptr[i])
2382
+ );
2383
+ }
2384
+
2385
+ if constexpr (N % 2) {
2386
+
2387
+ uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
2388
+ uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
2389
+ uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
2390
+ uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
2391
+
2392
+ asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
2393
+ : "=h"(result_ptr[N - 1])
2394
+ : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[N - 1])
2395
+ );
2396
+ }
2397
+
2398
+ #else
2399
+
2400
+ multiply_add<bfloat16_t> op;
2401
+
2402
+ CUTLASS_PRAGMA_UNROLL
2403
+ for (int i = 0; i < N; ++i) {
2404
+ result[i] = op(a[i], b, c[i]);
2405
+ }
2406
+ #endif
2407
+
2408
+ return result;
2409
+ }
2410
+
2411
+ CUTLASS_HOST_DEVICE
2412
+ Array<bfloat16_t, N> operator()(
2413
+ Array<bfloat16_t, N> const &a,
2414
+ Array<bfloat16_t, N> const &b,
2415
+ bfloat16_t const &c) const {
2416
+
2417
+ Array<bfloat16_t, N> result;
2418
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2419
+
2420
+ unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
2421
+
2422
+ unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
2423
+ unsigned const *b_ptr = reinterpret_cast<unsigned const *>(&b);
2424
+
2425
+ unsigned c_packed = static_cast<unsigned>(c.raw());
2426
+ c_packed = (c_packed | (c_packed << 16));
2427
+
2428
+ CUTLASS_PRAGMA_UNROLL
2429
+ for (int i = 0; i < N / 2; ++i) {
2430
+ asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
2431
+ : "=r"(result_ptr[i])
2432
+ : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_packed)
2433
+ );
2434
+ }
2435
+
2436
+ if constexpr (N % 2) {
2437
+
2438
+ uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
2439
+ uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
2440
+ uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
2441
+ uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
2442
+
2443
+ asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
2444
+ : "=h"(result_ptr[N - 1])
2445
+ : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[0])
2446
+ );
2447
+ }
2448
+
2449
+ #else
2450
+
2451
+ multiply_add<bfloat16_t> op;
2452
+
2453
+ CUTLASS_PRAGMA_UNROLL
2454
+ for (int i = 0; i < N; ++i) {
2455
+ result[i] = op(a[i], b[i], c);
2456
+ }
2457
+ #endif
2458
+
2459
+ return result;
2460
+ }
2461
+
2462
+ CUTLASS_HOST_DEVICE
2463
+ Array<bfloat16_t, N> operator()(
2464
+ Array<bfloat16_t, N> const &a,
2465
+ bfloat16_t const &b,
2466
+ bfloat16_t const &c) const {
2467
+
2468
+ Array<bfloat16_t, N> result;
2469
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
2470
+
2471
+ unsigned *result_ptr = reinterpret_cast<unsigned *>(&result);
2472
+
2473
+ unsigned const *a_ptr = reinterpret_cast<unsigned const *>(&a);
2474
+
2475
+ unsigned b_packed = static_cast<unsigned>(b.raw());
2476
+ b_packed = (b_packed | (b_packed << 16));
2477
+
2478
+ unsigned c_packed = static_cast<unsigned>(c.raw());
2479
+ c_packed = (c_packed | (c_packed << 16));
2480
+
2481
+ CUTLASS_PRAGMA_UNROLL
2482
+ for (int i = 0; i < N / 2; ++i) {
2483
+ asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
2484
+ : "=r"(result_ptr[i])
2485
+ : "r"(a_ptr[i]), "r"(b_packed), "r"(c_packed)
2486
+ );
2487
+ }
2488
+
2489
+ if constexpr (N % 2) {
2490
+
2491
+ uint16_t *result_ptr = reinterpret_cast<uint16_t *>(&result);
2492
+ uint16_t const *a_residual_ptr = reinterpret_cast<uint16_t const *>(&a);
2493
+ uint16_t const *b_residual_ptr = reinterpret_cast<uint16_t const *>(&b);
2494
+ uint16_t const *c_residual_ptr = reinterpret_cast<uint16_t const *>(&c);
2495
+
2496
+ asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
2497
+ : "=h"(result_ptr[N - 1])
2498
+ : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[0])
2499
+ );
2500
+ }
2501
+
2502
+
2503
+ #else
2504
+
2505
+ multiply_add<bfloat16_t> op;
2506
+
2507
+ CUTLASS_PRAGMA_UNROLL
2508
+ for (int i = 0; i < N; ++i) {
2509
+ result[i] = op(a[i], b, c);
2510
+ }
2511
+ #endif
2512
+
2513
+ return result;
2514
+ }
2515
+ };
2516
+
2517
+
2518
+ /// bit_and
2519
+ template <int N>
2520
+ struct bit_and<Array<uint1b_t, N>> {
2521
+ CUTLASS_HOST_DEVICE
2522
+ Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
2523
+ using ArrayType = Array<uint1b_t, N>;
2524
+ using Storage = typename ArrayType::Storage;
2525
+ ArrayType result;
2526
+
2527
+ Storage *result_data = result.raw_data();
2528
+ Storage const *a_data = a.raw_data();
2529
+ Storage const *b_data = b.raw_data();
2530
+
2531
+ CUTLASS_PRAGMA_UNROLL
2532
+ for (int i = 0; i < ArrayType::kStorageElements; ++i) {
2533
+ result_data[i] = (a_data[i] & b_data[i]);
2534
+ }
2535
+
2536
+ return result;
2537
+ }
2538
+ };
2539
+
2540
+
2541
+ /// bit_or
2542
+ template <int N>
2543
+ struct bit_or<Array<uint1b_t, N>> {
2544
+ CUTLASS_HOST_DEVICE
2545
+ Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
2546
+ using ArrayType = Array<uint1b_t, N>;
2547
+ using Storage = typename ArrayType::Storage;
2548
+ ArrayType result;
2549
+
2550
+ Storage *result_data = result.raw_data();
2551
+ Storage const *a_data = a.raw_data();
2552
+ Storage const *b_data = b.raw_data();
2553
+
2554
+ CUTLASS_PRAGMA_UNROLL
2555
+ for (int i = 0; i < ArrayType::kStorageElements; ++i) {
2556
+ result_data[i] = (a_data[i] | b_data[i]);
2557
+ }
2558
+
2559
+ return result;
2560
+ }
2561
+ };
2562
+
2563
+
2564
+ /// bit_not
2565
+ template <int N>
2566
+ struct bit_not<Array<uint1b_t, N>> {
2567
+ CUTLASS_HOST_DEVICE
2568
+ Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a) const {
2569
+ using ArrayType = Array<uint1b_t, N>;
2570
+ using Storage = typename ArrayType::Storage;
2571
+ ArrayType result;
2572
+
2573
+ Storage *result_data = result.raw_data();
2574
+ Storage const *a_data = a.raw_data();
2575
+
2576
+ CUTLASS_PRAGMA_UNROLL
2577
+ for (int i = 0; i < ArrayType::kStorageElements; ++i) {
2578
+ result_data[i] = (~a_data[i]);
2579
+ }
2580
+
2581
+ return result;
2582
+ }
2583
+ };
2584
+
2585
+ /// bit_xor
2586
+ template <int N>
2587
+ struct bit_xor<Array<uint1b_t, N>> {
2588
+ CUTLASS_HOST_DEVICE
2589
+ Array<uint1b_t, N> operator()(Array<uint1b_t, N> const &a, Array<uint1b_t, N> const &b) const {
2590
+ using ArrayType = Array<uint1b_t, N>;
2591
+ using Storage = typename ArrayType::Storage;
2592
+ ArrayType result;
2593
+
2594
+ Storage *result_data = result.raw_data();
2595
+ Storage const *a_data = a.raw_data();
2596
+ Storage const *b_data = b.raw_data();
2597
+
2598
+ CUTLASS_PRAGMA_UNROLL
2599
+ for (int i = 0; i < ArrayType::kStorageElements; ++i) {
2600
+ result_data[i] = (a_data[i] ^ b_data[i]);
2601
+ }
2602
+
2603
+ return result;
2604
+ }
2605
+ };
2606
+
2607
+ /// Fused and-popc-add
2608
+ template <typename T, int N>
2609
+ struct and_popc_add<Array<T, N>, Array<T, N>, Array<T, N>> {
2610
+ CUTLASS_HOST_DEVICE
2611
+ Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
2612
+ Array<T, N> result;
2613
+ and_popc_add<T> scalar_op;
2614
+
2615
+ CUTLASS_PRAGMA_UNROLL
2616
+ for (int i = 0; i < N; ++i) {
2617
+ result[i] = scalar_op(a[i], b[i], c[i]);
2618
+ }
2619
+
2620
+ return result;
2621
+ }
2622
+
2623
+ CUTLASS_HOST_DEVICE
2624
+ Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
2625
+ Array<T, N> result;
2626
+ and_popc_add<T> scalar_op;
2627
+
2628
+ CUTLASS_PRAGMA_UNROLL
2629
+ for (int i = 0; i < N; ++i) {
2630
+ result[i] = scalar_op(a[i], scalar, c[i]);
2631
+ }
2632
+
2633
+ return result;
2634
+ }
2635
+
2636
+ CUTLASS_HOST_DEVICE
2637
+ Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
2638
+ Array<T, N> result;
2639
+ and_popc_add<T> scalar_op;
2640
+
2641
+ CUTLASS_PRAGMA_UNROLL
2642
+ for (int i = 0; i < N; ++i) {
2643
+ result[i] = scalar_op(scalar, b[i], c[i]);
2644
+ }
2645
+
2646
+ return result;
2647
+ }
2648
+ };
2649
+
2650
+
2651
+ /// Fused or-popc-add
2652
+ template <typename T, int N>
2653
+ struct or_popc_add<Array<T, N>, Array<T, N>, Array<T, N>> {
2654
+ CUTLASS_HOST_DEVICE
2655
+ Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
2656
+ Array<T, N> result;
2657
+ or_popc_add<T> scalar_op;
2658
+
2659
+ CUTLASS_PRAGMA_UNROLL
2660
+ for (int i = 0; i < N; ++i) {
2661
+ result[i] = scalar_op(a[i], b[i], c[i]);
2662
+ }
2663
+
2664
+ return result;
2665
+ }
2666
+
2667
+ CUTLASS_HOST_DEVICE
2668
+ Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
2669
+ Array<T, N> result;
2670
+ or_popc_add<T> scalar_op;
2671
+
2672
+ CUTLASS_PRAGMA_UNROLL
2673
+ for (int i = 0; i < N; ++i) {
2674
+ result[i] = scalar_op(a[i], scalar, c[i]);
2675
+ }
2676
+
2677
+ return result;
2678
+ }
2679
+
2680
+ CUTLASS_HOST_DEVICE
2681
+ Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
2682
+ Array<T, N> result;
2683
+ or_popc_add<T> scalar_op;
2684
+
2685
+ CUTLASS_PRAGMA_UNROLL
2686
+ for (int i = 0; i < N; ++i) {
2687
+ result[i] = scalar_op(scalar, b[i], c[i]);
2688
+ }
2689
+
2690
+ return result;
2691
+ }
2692
+ };
2693
+
2694
+ /// Fused xor-popc-add
2695
+ template <typename T, int N>
2696
+ struct xor_popc_add<Array<T, N>, Array<T, N>, Array<T, N>> {
2697
+ CUTLASS_HOST_DEVICE
2698
+ Array<T, N> operator()(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) const {
2699
+ Array<T, N> result;
2700
+ xor_popc_add<T> scalar_op;
2701
+
2702
+ CUTLASS_PRAGMA_UNROLL
2703
+ for (int i = 0; i < N; ++i) {
2704
+ result[i] = scalar_op(a[i], b[i], c[i]);
2705
+ }
2706
+
2707
+ return result;
2708
+ }
2709
+
2710
+ CUTLASS_HOST_DEVICE
2711
+ Array<T, N> operator()(Array<T, N> const &a, T const &scalar, Array<T, N> const &c) const {
2712
+ Array<T, N> result;
2713
+ xor_popc_add<T> scalar_op;
2714
+
2715
+ CUTLASS_PRAGMA_UNROLL
2716
+ for (int i = 0; i < N; ++i) {
2717
+ result[i] = scalar_op(a[i], scalar, c[i]);
2718
+ }
2719
+
2720
+ return result;
2721
+ }
2722
+
2723
+ CUTLASS_HOST_DEVICE
2724
+ Array<T, N> operator()(T const &scalar, Array<T, N> const &b, Array<T, N> const &c) const {
2725
+ Array<T, N> result;
2726
+ xor_popc_add<T> scalar_op;
2727
+
2728
+ CUTLASS_PRAGMA_UNROLL
2729
+ for (int i = 0; i < N; ++i) {
2730
+ result[i] = scalar_op(scalar, b[i], c[i]);
2731
+ }
2732
+
2733
+ return result;
2734
+ }
2735
+ };
2736
+
2737
+
2738
+ /////////////////////////////////////////////////////////////////////////////////////////////////
2739
+ // Operator overloads
2740
+ /////////////////////////////////////////////////////////////////////////////////////////////////
2741
+
2742
+ template <typename T, int N>
2743
+ CUTLASS_HOST_DEVICE
2744
+ Array<T, N> operator+(Array<T, N> const &lhs, Array<T, N> const &rhs) {
2745
+ plus<Array<T, N>> op;
2746
+ return op(lhs, rhs);
2747
+ }
2748
+
2749
+ template <typename T, int N>
2750
+ CUTLASS_HOST_DEVICE
2751
+ Array<T, N> operator+(T const &lhs, Array<T, N> const &rhs) {
2752
+ plus<Array<T, N>> op;
2753
+ return op(lhs, rhs);
2754
+ }
2755
+
2756
+ template <typename T, int N>
2757
+ CUTLASS_HOST_DEVICE
2758
+ Array<T, N> operator+(Array<T, N> const &lhs, T const &rhs) {
2759
+ plus<Array<T, N>> op;
2760
+ return op(lhs, rhs);
2761
+ }
2762
+
2763
+ template <typename T, int N>
2764
+ CUTLASS_HOST_DEVICE
2765
+ Array<T, N> operator-(Array<T, N> const &lhs, Array<T, N> const &rhs) {
2766
+ minus<Array<T, N>> op;
2767
+ return op(lhs, rhs);
2768
+ }
2769
+
2770
+ template <typename T, int N>
2771
+ CUTLASS_HOST_DEVICE
2772
+ Array<T, N> operator-(Array<T, N> const &lhs) {
2773
+ negate<Array<T, N>> op;
2774
+ return op(lhs);
2775
+ }
2776
+
2777
+ template <typename T, int N>
2778
+ CUTLASS_HOST_DEVICE
2779
+ Array<T, N> operator*(Array<T, N> const &lhs, Array<T, N> const &rhs) {
2780
+ multiplies<Array<T, N>> op;
2781
+ return op(lhs, rhs);
2782
+ }
2783
+
2784
+ template <typename T, int N>
2785
+ CUTLASS_HOST_DEVICE
2786
+ Array<T, N> operator*(T lhs, Array<T, N> const &rhs) {
2787
+ multiplies<Array<T, N>> op;
2788
+ return op(lhs, rhs);
2789
+ }
2790
+
2791
+ template <typename T, int N>
2792
+ CUTLASS_HOST_DEVICE
2793
+ Array<T, N> operator*(Array<T, N> const &lhs, T rhs) {
2794
+ multiplies<Array<T, N>> op;
2795
+ return op(lhs, rhs);
2796
+ }
2797
+
2798
+ template <typename T, int N>
2799
+ CUTLASS_HOST_DEVICE
2800
+ Array<T, N> operator/(Array<T, N> const &lhs, Array<T, N> const &rhs) {
2801
+ divides<Array<T, N>> op;
2802
+ return op(lhs, rhs);
2803
+ }
2804
+
2805
+ template <typename T, int N>
2806
+ CUTLASS_HOST_DEVICE
2807
+ Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, Array<T, N> const &c) {
2808
+ multiply_add<Array<T, N>> op;
2809
+ return op(a, b, c);
2810
+ }
2811
+
2812
+ template <typename T, int N>
2813
+ CUTLASS_HOST_DEVICE
2814
+ Array<T, N> fma(T a, Array<T, N> const &b, Array<T, N> const &c) {
2815
+ multiply_add<Array<T, N>> op;
2816
+ return op(a, b, c);
2817
+ }
2818
+
2819
+ template <typename T, int N>
2820
+ CUTLASS_HOST_DEVICE
2821
+ Array<T, N> fma(Array<T, N> const &a, T b, Array<T, N> const &c) {
2822
+ multiply_add<Array<T, N>> op;
2823
+ return op(a, b, c);
2824
+ }
2825
+
2826
+ template <typename T, int N>
2827
+ CUTLASS_HOST_DEVICE
2828
+ Array<T, N> fma(Array<T, N> const &a, Array<T, N> const &b, T c) {
2829
+ multiply_add<Array<T, N>> op;
2830
+ return op(a, b, c);
2831
+ }
2832
+
2833
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
2834
+
2835
+
2836
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
2837
+ // AlignedArray
2838
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
2839
+
2840
+ /// Aligned array type
2841
+ template <
2842
+ /// Element type
2843
+ typename T,
2844
+ /// Number of elements in the array
2845
+ int N,
2846
+ /// Alignment requirement in bytes
2847
+ int Alignment = ( sizeof_bits<T>::value * N + 7 ) / 8
2848
+ >
2849
+ class alignas(Alignment) AlignedArray: public Array<T, N> {
2850
+ public:
2851
+
2852
+ };
2853
+
2854
+ } // namespace cutlass
2855
+
2856
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
2857
+
2858
+ #include "cutlass/array_subbyte.h"
2859
+
2860
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_planar_complex.h ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Array type holding planar complex data (separate real and imaginary fragments).
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/array.h"
39
+
40
+ /////////////////////////////////////////////////////////////////////////////////////////////////
41
+
42
+ namespace cutlass {
43
+
44
+ /////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ /// Array holding planar complex elements
47
+ template <typename Element_, int N>
48
+ struct ArrayPlanarComplex {
49
+
50
+ /// Underlying real element
51
+ using Element = Element_;
52
+
53
+ /// Number of logical elements
54
+ static constexpr size_t kElements = N;
55
+
56
+ /// Underlying Fragment of real-valued elemenets
57
+ using ArrayReal = cutlass::Array<Element, N>;
58
+
59
+ public:
60
+ /// Fragment of real-valued elements representing the real part
61
+ ArrayReal real;
62
+
63
+ /// Fragment of real-valued elements representing the imaginary part
64
+ ArrayReal imag;
65
+
66
+ public:
67
+ /// Sets the array to zero efficiently
68
+ CUTLASS_HOST_DEVICE
69
+ void clear() {
70
+ real.clear();
71
+ imag.clear();
72
+ }
73
+ };
74
+
75
+ /////////////////////////////////////////////////////////////////////////////////////////////////
76
+
77
+ /// Helper to deduce template arguments
78
+ template <typename Element, int N>
79
+ CUTLASS_HOST_DEVICE
80
+ ArrayPlanarComplex<Element, N>
81
+ make_ArrayPlanarComplex(Array<Element, N> const &real, Array<Element, N> const &imag) {
82
+ return ArrayPlanarComplex<Element, N>{real, imag};
83
+ }
84
+
85
+ /////////////////////////////////////////////////////////////////////////////////////////////////
86
+
87
+ } // namespace cutlass
88
+
89
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_subbyte.h ADDED
@@ -0,0 +1,561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types
33
+ and is safe to use in a union.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/array.h"
40
+ #include "cutlass/platform/platform.h"
41
+
42
+ namespace cutlass {
43
+
44
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ /// Statically sized array for any data type
47
+ template <
48
+ typename T,
49
+ int N
50
+ >
51
+ struct Array<T, N, false> {
52
+ static constexpr int kSizeBits = sizeof_bits<T>::value * N;
53
+
54
+ /// Storage type
55
+ using Storage = typename platform::conditional<
56
+ ((kSizeBits % 32) != 0),
57
+ typename platform::conditional<
58
+ ((kSizeBits % 16) != 0),
59
+ uint8_t,
60
+ uint16_t
61
+ >::type,
62
+ uint32_t
63
+ >::type;
64
+
65
+ /// Element type
66
+ using Element = T;
67
+
68
+ /// Number of logical elements per stored object
69
+ static constexpr int kElementsPerStoredItem = int(sizeof(Storage) * 8) / sizeof_bits<T>::value;
70
+
71
+ /// Number of storage elements
72
+ static constexpr size_t kStorageElements = (N + kElementsPerStoredItem - 1) / kElementsPerStoredItem;
73
+
74
+ /// Number of logical elements
75
+ static constexpr size_t kElements = N;
76
+
77
+ /// Bitmask for covering one item
78
+ static constexpr Storage kMask = ((Storage(1) << sizeof_bits<T>::value) - 1);
79
+
80
+ //
81
+ // C++ standard members with pointer types removed
82
+ //
83
+
84
+ typedef T value_type;
85
+ typedef size_t size_type;
86
+ typedef ptrdiff_t difference_type;
87
+ typedef value_type *pointer;
88
+ typedef value_type const *const_pointer;
89
+
90
+ //
91
+ // References
92
+ //
93
+
94
+ /// Reference object inserts or extracts sub-byte items
95
+ class reference {
96
+ /// Pointer to storage element
97
+ Storage *ptr_{nullptr};
98
+
99
+ /// Index into elements packed into Storage object
100
+ int idx_{0};
101
+
102
+ public:
103
+
104
+ reference() = default;
105
+
106
+ /// Ctor
107
+ CUTLASS_HOST_DEVICE
108
+ reference(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
109
+
110
+ /// Assignment
111
+ CUTLASS_HOST_DEVICE
112
+ reference &operator=(T x) {
113
+ // `*ptr_ & kUpdateMask` will read ptr_ before write to it
114
+ // This means code pattern like
115
+ //
116
+ // ```cpp
117
+ // Array<half_t, N> result;
118
+ // result[0] = xxx;
119
+ // ```
120
+ //
121
+ // Will leads to compiler warning on use of uninitialized member variable. Although we know
122
+ // this read of uninitialized member variable is harmeless.
123
+
124
+ #if defined(__clang__)
125
+ # pragma clang diagnostic push
126
+ # pragma clang diagnostic ignored "-Wuninitialized"
127
+ #elif defined(__GNUC__)
128
+ # pragma GCC diagnostic push
129
+ # pragma GCC diagnostic ignored "-Wuninitialized"
130
+ # pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
131
+ #endif
132
+
133
+ Storage item = (reinterpret_cast<Storage const &>(x) & kMask);
134
+
135
+ Storage kUpdateMask = Storage(~(kMask << (idx_ * sizeof_bits<T>::value)));
136
+
137
+ *ptr_ = Storage(((*ptr_ & kUpdateMask) | (item << idx_ * sizeof_bits<T>::value)));
138
+
139
+ #if defined(__clang__)
140
+ # pragma clang diagnostic pop
141
+ #elif defined(__GNUC__)
142
+ # pragma GCC diagnostic pop
143
+ #endif
144
+
145
+ return *this;
146
+ }
147
+
148
+ CUTLASS_HOST_DEVICE
149
+ T get() const {
150
+ Storage item = Storage((*ptr_ >> (idx_ * sizeof_bits<T>::value)) & kMask);
151
+ return reinterpret_cast<T const &>(item);
152
+ }
153
+
154
+ /// Extract
155
+ CUTLASS_HOST_DEVICE
156
+ operator T() const {
157
+ return get();
158
+ }
159
+
160
+ /// Explicit cast to int
161
+ CUTLASS_HOST_DEVICE
162
+ explicit operator int() const {
163
+ return int(get());
164
+ }
165
+
166
+ /// Explicit cast to float
167
+ CUTLASS_HOST_DEVICE
168
+ explicit operator float() const {
169
+ return float(get());
170
+ }
171
+ };
172
+
173
+ /// Reference object extracts sub-byte items
174
+ class const_reference {
175
+
176
+ /// Pointer to storage element
177
+ Storage const *ptr_{nullptr};
178
+
179
+ /// Index into elements packed into Storage object
180
+ int idx_{0};
181
+
182
+ public:
183
+
184
+ const_reference() = default;
185
+
186
+ /// Ctor
187
+ CUTLASS_HOST_DEVICE
188
+ const_reference(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
189
+
190
+ CUTLASS_HOST_DEVICE
191
+ const T get() const {
192
+ Storage item = (*ptr_ >> (idx_ * sizeof_bits<T>::value)) & kMask;
193
+ return reinterpret_cast<T const &>(item);
194
+ }
195
+
196
+ /// Extract
197
+ CUTLASS_HOST_DEVICE
198
+ operator T() const {
199
+ Storage item = Storage(Storage(*ptr_ >> Storage(idx_ * sizeof_bits<T>::value)) & kMask);
200
+ return reinterpret_cast<T const &>(item);
201
+ }
202
+
203
+ /// Explicit cast to int
204
+ CUTLASS_HOST_DEVICE
205
+ explicit operator int() const {
206
+ return int(get());
207
+ }
208
+
209
+ /// Explicit cast to float
210
+ CUTLASS_HOST_DEVICE
211
+ explicit operator float() const {
212
+ return float(get());
213
+ }
214
+ };
215
+
216
+ //
217
+ // Iterators
218
+ //
219
+
220
+ /// Bidirectional iterator over elements
221
+ class iterator {
222
+
223
+ /// Pointer to storage element
224
+ Storage *ptr_{nullptr};
225
+
226
+ /// Index into elements packed into Storage object
227
+ int idx_{0};
228
+
229
+ public:
230
+
231
+ iterator() = default;
232
+
233
+ CUTLASS_HOST_DEVICE
234
+ iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
235
+
236
+ CUTLASS_HOST_DEVICE
237
+ iterator &operator++() {
238
+ ++idx_;
239
+ if (idx_ == kElementsPerStoredItem) {
240
+ ++ptr_;
241
+ idx_ = 0;
242
+ }
243
+ return *this;
244
+ }
245
+
246
+ CUTLASS_HOST_DEVICE
247
+ iterator &operator--() {
248
+ if (!idx_) {
249
+ --ptr_;
250
+ idx_ = kElementsPerStoredItem - 1;
251
+ }
252
+ else {
253
+ --idx_;
254
+ }
255
+ return *this;
256
+ }
257
+
258
+ CUTLASS_HOST_DEVICE
259
+ iterator operator++(int) {
260
+ iterator ret(*this);
261
+ ++idx_;
262
+ if (idx_ == kElementsPerStoredItem) {
263
+ ++ptr_;
264
+ idx_ = 0;
265
+ }
266
+ return ret;
267
+ }
268
+
269
+ CUTLASS_HOST_DEVICE
270
+ iterator operator--(int) {
271
+ iterator ret(*this);
272
+ if (!idx_) {
273
+ --ptr_;
274
+ idx_ = kElementsPerStoredItem - 1;
275
+ }
276
+ else {
277
+ --idx_;
278
+ }
279
+ return ret;
280
+ }
281
+
282
+ CUTLASS_HOST_DEVICE
283
+ reference operator*() const {
284
+ return reference(ptr_, idx_);
285
+ }
286
+
287
+ CUTLASS_HOST_DEVICE
288
+ bool operator==(iterator const &other) const {
289
+ return ptr_ == other.ptr_ && idx_ == other.idx_;
290
+ }
291
+
292
+ CUTLASS_HOST_DEVICE
293
+ bool operator!=(iterator const &other) const {
294
+ return !(*this == other);
295
+ }
296
+ };
297
+
298
+ /// Bidirectional constant iterator over elements
299
+ class const_iterator {
300
+
301
+ /// Pointer to storage element
302
+ Storage const *ptr_{nullptr};
303
+
304
+ /// Index into elements packed into Storage object
305
+ int idx_{0};
306
+
307
+ public:
308
+
309
+ const_iterator() = default;
310
+
311
+ CUTLASS_HOST_DEVICE
312
+ const_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
313
+
314
+ CUTLASS_HOST_DEVICE
315
+ iterator &operator++() {
316
+ ++idx_;
317
+ if (idx_ == kElementsPerStoredItem) {
318
+ ++ptr_;
319
+ idx_ = 0;
320
+ }
321
+ return *this;
322
+ }
323
+
324
+ CUTLASS_HOST_DEVICE
325
+ iterator &operator--() {
326
+ if (!idx_) {
327
+ --ptr_;
328
+ idx_ = kElementsPerStoredItem - 1;
329
+ }
330
+ else {
331
+ --idx_;
332
+ }
333
+ return *this;
334
+ }
335
+
336
+ CUTLASS_HOST_DEVICE
337
+ iterator operator++(int) {
338
+ iterator ret(*this);
339
+ ++idx_;
340
+ if (idx_ == kElementsPerStoredItem) {
341
+ ++ptr_;
342
+ idx_ = 0;
343
+ }
344
+ return ret;
345
+ }
346
+
347
+ CUTLASS_HOST_DEVICE
348
+ iterator operator--(int) {
349
+ iterator ret(*this);
350
+ if (!idx_) {
351
+ --ptr_;
352
+ idx_ = kElementsPerStoredItem - 1;
353
+ }
354
+ else {
355
+ --idx_;
356
+ }
357
+ return ret;
358
+ }
359
+
360
+ CUTLASS_HOST_DEVICE
361
+ const_reference operator*() const {
362
+ return const_reference(ptr_, idx_);
363
+ }
364
+
365
+ CUTLASS_HOST_DEVICE
366
+ bool operator==(iterator const &other) const {
367
+ return ptr_ == other.ptr_ && idx_ == other.idx_;
368
+ }
369
+
370
+ CUTLASS_HOST_DEVICE
371
+ bool operator!=(iterator const &other) const {
372
+ return !(*this == other);
373
+ }
374
+ };
375
+
376
+ /// Bidirectional iterator over elements
377
+ class reverse_iterator {
378
+
379
+ /// Pointer to storage element
380
+ Storage *ptr_{nullptr};
381
+
382
+ /// Index into elements packed into Storage object
383
+ int idx_{0};
384
+
385
+ public:
386
+
387
+ reverse_iterator() = default;
388
+
389
+ CUTLASS_HOST_DEVICE
390
+ reverse_iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
391
+ };
392
+
393
+ /// Bidirectional constant iterator over elements
394
+ class const_reverse_iterator {
395
+
396
+ /// Pointer to storage element
397
+ Storage const *ptr_{nullptr};
398
+
399
+ /// Index into elements packed into Storage object
400
+ int idx_{0};
401
+
402
+ public:
403
+
404
+ const_reverse_iterator() = default;
405
+
406
+ CUTLASS_HOST_DEVICE
407
+ const_reverse_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { }
408
+ };
409
+
410
+ /// Efficient clear method
411
+ CUTLASS_HOST_DEVICE
412
+ void clear() {
413
+
414
+ CUTLASS_PRAGMA_UNROLL
415
+ for (int i = 0; i < int(kStorageElements); ++i) {
416
+ storage[i] = Storage(0);
417
+ }
418
+ }
419
+
420
+ CUTLASS_HOST_DEVICE
421
+ reference at(size_type pos) {
422
+ return reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem);
423
+ }
424
+
425
+ CUTLASS_HOST_DEVICE
426
+ const_reference at(size_type pos) const {
427
+ return const_reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem);
428
+ }
429
+
430
+ CUTLASS_HOST_DEVICE
431
+ reference operator[](size_type pos) {
432
+ return at(pos);
433
+ }
434
+
435
+ CUTLASS_HOST_DEVICE
436
+ const_reference operator[](size_type pos) const {
437
+ return at(pos);
438
+ }
439
+
440
+ CUTLASS_HOST_DEVICE
441
+ reference front() {
442
+ return at(0);
443
+ }
444
+
445
+ CUTLASS_HOST_DEVICE
446
+ const_reference front() const {
447
+ return at(0);
448
+ }
449
+
450
+ CUTLASS_HOST_DEVICE
451
+ reference back() {
452
+ return reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1);
453
+ }
454
+
455
+ CUTLASS_HOST_DEVICE
456
+ const_reference back() const {
457
+ return const_reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1);
458
+ }
459
+
460
+ CUTLASS_HOST_DEVICE
461
+ pointer data() {
462
+ return reinterpret_cast<pointer>(storage);
463
+ }
464
+
465
+ CUTLASS_HOST_DEVICE
466
+ const_pointer data() const {
467
+ return reinterpret_cast<const_pointer>(storage);
468
+ }
469
+
470
+ CUTLASS_HOST_DEVICE
471
+ Storage * raw_data() {
472
+ return storage;
473
+ }
474
+
475
+ CUTLASS_HOST_DEVICE
476
+ Storage const * raw_data() const {
477
+ return storage;
478
+ }
479
+
480
+ CUTLASS_HOST_DEVICE
481
+ constexpr bool empty() const {
482
+ return !kElements;
483
+ }
484
+
485
+ CUTLASS_HOST_DEVICE
486
+ constexpr size_type size() const {
487
+ return kElements;
488
+ }
489
+
490
+ CUTLASS_HOST_DEVICE
491
+ constexpr size_type max_size() const {
492
+ return kElements;
493
+ }
494
+
495
+ CUTLASS_HOST_DEVICE
496
+ void fill(T const &value) {
497
+
498
+ CUTLASS_PRAGMA_UNROLL
499
+ for (int i = 0; i < kElementsPerStoredItem; ++i) {
500
+ reference ref(storage, i);
501
+ ref = value;
502
+ }
503
+
504
+ CUTLASS_PRAGMA_UNROLL
505
+ for (int i = 1; i < kStorageElements; ++i) {
506
+ storage[i] = storage[0];
507
+ }
508
+ }
509
+
510
+ CUTLASS_HOST_DEVICE
511
+ iterator begin() {
512
+ return iterator(storage);
513
+ }
514
+
515
+ CUTLASS_HOST_DEVICE
516
+ const_iterator cbegin() const {
517
+ return const_iterator(storage);
518
+ }
519
+
520
+ CUTLASS_HOST_DEVICE
521
+ iterator end() {
522
+ return iterator(storage + kStorageElements);
523
+ }
524
+
525
+ CUTLASS_HOST_DEVICE
526
+ const_iterator cend() const {
527
+ return const_iterator(storage + kStorageElements);
528
+ }
529
+
530
+ CUTLASS_HOST_DEVICE
531
+ reverse_iterator rbegin() {
532
+ return reverse_iterator(storage + kStorageElements);
533
+ }
534
+
535
+ CUTLASS_HOST_DEVICE
536
+ const_reverse_iterator crbegin() const {
537
+ return const_reverse_iterator(storage + kStorageElements);
538
+ }
539
+
540
+ CUTLASS_HOST_DEVICE
541
+ reverse_iterator rend() {
542
+ return reverse_iterator(storage);
543
+ }
544
+
545
+ CUTLASS_HOST_DEVICE
546
+ const_reverse_iterator crend() const {
547
+ return const_reverse_iterator(storage);
548
+ }
549
+
550
+ private:
551
+ /// Internal storage
552
+ Storage storage[kStorageElements];
553
+ };
554
+
555
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
556
+
557
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
558
+
559
+ } // namespace cutlass
560
+
561
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/barrier.h ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Implementation of a CTA-wide barrier for inter-CTA synchronization.
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include "cutlass/cutlass.h"
38
+ #include "cutlass/arch/barrier.h"
39
+
40
+ /////////////////////////////////////////////////////////////////////////////////////////////////
41
+
42
+ namespace cutlass {
43
+
44
+ namespace detail {
45
+
46
+ //
47
+ // Utilities for abstracting synchronization methods for barriers
48
+ //
49
+
50
/// Synchronization policy that barriers across all threads of the CTA
/// via __syncthreads(). Used as the Sync parameter of GenericBarrier.
struct SyncthreadsSync {
  CUTLASS_DEVICE
  static void sync() {
    __syncthreads();
  }
};
56
+
57
/// Synchronization policy that barriers across the threads of a single warp
/// via __syncwarp(). Used as the Sync parameter of GenericBarrier.
struct SyncwarpSync {
  CUTLASS_DEVICE
  static void sync() {
    __syncwarp();
  }
};
63
+
64
/// Synchronization policy that barriers `ThreadCount` threads on the named
/// barrier identified by the compile-time constant `BarrierId`. Used as the
/// Sync parameter of GenericBarrier.
///
/// BarrierId is cast to arch::ReservedNamedBarriers, so it must be a valid
/// reserved-barrier identifier for cutlass::arch::NamedBarrier::sync().
template <
  int ThreadCount,
  int BarrierId
>
struct NamedBarrierSync {
  CUTLASS_DEVICE
  static void sync() {
    cutlass::arch::NamedBarrier::sync(ThreadCount, static_cast<arch::ReservedNamedBarriers>(BarrierId));
  }
};
74
+
75
+ } // namespace detail
76
+
77
+ /////////////////////////////////////////////////////////////////////////////////////////////////
78
+
79
+ /// Group or CTA-wide semaphore for inter-CTA synchronization.
80
/// Group or CTA-wide semaphore for inter-CTA synchronization.
///
/// Flags live in memory shared by the participating CTAs (lock_ptr is
/// interpreted as an array of int counters indexed by flag_idx). Thread 0 of
/// each group spin-waits or signals; all other threads rendezvous through
/// Sync::sync() (e.g. __syncthreads, __syncwarp, or a named barrier).
///
/// \tparam Sync policy type exposing a static `sync()` that barriers all
///         threads of the participating group
template <class Sync>
struct GenericBarrier {

public:

  /// Flag type
  using T = int;

  /// Initial flag value
  static const T INIT = 0;


protected:

  /// Load flag, as a strong acquire operation (int specialization)
  CUTLASS_DEVICE
  static int ld_acquire(int *ptr)
  {
    int state = 0;

#if (__CUDA_ARCH__ >= 700)
    /// SM70 and newer use memory consistency qualifiers

    // Acquire pattern using acquire modifier
    asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr));

#else
    asm volatile ("ld.cg.global.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr));
#endif // (__CUDA_ARCH__ >= 700)

    return state;
  }


  /// Reduce into flag, with release pattern (int specialization)
  CUTLASS_DEVICE
  static void red_release(int *ptr, int val)
  {
#if (__CUDA_ARCH__ >= 700)
    /// SM70 and newer use memory consistency qualifiers

    // Release pattern using acq_rel fence + relaxed modifier. (The fence also releases data
    // that was weakly-written by other threads prior to the last syncthreads)
    asm volatile ("fence.acq_rel.gpu;\n");
    asm volatile ("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(ptr), "r"(val));

#else
    __threadfence();
    atomicAdd(ptr, val);
#endif // (__CUDA_ARCH__ >= 700)
  }


public:

  /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter
  CUTLASS_DEVICE
  static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count)
  {
    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;

    if (thread_idx == 0)
    {
      // Spin-loop (`#pragma unroll 1` keeps the compiler from unrolling it)
      #pragma unroll 1
      while(ld_acquire(flag_ptr) < count) {}
    }

    // Remaining threads wait here until thread 0 observes the flag
    Sync::sync();
  }

  /// Uses thread[0] to wait until the flag counter equals `val`
  CUTLASS_DEVICE
  static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1)
  {
    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;

    if (thread_idx == 0)
    {
      // Spin-loop
      #pragma unroll 1
      while(ld_acquire(flag_ptr) != val) {}
    }
    Sync::sync();
  }

  /// Uses thread[0] to wait until the flag counter equals `val`, then
  /// atomically resets it to zero (compare-and-swap) for reuse
  CUTLASS_DEVICE
  static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
    T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;

    if (thread_idx == 0)
    {
      // Spin-loop
      #pragma unroll 1
      while(atomicCAS(flag_ptr, val, 0) != val) {}
    }

    Sync::sync();
  }

  /// Increment the arrival count for a flag
  CUTLASS_DEVICE
  static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx, int val = 1)
  {
    T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;

    // Barrier first so all group threads' prior writes are covered by the release
    Sync::sync();

    if (thread_idx == 0)
    {
      red_release(flag_ptr, val);
    }
  }


  /// Increment the arrival counts for a range of flags: thread i increments
  /// flag (first_flag_idx + i) for i in [0, count)
  CUTLASS_DEVICE
  static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1)
  {
    int flag_idx = first_flag_idx + thread_idx;
    T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;

    // Barrier to make sure all other threads in group have written their data
    Sync::sync();

    // Select threads increment their flags
    if (thread_idx < count) {
      red_release(flag_ptr, val);
    }
  }
};
212
+
213
+ using Barrier = GenericBarrier<detail::SyncthreadsSync>;
214
+
215
+ /////////////////////////////////////////////////////////////////////////////////////////////////
216
+
217
+ /** Structure for managing multiple NamedBarriers to be used by different warp groups, allowing
218
+ * runtime index values to be used to call into named barriers with compile-time-constant IDs.
219
+ *
220
+ * @param ThreadCount_ Number of threads that will wait on a NamedBarrier with a given ID
221
+ * @param Offset Value added to the ID passed in by the user to determine the NamedBarrier ID to call into
222
+ * @param MaxNumNamedBarriers The maximum number of unique barrier IDs that will be requested on this type
223
+ **/
224
/// Manages multiple NamedBarriers for different warp groups, translating a
/// runtime barrier index into calls on compile-time-constant barrier IDs.
/// Each public method dispatches over all MaxNumNamedBarriers candidate IDs
/// with a fold expression; only the branch whose Idx matches the runtime
/// index executes.
template <
  uint32_t ThreadCount_,
  uint32_t Offset = 0,
  uint32_t MaxNumNamedBarriers = 16
>
struct NamedBarrierManager {

  static_assert(MaxNumNamedBarriers <= arch::NamedBarrier::HardwareMaxNumNamedBarriers);
  static_assert(MaxNumNamedBarriers + Offset <= arch::NamedBarrier::HardwareMaxNumNamedBarriers, "Barrier IDs cannot exceed 15");

  // Number of threads participating in the barrier
  static constexpr uint32_t ThreadCount = ThreadCount_;

  /// GenericBarrier whose group synchronization uses named barrier `BarrierId`
  template <uint32_t BarrierId>
  using BarrierSync = cutlass::GenericBarrier<cutlass::detail::NamedBarrierSync<ThreadCount, BarrierId>>;

  // Underlying type used by all barriers for synchronization. Does not depend on
  // template parameter BarrierId, so passing in 0 suffices.
  using T = typename BarrierSync<0>::T;

  /// Sequence 0..MaxNumNamedBarriers-1 used to expand the dispatch folds
  using IntegerSequence = cute::make_integer_sequence<uint32_t, MaxNumNamedBarriers>;

  CUTLASS_DEVICE
  static
  void wait_lt(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count) {
    wait_lt_helper(idx, lock_ptr, thread_idx, flag_idx, count, IntegerSequence{});
  }

  CUTLASS_DEVICE
  static void
  wait_eq(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
    wait_eq_helper<false>(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
  }

  CUTLASS_DEVICE
  static void
  wait_eq_reset(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
    wait_eq_helper<true>(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
  }

  CUTLASS_DEVICE
  static void
  arrive_inc(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
    arrive_inc_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{});
  }

  CUTLASS_DEVICE
  static void
  arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
    arrive_range_inc_helper(idx, lock_ptr, thread_idx, first_flag_idx, count, val, IntegerSequence{});
  }

  private:
  /// Debug-build guard: the runtime index must name one of the managed barriers
  CUTLASS_DEVICE
  static void
  check_barrier_in_range([[maybe_unused]] uint32_t idx) {
    assert((idx < MaxNumNamedBarriers) && "Index exceeds barrier count");
  }

  // Each helper expands a fold over Idx = 0..MaxNumNamedBarriers-1; the
  // short-circuiting `||` stops after the branch where Idx == idx runs.

  template <uint32_t... Idx>
  CUTLASS_DEVICE
  static void
  wait_lt_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count, cute::integer_sequence<uint32_t, Idx...>) {
    check_barrier_in_range(idx);
    ((Idx == idx && (BarrierSync<Idx + Offset>::wait_lt(lock_ptr, thread_idx, flag_idx, count), true)) || ...);
  }

  template <bool Reset, uint32_t... Idx>
  CUTLASS_DEVICE
  static void
  wait_eq_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val, cute::integer_sequence<uint32_t, Idx...>) {
    check_barrier_in_range(idx);
    if constexpr (Reset) {
      ((Idx == idx && (BarrierSync<Idx + Offset>::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val), true)) || ...);
    }
    else {
      ((Idx == idx && (BarrierSync<Idx + Offset>::wait_eq(lock_ptr, thread_idx, flag_idx, val), true)) || ...);
    }
  }

  template <uint32_t... Idx>
  CUTLASS_DEVICE
  static void
  arrive_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val, cute::integer_sequence<uint32_t, Idx...>) {
    check_barrier_in_range(idx);
    ((Idx == idx && (BarrierSync<Idx + Offset>::arrive_inc(lock_ptr, thread_idx, flag_idx, val), true)) || ...);
  }

  template <uint32_t... Idx>
  CUTLASS_DEVICE
  static void
  arrive_range_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count, int val, cute::integer_sequence<uint32_t, Idx...>) {
    check_barrier_in_range(idx);
    ((Idx == idx && (BarrierSync<Idx + Offset>::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val), true)) || ...);
  }
};
320
+
321
+ /////////////////////////////////////////////////////////////////////////////////////////////////
322
+
323
+ /** Structure for synchronizing via contiguous barriers (e.g., __syncwarp, __syncthreads)
324
+ * via an API that mirrors that of NamedBarrierManager
325
+ *
326
+ * @param Synchronizer Synchronization helper exposing a `sync()` method to perform synchronization
327
+ **/
328
+ template <
329
+ class Synchronizer,
330
+ uint32_t ThreadCount_
331
+ >
332
+ struct SyncManager {
333
+
334
+ // Number of threads participating in the barrier
335
+ static constexpr uint32_t ThreadCount = ThreadCount_;
336
+
337
+ using BarrierSync = cutlass::GenericBarrier<Synchronizer>;
338
+
339
+ // Underlying type used by all barriers for synchronization.
340
+ using T = typename BarrierSync::T;
341
+
342
+ CUTLASS_DEVICE
343
+ static
344
+ void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) {
345
+ BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count);
346
+ }
347
+
348
+ CUTLASS_DEVICE
349
+ static void
350
+ wait_eq(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
351
+ BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val);
352
+ }
353
+
354
+ CUTLASS_DEVICE
355
+ static void
356
+ wait_eq_reset(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
357
+ BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val);
358
+ }
359
+
360
+ CUTLASS_DEVICE
361
+ static void
362
+ arrive_inc(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
363
+ BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val);
364
+ }
365
+
366
+ CUTLASS_DEVICE
367
+ static void
368
+ arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
369
+ BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val);
370
+ }
371
+ };
372
+
373
+ /////////////////////////////////////////////////////////////////////////////////////////////////
374
+
375
+ } // namespace cutlass
376
+
377
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/bfloat16.h ADDED
@@ -0,0 +1,679 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*!
32
+ \file
33
+ \brief Defines a proxy class for storing non-standard 16-bit floating point values with
34
+ 8 bits of exponent and 7 bit of mantissa.
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #if defined(__CUDACC_RTC__)
40
+ #include "cutlass/floating_point_nvrtc.h"
41
+ #else
42
+ #include <cmath>
43
+ #include <limits>
44
+ #include <cstdint>
45
+ #include <cstring>
46
+ #endif
47
+
48
+ #include <cuda_bf16.h>
49
+ #include "cutlass/cutlass.h"
50
+ #include "cutlass/platform/platform.h"
51
+
52
+ namespace cutlass {
53
+
54
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ /// Floating-point type with 8 bits of exponent and 7 bits of mantissa.
57
+ struct alignas(2) bfloat16_t {
58
+
59
+ //
60
+ // Data members
61
+ //
62
+
63
+ /// Storage type
64
+ uint16_t storage;
65
+
66
+ //
67
+ // Methods
68
+ //
69
+
70
+ /// Constructs from an unsigned short
71
+ CUTLASS_HOST_DEVICE
72
+ static bfloat16_t bitcast(uint16_t x) {
73
+ bfloat16_t h;
74
+ h.storage = x;
75
+ return h;
76
+ }
77
+
78
+ private:
79
+ struct from_32_bit_integer_t {};
80
+ static constexpr from_32_bit_integer_t from_32_bit_integer{};
81
+
82
+ template<class T>
83
+ CUTLASS_HOST_DEVICE
84
+ explicit bfloat16_t(from_32_bit_integer_t, T x) {
85
+ static_assert(cutlass::platform::is_integral<T>::value && sizeof(T) == 4, "Requires 32-bit integer");
86
+
87
+ float flt = static_cast<float>(x);
88
+ uint32_t bits;
89
+
90
+ #if defined(__CUDA_ARCH__)
91
+ bits = reinterpret_cast<uint32_t &>(flt);
92
+ #else
93
+ std::memcpy(&bits, &flt, sizeof(bits));
94
+ #endif
95
+
96
+ storage = uint16_t(bits >> 16);
97
+ }
98
+
99
+ public:
100
+ /// Default constructor
101
+ bfloat16_t() = default;
102
+
103
+ /// Reinterpret cast from CUDA's __nv_bfloat16 type
104
+ CUTLASS_HOST_DEVICE
105
+ explicit bfloat16_t(__nv_bfloat16 const & x) {
106
+ #if defined(__CUDA_ARCH__)
107
+ storage = reinterpret_cast<uint16_t const &>(x);
108
+ #else
109
+ __nv_bfloat16_raw raw(x);
110
+ std::memcpy(&storage, &raw.x, sizeof(storage));
111
+ #endif
112
+ }
113
+
114
+ /// Floating-point conversion - round toward nearest
115
+ CUTLASS_HOST_DEVICE
116
+ explicit bfloat16_t(float x) {
117
+
118
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
119
+
120
+ asm("cvt.rn.bf16.f32 %0, %1;\n" : "=h"(storage) : "f"(x));
121
+
122
+ #else
123
+ uint32_t bits;
124
+
125
+ #if defined(__CUDA_ARCH__)
126
+ bits = reinterpret_cast<uint32_t &>(x);
127
+ #else
128
+ std::memcpy(&bits, &x, sizeof(bits));
129
+ #endif
130
+
131
+ if ((bits & 0x7f800000) != 0x7f800000) {
132
+
133
+ bool mantissa_bit = ((bits & (1 << 16)) != 0);
134
+ bool round_bit = ((bits & (1 << 15)) != 0);
135
+ bool sticky_bit = ((bits & ((1 << 15) - 1)) != 0);
136
+
137
+ if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) {
138
+ bits += uint32_t(1 << 16);
139
+ }
140
+ }
141
+ else if (bits & ~0xff800000) {
142
+ bits = 0x7fffffff;
143
+ }
144
+
145
+ storage = uint16_t((bits >> 16) & 0xffff);
146
+ #endif
147
+ }
148
+
149
+ /// Floating-point conversion - round toward nearest
150
+ CUTLASS_HOST_DEVICE
151
+ explicit bfloat16_t(double x): bfloat16_t(float(x)) {
152
+
153
+ }
154
+
155
+ /// Integer conversion - round toward nearest
156
+ CUTLASS_HOST_DEVICE
157
+ explicit bfloat16_t(int x) : bfloat16_t(from_32_bit_integer, x) {}
158
+
159
+ CUTLASS_HOST_DEVICE
160
+ explicit bfloat16_t(uint32_t x) : bfloat16_t(from_32_bit_integer, x) {}
161
+
162
+ /// Converts to float
163
+ CUTLASS_HOST_DEVICE
164
+ operator float() const {
165
+ unsigned bits = (unsigned(storage) << 16);
166
+ #if defined(__CUDA_ARCH__)
167
+ return reinterpret_cast<float const &>(bits);
168
+ #else
169
+ float flt;
170
+ std::memcpy(&flt, &bits, sizeof(flt));
171
+ return flt;
172
+ #endif
173
+ }
174
+
175
+ /// Converts to float
176
+ CUTLASS_HOST_DEVICE
177
+ explicit operator double() const {
178
+ return double(float(*this));
179
+ }
180
+
181
+ /// Converts to int
182
+ CUTLASS_HOST_DEVICE
183
+ explicit operator int() const {
184
+ return int(float(*this));
185
+ }
186
+
187
+ /// Casts to bool
188
+ CUTLASS_HOST_DEVICE
189
+ explicit operator bool() const {
190
+ return (float(*this) != 0.0f);
191
+ }
192
+
193
+ /// Bitcasts to CUDA's bf16 type
194
+ CUTLASS_DEVICE
195
+ __nv_bfloat16 to_nv_bfloat16() const {
196
+ return reinterpret_cast<__nv_bfloat16 const &>(storage);
197
+ }
198
+
199
+ /// Obtains raw bits
200
+ CUTLASS_HOST_DEVICE
201
+ uint16_t raw() const {
202
+ return storage;
203
+ }
204
+ /// Returns the sign bit
205
+ CUTLASS_HOST_DEVICE
206
+ bool signbit() const {
207
+ return ((raw() & 0x8000) != 0);
208
+ }
209
+
210
+ /// Returns the biased exponent
211
+ CUTLASS_HOST_DEVICE
212
+ int exponent_biased() const {
213
+ return int((raw() >> 7) & 0x0ff);
214
+ }
215
+
216
+ /// Returns the unbiased exponent
217
+ CUTLASS_HOST_DEVICE
218
+ int exponent() const {
219
+ return exponent_biased() - 127;
220
+ }
221
+
222
+ /// Returns the mantissa
223
+ CUTLASS_HOST_DEVICE
224
+ int mantissa() const {
225
+ return int(raw() & 0x7f);
226
+ }
227
+ };
228
+
229
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
230
+
231
+ CUTLASS_HOST_DEVICE
232
+ bool signbit(cutlass::bfloat16_t const& h) {
233
+ return h.signbit();
234
+ }
235
+
236
+ CUTLASS_HOST_DEVICE
237
+ cutlass::bfloat16_t abs(cutlass::bfloat16_t const& h) {
238
+ return cutlass::bfloat16_t::bitcast(h.raw() & 0x7fff);
239
+ }
240
+
241
+ CUTLASS_HOST_DEVICE
242
+ bool isnan(cutlass::bfloat16_t const& h) {
243
+ return (h.exponent_biased() == 0x0ff) && h.mantissa();
244
+ }
245
+
246
+ CUTLASS_HOST_DEVICE
247
+ bool isfinite(cutlass::bfloat16_t const& h) {
248
+ return (h.exponent_biased() != 0x0ff);
249
+ }
250
+
251
+ CUTLASS_HOST_DEVICE
252
+ cutlass::bfloat16_t nan_bf16(const char*) {
253
+ // NVIDIA canonical NaN
254
+ return cutlass::bfloat16_t::bitcast(0x7fff);
255
+ }
256
+
257
+ CUTLASS_HOST_DEVICE
258
+ bool isinf(cutlass::bfloat16_t const& h) {
259
+ return (h.exponent_biased() == 0x0ff) && !h.mantissa();
260
+ }
261
+
262
+ CUTLASS_HOST_DEVICE
263
+ bool isnormal(cutlass::bfloat16_t const& h) {
264
+ return h.exponent_biased() && h.exponent_biased() != 0x0ff;
265
+ }
266
+
267
+ CUTLASS_HOST_DEVICE
268
+ int fpclassify(cutlass::bfloat16_t const& h) {
269
+ int exp = h.exponent_biased();
270
+ int mantissa = h.mantissa();
271
+ if (exp == 0x0ff) {
272
+ if (mantissa) {
273
+ return FP_NAN;
274
+ }
275
+ else {
276
+ return FP_INFINITE;
277
+ }
278
+ }
279
+ else if (!exp) {
280
+ if (mantissa) {
281
+ return FP_SUBNORMAL;
282
+ }
283
+ else {
284
+ return FP_ZERO;
285
+ }
286
+ }
287
+ return FP_NORMAL;
288
+ }
289
+
290
+ CUTLASS_HOST_DEVICE
291
+ cutlass::bfloat16_t sqrt(cutlass::bfloat16_t const& h) {
292
+ #if defined(__CUDACC_RTC__)
293
+ return cutlass::bfloat16_t(sqrtf(float(h)));
294
+ #else
295
+ return cutlass::bfloat16_t(std::sqrt(float(h)));
296
+ #endif
297
+ }
298
+
299
+ CUTLASS_HOST_DEVICE
300
+ bfloat16_t copysign(bfloat16_t const& a, bfloat16_t const& b) {
301
+
302
+ uint16_t a_bits;
303
+ uint16_t b_bits;
304
+
305
+ #if defined(__CUDA_ARCH__)
306
+ a_bits = reinterpret_cast<uint16_t const &>(a);
307
+ b_bits = reinterpret_cast<uint16_t const &>(b);
308
+ #else
309
+ std::memcpy(&a_bits, &a, sizeof(a_bits));
310
+ std::memcpy(&b_bits, &b, sizeof(b_bits));
311
+ #endif
312
+
313
+ uint16_t a_mag = (a_bits & 0x7fff);
314
+ uint16_t b_sign = (b_bits & 0x8000);
315
+ uint16_t result = (a_mag | b_sign);
316
+
317
+ return bfloat16_t::bitcast(result);
318
+ }
319
+
320
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
321
+
322
+ } // namespace cutlass
323
+
324
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
325
+ //
326
+ // Standard Library operations and definitions
327
+ //
328
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
329
+
330
+ #if !defined(__CUDACC_RTC__)
331
+ namespace std {
332
+
333
+ /// Numeric limits
334
+ template <>
335
+ struct numeric_limits<cutlass::bfloat16_t> {
336
+ static bool const is_specialized = true;
337
+ static bool const is_signed = true;
338
+ static bool const is_integer = false;
339
+ static bool const is_exact = false;
340
+ static bool const has_infinity = true;
341
+ static bool const has_quiet_NaN = true;
342
+ static bool const has_signaling_NaN = false;
343
+ static std::float_denorm_style const has_denorm = std::denorm_present;
344
+ static bool const has_denorm_loss = true;
345
+ static std::float_round_style const round_style = std::round_to_nearest;
346
+ static bool const is_iec559 = false;
347
+ static bool const is_bounded = true;
348
+ static bool const is_modulo = false;
349
+ static int const digits = 7;
350
+
351
+ /// Least positive value
352
+ CUTLASS_HOST_DEVICE
353
+ static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }
354
+
355
+ /// Minimum finite value
356
+ CUTLASS_HOST_DEVICE
357
+ static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }
358
+
359
+ /// Maximum finite value
360
+ CUTLASS_HOST_DEVICE
361
+ static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }
362
+
363
+ /// Returns smallest finite value
364
+ CUTLASS_HOST_DEVICE
365
+ static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }
366
+
367
+ /// Returns smallest finite value
368
+ CUTLASS_HOST_DEVICE
369
+ static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }
370
+
371
+ /// Returns smallest finite value
372
+ CUTLASS_HOST_DEVICE
373
+ static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }
374
+
375
+ /// Returns smallest finite value
376
+ CUTLASS_HOST_DEVICE
377
+ static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
378
+
379
+ /// Returns smallest finite value
380
+ CUTLASS_HOST_DEVICE
381
+ static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
382
+
383
+ /// Returns smallest finite value
384
+ CUTLASS_HOST_DEVICE
385
+ static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
386
+ };
387
+
388
+ } // namespace std
389
+ #endif
390
+
391
+ namespace cutlass {
392
+ namespace platform {
393
+
394
+ /// Forward Declaration
395
+ template <class T>
396
+ struct numeric_limits;
397
+
398
+ /// Numeric limits
399
+ template <>
400
+ struct numeric_limits<cutlass::bfloat16_t> {
401
+ static bool const is_specialized = true;
402
+ static bool const is_signed = true;
403
+ static bool const is_integer = false;
404
+ static bool const is_exact = false;
405
+ static bool const has_infinity = true;
406
+ static bool const has_quiet_NaN = true;
407
+ static bool const has_signaling_NaN = false;
408
+ #if !defined(__CUDACC_RTC__)
409
+ static std::float_denorm_style const has_denorm = std::denorm_present;
410
+ #endif
411
+ static bool const has_denorm_loss = true;
412
+ #if !defined(__CUDACC_RTC__)
413
+ static std::float_round_style const round_style = std::round_to_nearest;
414
+ #endif
415
+ static bool const is_iec559 = false;
416
+ static bool const is_bounded = true;
417
+ static bool const is_modulo = false;
418
+ static int const digits = 7;
419
+
420
+ /// Least positive value
421
+ CUTLASS_HOST_DEVICE
422
+ static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }
423
+
424
+ /// Minimum finite value
425
+ CUTLASS_HOST_DEVICE
426
+ static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }
427
+
428
+ /// Maximum finite value
429
+ CUTLASS_HOST_DEVICE
430
+ static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }
431
+
432
+ /// Returns smallest finite value
433
+ CUTLASS_HOST_DEVICE
434
+ static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }
435
+
436
+ /// Returns smallest finite value
437
+ CUTLASS_HOST_DEVICE
438
+ static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }
439
+
440
+ /// Returns smallest finite value
441
+ CUTLASS_HOST_DEVICE
442
+ static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }
443
+
444
+ /// Returns smallest finite value
445
+ CUTLASS_HOST_DEVICE
446
+ static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
447
+
448
+ /// Returns smallest finite value
449
+ CUTLASS_HOST_DEVICE
450
+ static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
451
+
452
+ /// Returns smallest finite value
453
+ CUTLASS_HOST_DEVICE
454
+ static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
455
+ };
456
+
457
+ } // namespace platform
458
+ } // namespace cutlass
459
+
460
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
461
+ //
462
+ // Arithmetic operators
463
+ //
464
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
465
+
466
+ namespace cutlass {
467
+
468
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
469
+
470
+ CUTLASS_HOST_DEVICE
471
+ bool operator==(bfloat16_t const& lhs, bfloat16_t const& rhs) {
472
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
473
+ return __heq(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
474
+ #else
475
+ return float(lhs) == float(rhs);
476
+ #endif
477
+ }
478
+
479
+ CUTLASS_HOST_DEVICE
480
+ bool operator!=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
481
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
482
+ return __hne(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
483
+ #else
484
+ return float(lhs) != float(rhs);
485
+ #endif
486
+ }
487
+
488
+ CUTLASS_HOST_DEVICE
489
+ bool operator<(bfloat16_t const& lhs, bfloat16_t const& rhs) {
490
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
491
+ return __hlt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
492
+ #else
493
+ return float(lhs) < float(rhs);
494
+ #endif
495
+ }
496
+
497
+ CUTLASS_HOST_DEVICE
498
+ bool operator<=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
499
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
500
+ return __hle(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
501
+ #else
502
+ return float(lhs) <= float(rhs);
503
+ #endif
504
+ }
505
+
506
+ CUTLASS_HOST_DEVICE
507
+ bool operator>(bfloat16_t const& lhs, bfloat16_t const& rhs) {
508
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
509
+ return __hgt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
510
+ #else
511
+ return float(lhs) > float(rhs);
512
+ #endif
513
+ }
514
+
515
+ CUTLASS_HOST_DEVICE
516
+ bool operator>=(bfloat16_t const& lhs, bfloat16_t const& rhs) {
517
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
518
+ return __hge(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16());
519
+ #else
520
+ return float(lhs) >= float(rhs);
521
+ #endif
522
+ }
523
+
524
+ CUTLASS_HOST_DEVICE
525
+ bfloat16_t operator+(bfloat16_t const& lhs, bfloat16_t const& rhs) {
526
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
527
+ return bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
528
+ #else
529
+ return bfloat16_t(float(lhs) + float(rhs));
530
+ #endif
531
+ }
532
+
533
+ CUTLASS_HOST_DEVICE
534
+ bfloat16_t operator-(bfloat16_t const& lhs) {
535
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
536
+ return bfloat16_t(__hneg(lhs.to_nv_bfloat16()));
537
+ #else
538
+ return bfloat16_t(-float(lhs));
539
+ #endif
540
+ }
541
+
542
+ CUTLASS_HOST_DEVICE
543
+ bfloat16_t operator-(bfloat16_t const& lhs, bfloat16_t const& rhs) {
544
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
545
+ return bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
546
+ #else
547
+ return bfloat16_t(float(lhs) - float(rhs));
548
+ #endif
549
+ }
550
+
551
+ CUTLASS_HOST_DEVICE
552
+ bfloat16_t operator*(bfloat16_t const& lhs, bfloat16_t const& rhs) {
553
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
554
+ return bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
555
+ #else
556
+ return bfloat16_t(float(lhs) * float(rhs));
557
+ #endif
558
+ }
559
+
560
+ CUTLASS_HOST_DEVICE
561
+ bfloat16_t operator/(bfloat16_t const& lhs, bfloat16_t const& rhs) {
562
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
563
+ return bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
564
+ #else
565
+ return bfloat16_t(float(lhs) / float(rhs));
566
+ #endif
567
+ }
568
+
569
+ CUTLASS_HOST_DEVICE
570
+ bfloat16_t& operator+=(bfloat16_t & lhs, bfloat16_t const& rhs) {
571
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
572
+ lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
573
+ #else
574
+ lhs = bfloat16_t(float(lhs) + float(rhs));
575
+ #endif
576
+ return lhs;
577
+ }
578
+
579
+ CUTLASS_HOST_DEVICE
580
+ bfloat16_t& operator-=(bfloat16_t & lhs, bfloat16_t const& rhs) {
581
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
582
+ lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
583
+ #else
584
+ lhs = bfloat16_t(float(lhs) - float(rhs));
585
+ #endif
586
+ return lhs;
587
+ }
588
+
589
+ CUTLASS_HOST_DEVICE
590
+ bfloat16_t& operator*=(bfloat16_t & lhs, bfloat16_t const& rhs) {
591
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
592
+ lhs = bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
593
+ #else
594
+ lhs = bfloat16_t(float(lhs) * float(rhs));
595
+ #endif
596
+ return lhs;
597
+ }
598
+
599
+ CUTLASS_HOST_DEVICE
600
+ bfloat16_t& operator/=(bfloat16_t & lhs, bfloat16_t const& rhs) {
601
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
602
+ lhs = bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()));
603
+ #else
604
+ lhs = bfloat16_t(float(lhs) / float(rhs));
605
+ #endif
606
+ return lhs;
607
+ }
608
+
609
+ CUTLASS_HOST_DEVICE
610
+ bfloat16_t& operator++(bfloat16_t & lhs) {
611
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
612
+ lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
613
+ #else
614
+ float tmp(lhs);
615
+ ++tmp;
616
+ lhs = bfloat16_t(tmp);
617
+ #endif
618
+ return lhs;
619
+ }
620
+
621
+ CUTLASS_HOST_DEVICE
622
+ bfloat16_t& operator--(bfloat16_t & lhs) {
623
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
624
+ lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
625
+ #else
626
+ float tmp(lhs);
627
+ --tmp;
628
+ lhs = bfloat16_t(tmp);
629
+ #endif
630
+ return lhs;
631
+ }
632
+
633
+ CUTLASS_HOST_DEVICE
634
+ bfloat16_t operator++(bfloat16_t & lhs, int) {
635
+ bfloat16_t ret(lhs);
636
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
637
+ lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
638
+ #else
639
+ float tmp(lhs);
640
+ tmp++;
641
+ lhs = bfloat16_t(tmp);
642
+ #endif
643
+ return ret;
644
+ }
645
+
646
+ CUTLASS_HOST_DEVICE
647
+ bfloat16_t operator--(bfloat16_t & lhs, int) {
648
+ bfloat16_t ret(lhs);
649
+ #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
650
+ lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16()));
651
+ #else
652
+ float tmp(lhs);
653
+ tmp--;
654
+ lhs = bfloat16_t(tmp);
655
+ #endif
656
+ return ret;
657
+ }
658
+
659
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
660
+
661
+ } // namespace cutlass
662
+
663
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
664
+
665
+ //
666
+ // User-defined literals
667
+ //
668
+
669
+ CUTLASS_HOST_DEVICE
670
+ cutlass::bfloat16_t operator "" _bf16(long double x) {
671
+ return cutlass::bfloat16_t(float(x));
672
+ }
673
+
674
+ CUTLASS_HOST_DEVICE
675
+ cutlass::bfloat16_t operator "" _bf16(unsigned long long int x) {
676
+ return cutlass::bfloat16_t(int(x));
677
+ }
678
+
679
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3.h ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief Basic include for CUTLASS BLAS3/HPC code.
34
+
35
+
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/cutlass.h"
41
+ #include "cutlass/array.h"
42
+ #include "cutlass/blas3_types.h"
43
+ #include "cutlass/coord.h"
44
+ #include "cutlass/complex.h"
45
+ #include "cutlass/functional.h"
46
+ #include "cutlass/numeric_types.h"
47
+
48
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ namespace cutlass {
51
+
52
+ /////////////////////////////////////////////////////////////////////////////////////////////////
53
+
54
+ /// Defines FillMode inversions
55
+ template <FillMode kFillMode>
56
+ struct InvertFillMode;
57
+
58
+ /// Invert FillMode lower to upper
59
+ template <>
60
+ struct InvertFillMode<FillMode::kLower> {
61
+ static FillMode const mode = FillMode::kUpper;
62
+ };
63
+
64
+ /// Invert FillMode upper to lower
65
+ template <>
66
+ struct InvertFillMode<FillMode::kUpper> {
67
+ static FillMode const mode = FillMode::kLower;
68
+ };
69
+
70
+ /////////////////////////////////////////////////////////////////////////////////////////////////
71
+ /// Defines SideMode inversions
72
+ template <SideMode kSideMode>
73
+ struct InvertSideMode;
74
+
75
+ /// Invert SideMode left to right
76
+ template <>
77
+ struct InvertSideMode<SideMode::kLeft> {
78
+ static SideMode const mode = SideMode::kRight;
79
+ };
80
+
81
+ /// Invert SideMode right to left
82
+ template <>
83
+ struct InvertSideMode<SideMode::kRight> {
84
+ static SideMode const mode = SideMode::kLeft;
85
+ };
86
+
87
+ /////////////////////////////////////////////////////////////////////////////////////////////////
88
+ /// Defines correct compare operation for Triangular matrix boundary
89
+ template <FillMode kFillMode, DiagType kDiagType = DiagType::kNonUnit>
90
+ struct TrMatrixCompareOp {
91
+ using Index = int32_t;
92
+ using Type = typename platform::conditional<
93
+ (kFillMode == FillMode::kLower),
94
+ greater_equal<Index>,
95
+ less_equal<Index>>::type;
96
+ };
97
+
98
+ template <FillMode kFillMode>
99
+ struct TrMatrixCompareOp <kFillMode, DiagType::kUnit> {
100
+ using Index = int32_t;
101
+ using Type = typename platform::conditional<
102
+ (kFillMode == FillMode::kLower),
103
+ greater_equal<Index>,
104
+ less_equal<Index>>::type;
105
+ };
106
+
107
+ template <FillMode kFillMode>
108
+ struct TrMatrixCompareOp <kFillMode, DiagType::kZero> {
109
+ using Index = int32_t;
110
+ using Type = typename platform::conditional<
111
+ (kFillMode == FillMode::kLower),
112
+ greater<Index>,
113
+ less<Index>>::type;
114
+ };
115
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
116
+ // Returns precision in terms of bits (based on datatype) to fill tensors with.
117
+ // Defaults to 5 bits of mantissa for TF32 and FP32 (with implicit round-offs).
118
+ // Also defines acceptable mantissa result variance/error.
119
+ template <typename Element>
120
+ struct MantissaInBits {
121
+ static int constexpr bits = 5;
122
+ static double constexpr error = 1.0e-7;
123
+ };
124
+
125
+ // Full precision is supported for FP64
126
+ template <>
127
+ struct MantissaInBits<double> {
128
+ static int constexpr bits = 30;
129
+ static double constexpr error = 1.0e-15;
130
+ };
131
+
132
+ template <>
133
+ struct MantissaInBits<cutlass::complex<double>> {
134
+ static int constexpr bits = 30;
135
+ static double constexpr error = 1.0e-14;
136
+ };
137
+
138
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
139
+
140
+ } // namespace cutlass
141
+
142
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
143
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3_types.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ /////////////////////////////////////////////////////////////////////////////////////////////////
35
+
36
+ namespace cutlass {
37
+
38
+ /////////////////////////////////////////////////////////////////////////////////////////////////
39
+
40
+ /// Enumerated type describing the type of kernel (based on input or output matrices).
41
+ enum class BlasMode {
42
+ kGemm,
43
+ kSymmetric,
44
+ kHermitian,
45
+ kTriangular,
46
+ kInvalid
47
+ };
48
+
49
+ /// Enumerated type describing the fill mode for matrices for BLAS functions.
50
+ enum class FillMode {
51
+ kFull, /// The entire tensor is covered.
52
+ kLower, /// The 'lower' part of a tensor is covered including diagonal
53
+ kUpper, /// The 'upper' part of a tensor is covered including diaognal
54
+ kDiagonal, /// Only diagonal elements are covered.
55
+ kNone, /// No element is covered.
56
+ kInvalid
57
+ };
58
+
59
+ /// Enumerated type describing the diagonal property of matrices for BLAS functions.
60
+ enum class DiagType {
61
+ kNonUnit,
62
+ kUnit,
63
+ kZero, // Only used internally for computing SYMM/HEMM
64
+ kInvalid
65
+ };
66
+
67
+ /// Enumerated type describing the side dense matrix is in matrix equation for BLAS functions.
68
+ enum class SideMode {
69
+ kLeft,
70
+ kRight,
71
+ kInvalid
72
+ };
73
+
74
+ /////////////////////////////////////////////////////////////////////////////////////////////////
75
+
76
+ } // namespace cutlass
77
+
78
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/block_striped.h ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief Utilities for performing block-striped access (load, store, reduce) of trivially-copyable,
33
+ statically-sized array types to global memory.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/array.h"
40
+ #include "cutlass/wmma_array.h"
41
+ #include "cutlass/functional.h"
42
+ #include "cutlass/complex.h"
43
+
44
+ namespace cutlass {
45
+
46
+ /////////////////////////////////////////////////////////////////////////////////////////////////
47
+ // AccessWidth
48
+ /////////////////////////////////////////////////////////////////////////////////////////////////
49
+
50
+ /// Computes the maximal power-of-two that evenly divides the size of T, capped at Limit
51
+ template <
52
+ typename T,
53
+ int Limit>
54
+ struct AccessWidth
55
+ {
56
+ // Inductive case
57
+ template <
58
+ int ObjectBytes, /// Size of T in bytes
59
+ int AlignBytes, /// Template induction variable
60
+ bool IsAligned = /// Whether ObjectBytes is an even multiple of AlignBytes
61
+ ((AlignBytes <= Limit) && (ObjectBytes % AlignBytes == 0))>
62
+ struct Detail
63
+ {
64
+ static const int value = Detail<ObjectBytes, AlignBytes * 2>::value;
65
+ };
66
+
67
+ // Base case (ObjectBytes is not an even multiple of AlignBytes)
68
+ template <
69
+ int ObjectBytes, /// Size of T in bytes
70
+ int AlignBytes> /// Template induction variable
71
+ struct Detail<ObjectBytes, AlignBytes, false>
72
+ {
73
+ static const int value = AlignBytes / 2;
74
+ };
75
+
76
+ /// The maximal power-of-two that evenly divides the size of T
77
+ static const int value = Detail<
78
+ (int) sizeof(T),
79
+ 1>::value;
80
+ };
81
+
82
+
83
+
84
+ /////////////////////////////////////////////////////////////////////////////////////////////////
85
+ // StripedAccessType
86
+ /////////////////////////////////////////////////////////////////////////////////////////////////
87
+
88
+ /// ReinterpretCast type for striping a trivially-copyable type in global memory
89
+ /// (Default specialization. Striping granularity is type T.)
90
+ template <
91
+ typename T, /// Data type
92
+ int TransferBytes = /// Data access width (16 byte max for global memory access on current architectures)
93
+ AccessWidth<T, 16>::value>
94
+ struct alignas(TransferBytes) StripedAccessType : public T
95
+ {};
96
+
97
+
98
+ /// ReinterpretCast type for striping a trivially-copyable type in global memory
99
+ /// (Specialization for cutlass::Array<T>. Striping granularity is a multiple of T.)
100
+ template <
101
+ typename T, /// Array element type
102
+ int N, /// Number of elements in array
103
+ bool RegisterSized, /// T is register-sized
104
+ int TransferBytes> /// Data access width
105
+ struct StripedAccessType<
106
+ Array<T, N, RegisterSized>,
107
+ TransferBytes>
108
+ : public AlignedArray<
109
+ T, // Element type of StripedAccessType
110
+ __NV_STD_MAX(1, TransferBytes / (int) sizeof(T)), // Number of elements T in StripedAccessType
111
+ TransferBytes> // Alignment of StripedAccessType
112
+ {};
113
+
114
+
115
+ #if defined(CUTLASS_ARCH_WMMA_ENABLED)
116
+
117
+ /// ReinterpretCast type for striping a trivially-copyable type in global memory
118
+ /// (Specialization for cutlass::WmmaFragmentArray<T>. Striping granularity is a multiple of T.)
119
+ template<
120
+ typename Use,
121
+ int m,
122
+ int n,
123
+ int k,
124
+ typename ElementT,
125
+ typename Layout,
126
+ int kFragments,
127
+ int TransferBytes>
128
+ struct StripedAccessType<
129
+ WmmaFragmentArray<nvcuda::wmma::fragment<Use, m, n, k, ElementT, Layout>, kFragments>,
130
+ TransferBytes>
131
+ : public AlignedArray<
132
+ ElementT,
133
+ __NV_STD_MAX(1, TransferBytes / (int) sizeof(ElementT)),
134
+ TransferBytes>
135
+ {};
136
+
137
+ #endif // if defined(CUTLASS_ARCH_WMMA_ENABLED)
138
+
139
+
140
+ /////////////////////////////////////////////////////////////////////////////////////////////////
141
+ // BlockStriped
142
+ /////////////////////////////////////////////////////////////////////////////////////////////////
143
+
144
+ /// Utility for performing block-striped access (load, store) of trivially-copyable,
145
+ /// statically-sized array types to global memory
146
+ template <
147
+ int BlockThreads,
148
+ typename ArrayT,
149
+ typename AccessT = StripedAccessType<ArrayT> >
150
+ struct BlockStriped
151
+ {
152
+ /// Number of striped accesses
153
+ static const int kStripes = int(sizeof(ArrayT) / sizeof(AccessT));
154
+ static_assert(kStripes > 0, "AccessT type must be smaller than or equal to ArrayT type");
155
+
156
+ /// Load
157
+ CUTLASS_DEVICE
158
+ static void load(ArrayT &data, ArrayT *ptr, int thread_idx)
159
+ {
160
+ AccessT *access_input = reinterpret_cast<AccessT*>(ptr);
161
+ AccessT *access_data = reinterpret_cast<AccessT*>(&data);
162
+
163
+ CUTLASS_PRAGMA_UNROLL
164
+ for (int i = 0; i < kStripes; ++i) {
165
+ access_data[i] = access_input[(BlockThreads * i) + thread_idx];
166
+ }
167
+ }
168
+
169
+ /// Load & Add
170
+ CUTLASS_DEVICE
171
+ static void load_add(ArrayT &data, ArrayT *ptr, int thread_idx)
172
+ {
173
+ AccessT *access_input = reinterpret_cast<AccessT*>(ptr);
174
+ AccessT *access_data = reinterpret_cast<AccessT*>(&data);
175
+
176
+ plus<AccessT> add;
177
+
178
+ CUTLASS_PRAGMA_UNROLL
179
+ for (int i = 0; i < kStripes; ++i)
180
+ {
181
+ access_data[i] = add(access_data[i], access_input[(BlockThreads * i) + thread_idx]);
182
+ }
183
+ }
184
+
185
+ /// Store
186
+ CUTLASS_DEVICE
187
+ static void store(ArrayT *ptr, const ArrayT &data, int thread_idx)
188
+ {
189
+ AccessT *access_output = reinterpret_cast<AccessT*>(ptr);
190
+ const AccessT *access_data = reinterpret_cast<const AccessT*>(&data);
191
+
192
+ CUTLASS_PRAGMA_UNROLL
193
+ for (int i = 0; i < kStripes; ++i) {
194
+ access_output[(BlockThreads * i) + thread_idx] = access_data[i];
195
+ }
196
+ }
197
+
198
+ };
199
+
200
+
201
+ /////////////////////////////////////////////////////////////////////////////////////////////////
202
+ // BlockStripedReduce
203
+ /////////////////////////////////////////////////////////////////////////////////////////////////
204
+
205
+
206
+ /// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
207
+ /// statically-sized array types to global memory.
208
+ /// (Default specialization)
209
+ template <
210
+ int BlockThreads,
211
+ typename ArrayT,
212
+ typename ElementT = typename StripedAccessType<ArrayT>::Element>
213
+ struct BlockStripedReduce :
214
+ BlockStriped<
215
+ BlockThreads,
216
+ ArrayT,
217
+ ElementT>
218
+ {
219
+ /// Reduce
220
+ CUTLASS_DEVICE
221
+ static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
222
+ {
223
+ cutlass::atomic_add<ElementT> reduce;
224
+ ElementT *access_output = reinterpret_cast<ElementT*>(ptr);
225
+ const ElementT *access_data = reinterpret_cast<const ElementT*>(&data);
226
+
227
+ CUTLASS_PRAGMA_UNROLL
228
+ for (int i = 0; i < BlockStripedReduce::kStripes; ++i) {
229
+ reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
230
+ }
231
+ }
232
+ };
233
+
234
+
235
+ /// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
236
+ /// statically-sized array types to global memory.
237
+ /// (Specialization for half_t. Uses half2 vectorized-reduction.)
238
+ template <
239
+ int BlockThreads,
240
+ typename ArrayT>
241
+ struct BlockStripedReduce<BlockThreads, ArrayT, half_t> :
242
+ BlockStriped<
243
+ BlockThreads,
244
+ ArrayT,
245
+ half2>
246
+ {
247
+ static_assert(BlockStripedReduce::kStripes % 2 == 0, "Array of half must be even number in length");
248
+
249
+ /// Reduce
250
+ CUTLASS_DEVICE
251
+ static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
252
+ {
253
+ cutlass::atomic_add<half2> reduce;
254
+ half2 *access_output = reinterpret_cast<half2*>(ptr);
255
+ const half2 *access_data = reinterpret_cast<const half2*>(&data);
256
+
257
+ CUTLASS_PRAGMA_UNROLL
258
+ for (int i = 0; i < BlockStripedReduce::kStripes; ++i)
259
+ {
260
+ reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
261
+ }
262
+ }
263
+ };
264
+
265
+
266
+ } // namespace cutlass
267
+
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cluster_launch.hpp ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief CUDA interfaces to launch CUTLASS device-level operators (for >= SM90) that use thread-block clusters.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include <cuda_runtime_api.h>
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/trace.h"
41
+ #include <cute/arch/cluster_sm100.hpp>
42
+ #include "cutlass/arch/synclog.hpp"
43
+
44
+ #if defined(__CUDACC_RTC__)
45
+ #include CUDA_STD_HEADER(type_traits)
46
+ #else
47
+ #include <type_traits>
48
+ #include <cstdio>
49
+ #endif
50
+
51
+ #if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8)))
52
+ # define CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED
53
+ #endif
54
+
55
+ #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8))
56
+ # define CUDA_ENABLE_PREFERRED_CLUSTER
57
+ #endif
58
+ namespace cutlass {
59
+
60
+ #ifndef NDEBUG
61
+ #define Return_Status(cudaError_t_status) \
62
+ if (cudaError_t_status != cudaSuccess) { \
63
+ fprintf(stderr, \
64
+ "[ ERROR: CUDA Runtime ] %s:%d: %s\n", \
65
+ __FILE__, \
66
+ __LINE__, \
67
+ cudaGetErrorString(cudaError_t_status)); \
68
+ return Status::kInvalid; \
69
+ } else { \
70
+ return Status::kSuccess; \
71
+ }
72
+ #else
73
+ #define Return_Status(cudaError_t_status) \
74
+ if (cudaError_t_status != cudaSuccess) { \
75
+ return Status::kInvalid; \
76
+ } else { \
77
+ return Status::kSuccess; \
78
+ }
79
+ #endif
80
+
81
+ struct ClusterLauncher {
82
+ constexpr static int MaxClusterSize = 32;
83
+
84
+ struct LaunchConfig {
85
+ #if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
86
+ cudaLaunchConfig_t launch_config;
87
+
88
+ #if defined(CUDA_ENABLE_PREFERRED_CLUSTER)
89
+ constexpr static int numAttrs = 3;
90
+ #else
91
+
92
+ constexpr static int numAttrs = 2;
93
+ #endif
94
+ cudaLaunchAttribute launch_attribute[numAttrs];
95
+ // Commonly used utility functions
96
+ dim3 gridDim() { return launch_config.gridDim; }
97
+ dim3 blockDim() { return launch_config.blockDim; }
98
+ #endif
99
+ };
100
+
101
+ // Check for hardware compatibility
102
+ static inline CUTLASS_HOST
103
+ Status check_cluster_dims(dim3 grid, dim3 cluster) {
104
+ if (((cluster.x * cluster.y * cluster.z) <= MaxClusterSize) &&
105
+ (grid.x % cluster.x == 0) && (grid.y % cluster.y == 0) && (grid.z % cluster.z == 0)) {
106
+ return Status::kSuccess;
107
+ }
108
+ else {
109
+ CUTLASS_TRACE_HOST("ClusterLauncher: Invalid cluster configuration -- aborting launch.");
110
+ return Status::kInvalid;
111
+ }
112
+ }
113
+
114
+ static inline CUTLASS_HOST
115
+ Status
116
+ #if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
117
+ init(void const* kernel_function)
118
+ #else
119
+ init(void const* /* kernel_function */)
120
+ #endif
121
+ {
122
+ #if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
123
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
124
+ if (kernel_function == nullptr) {
125
+ CUTLASS_TRACE_HOST("kernel_function is null");
126
+ return Status::kInvalid;
127
+ }
128
+ CUTLASS_TRACE_HOST("Checking previous error state before calling cudaFuncSetAttribute");
129
+ cudaError_t prevStatus = cudaGetLastError();
130
+ if (prevStatus != cudaSuccess) {
131
+ fprintf(stderr,
132
+ "[ ERROR: CUDA Runtime ] %s:%d: %s\n",
133
+ __FILE__,
134
+ __LINE__,
135
+ cudaGetErrorString(prevStatus));
136
+ return Status::kInvalid;
137
+ }
138
+ CUTLASS_TRACE_HOST("Calling cudaFuncSetAttribute");
139
+ #endif
140
+ // This attribute was added in CUDA 11.8.
141
+ cudaError_t status =
142
+ cudaFuncSetAttribute(
143
+ kernel_function, cudaFuncAttributeNonPortableClusterSizeAllowed, 1);
144
+ Return_Status(status);
145
+ #else
146
+ return Status::kInvalid;
147
+ #endif
148
+ }
149
+
150
+ static inline CUTLASS_HOST
151
+ LaunchConfig make_cluster_launch_config(
152
+ dim3 const grid_dims,
153
+ dim3 const cluster_dims,
154
+ dim3 const block_dims,
155
+ size_t const smem_size = 0,
156
+ cudaStream_t cuda_stream = 0,
157
+ bool launch_with_pdl = false
158
+ , dim3 const fallback_cluster_dims = {0, 0, 0}
159
+ ) {
160
+ LaunchConfig cluster_launch_config;
161
+ #if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
162
+ auto &launch_config = cluster_launch_config.launch_config;
163
+ auto &launch_attribute = cluster_launch_config.launch_attribute;
164
+ auto numAttrs = cluster_launch_config.numAttrs;
165
+
166
+ launch_attribute[0].id = cudaLaunchAttributeClusterDimension;
167
+
168
+ bool have_fallback = fallback_cluster_dims.x * fallback_cluster_dims.y * fallback_cluster_dims.z > 0;
169
+
170
+ if (have_fallback) {
171
+ launch_attribute[0].val.clusterDim = {fallback_cluster_dims.x, fallback_cluster_dims.y, fallback_cluster_dims.z};
172
+ CUTLASS_TRACE_HOST("ClusterLauncher: Setting fallback ClusterDims = "
173
+ "(" << fallback_cluster_dims.x << ", " << fallback_cluster_dims.y << ", " << fallback_cluster_dims.z << ")\n");
174
+ }
175
+ else {
176
+
177
+ launch_attribute[0].val.clusterDim = {cluster_dims.x, cluster_dims.y, cluster_dims.z};
178
+ CUTLASS_TRACE_HOST("ClusterLauncher: Setting ClusterDims = "
179
+ "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n");
180
+
181
+ }
182
+
183
+ #if defined(CUDA_ENABLE_PREFERRED_CLUSTER)
184
+ if (have_fallback) {
185
+ if (cute::initialize_preferred_cluster_launch(nullptr, grid_dims, cluster_dims, fallback_cluster_dims)) {
186
+ launch_attribute[1].id = cudaLaunchAttributePreferredClusterDimension;
187
+ launch_attribute[1].val.preferredClusterDim = {cluster_dims.x, cluster_dims.y, cluster_dims.z};
188
+ CUTLASS_TRACE_HOST("ClusterLauncher: Setting preferred ClusterDims = "
189
+ "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n");
190
+ }
191
+ }
192
+ else {
193
+ numAttrs--;
194
+ }
195
+ #endif
196
+
197
+
198
+ // PDL attributes
199
+ launch_attribute[numAttrs - 1].id = cudaLaunchAttributeProgrammaticStreamSerialization;
200
+ launch_attribute[numAttrs - 1].val.programmaticStreamSerializationAllowed = 1;
201
+
202
+ launch_config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z};
203
+ launch_config.blockDim = {block_dims.x, block_dims.y, block_dims.z};
204
+ launch_config.dynamicSmemBytes = smem_size;
205
+ launch_config.stream = cuda_stream;
206
+ launch_config.numAttrs = launch_with_pdl ? numAttrs : numAttrs - 1;
207
+ launch_config.attrs = launch_attribute;
208
+ return cluster_launch_config;
209
+ #else
210
+ CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch.");
211
+ return cluster_launch_config;
212
+ #endif
213
+ }
214
+
215
+ // This is the method we expect to use going forward
216
+ static inline CUTLASS_HOST
217
+ Status launch(
218
+ dim3 const grid_dims,
219
+ dim3 const cluster_dims,
220
+ dim3 const block_dims,
221
+ size_t const smem_size,
222
+ cudaStream_t cuda_stream,
223
+ void const* kernel,
224
+ void** kernel_params,
225
+ bool launch_with_pdl = false) {
226
+ #if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
227
+ LaunchConfig cluster_launch_config = make_cluster_launch_config(grid_dims, cluster_dims,
228
+ block_dims, smem_size, cuda_stream, launch_with_pdl);
229
+
230
+ auto launch_grid_dims = cluster_launch_config.gridDim();
231
+ if (check_cluster_dims(launch_grid_dims, cluster_dims) != Status::kSuccess) {
232
+ CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting.");
233
+ return Status::kInvalid;
234
+ }
235
+
236
+ auto init_status = init(kernel);
237
+ if (init_status != Status::kSuccess) {
238
+ CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". Aborting.");
239
+ return Status::kInvalid;
240
+ }
241
+
242
+ CUTLASS_TRACE_HOST("ClusterLauncher: Launching GridDims = "
243
+ "(" << launch_grid_dims.x << ", " << launch_grid_dims.y << ", " << launch_grid_dims.z << "), "
244
+ "And ClusterDims = "
245
+ "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n");
246
+
247
+ cutlass::arch::synclog_setup();
248
+ cudaError_t status = cudaLaunchKernelExC(&cluster_launch_config.launch_config, kernel, kernel_params);
249
+ Return_Status(status);
250
+ #else
251
+ CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch.");
252
+ return Status::kInvalid;
253
+ #endif
254
+ }
255
+
256
+
257
+ // This is the method we expect to use going forward
258
+ // Launch a preferred cluster grid
259
+ static inline CUTLASS_HOST
260
+ Status launch_with_fallback_cluster(
261
+ dim3 const grid_dims,
262
+ dim3 const preferred_cluster_dims,
263
+ dim3 const fallback_cluster_dims,
264
+ dim3 const block_dims,
265
+ size_t const smem_size,
266
+ cudaStream_t cuda_stream,
267
+ void const* kernel,
268
+ void** kernel_params,
269
+ bool launch_with_pdl = false) {
270
+ #if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
271
+ LaunchConfig cluster_launch_config = make_cluster_launch_config(grid_dims, preferred_cluster_dims,
272
+ block_dims, smem_size, cuda_stream, launch_with_pdl, fallback_cluster_dims);
273
+
274
+ auto launch_grid_dims = cluster_launch_config.gridDim();
275
+ if (check_cluster_dims(launch_grid_dims, preferred_cluster_dims) != Status::kSuccess) {
276
+ CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting.");
277
+ return Status::kInvalid;
278
+ }
279
+
280
+ auto init_status = init(kernel);
281
+ if (init_status != Status::kSuccess) {
282
+ CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". Aborting.");
283
+ return Status::kInvalid;
284
+ }
285
+
286
+ CUTLASS_TRACE_HOST("ClusterLauncher: Launching \n\tGridDims = "
287
+ "(" << launch_grid_dims.x << ", " << launch_grid_dims.y << ", " << launch_grid_dims.z << "), "
288
+ "\n\tPreferred ClusterDims = "
289
+ "(" << preferred_cluster_dims.x << ", " << preferred_cluster_dims.y << ", " << preferred_cluster_dims.z << "),"
290
+ "\n\tFallback ClusterDims = "
291
+ "(" << fallback_cluster_dims.x << ", " << fallback_cluster_dims.y << ", " << fallback_cluster_dims.z << ")\n");
292
+
293
+ cutlass::arch::synclog_setup();
294
+ cudaError_t status = cudaLaunchKernelExC(&cluster_launch_config.launch_config, kernel, kernel_params);
295
+ Return_Status(status);
296
+ #else
297
+ CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch.");
298
+ return Status::kInvalid;
299
+ #endif
300
+ }
301
+
302
+
303
+ };
304
+
305
+ namespace detail {
306
+
307
+ template<class Arg>
308
+ void* checked_addressof(Arg&& arg) {
309
+ static_assert(! std::is_rvalue_reference_v<Arg> || ! std::is_const_v<Arg>, "You cannot take the address of a const rvalue reference (const T&&).");
310
+ // We use std::addressof to ensure we get the address,
311
+ // in case the type has an overloaded operator&.
312
+ // Note that this precludes `const T&&` references.
313
+ return const_cast<void*>(reinterpret_cast<void const*>(std::addressof(arg)));
314
+ }
315
+
316
+ } // namespace detail
317
+
318
+ //! Parameters for launch_on_cluster (see below).
319
+ struct ClusterLaunchParams {
320
+ //! Grid dimensions
321
+ dim3 grid_dims{1, 1, 1};
322
+
323
+ //! Block dimensions
324
+ dim3 block_dims{1, 1, 1};
325
+
326
+ //! Cluster dimensions
327
+ dim3 cluster_dims{1, 1, 1};
328
+
329
+ //! Number of bytes required for the kernel's shared memory.
330
+ int smem_size_in_bytes = 0;
331
+
332
+ //! CUDA stream on which to launch the kernel.
333
+ cudaStream_t cuda_stream = nullptr;
334
+ };
335
+
336
+ /// @brief Launch the kernel on the stream using cluster launch.
337
+ ///
338
+ /// @param params Cluster launch parameters (see above).
339
+ /// @param kernel_ptr Pointer to the kernel function (see example).
340
+ /// @param args Zero or more arguments to pass to the kernel.
341
+ ///
342
+ /// @tparam Args Types of the arguments passed to the kernel.
343
+ /// Don't specify this/these template argument(s) explicitly.
344
+ ///
345
+ /// @return Status::Success on success, else an error code.
346
+ ///
347
+ /// @code
348
+ /// template<class SharedMemoryType, class A, class B, class C>
349
+ /// __global__ void kernel(A a, B b, C c);
350
+ ///
351
+ /// X x = get_x();
352
+ /// Y y = get_y();
353
+ /// Z z = get_z();
354
+ ///
355
+ /// void const* kernel_ptr =
356
+ /// const_cast<void const*>(reinterpret_cast<void*>(
357
+ /// &kernel<SharedMemory, X, Y, Z>));
358
+ /// auto status = launch_kernel_on_cluster(
359
+ /// {grid_dims, block_dims, cluster_dims, sizeof(SharedMemory)},
360
+ /// kernel_ptr, x, y, z);
361
+ /// @endcode
362
+ template<class ... Args>
363
+ CUTLASS_HOST cutlass::Status
364
+ launch_kernel_on_cluster(const ClusterLaunchParams& params,
365
+ void const* kernel_ptr,
366
+ Args&& ... args)
367
+ {
368
+ // Unfortunately, we find ourselves needing to pass in
369
+ // the parameters as an array of raw pointers.
370
+ if constexpr (sizeof...(Args) == 0) {
371
+ return cutlass::ClusterLauncher::launch(
372
+ params.grid_dims,
373
+ params.cluster_dims,
374
+ params.block_dims,
375
+ params.smem_size_in_bytes,
376
+ params.cuda_stream,
377
+ kernel_ptr, nullptr);
378
+ }
379
+ else {
380
+ void* kernel_params[sizeof...(Args)] = {
381
+ detail::checked_addressof(std::forward<Args>(args))...
382
+ };
383
+ return cutlass::ClusterLauncher::launch(
384
+ params.grid_dims,
385
+ params.cluster_dims,
386
+ params.block_dims,
387
+ params.smem_size_in_bytes,
388
+ params.cuda_stream,
389
+ kernel_ptr,
390
+ kernel_params);
391
+ }
392
+ }
393
+
394
+ } // namespace cutlass
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/complex.h ADDED
@@ -0,0 +1,821 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ #pragma once
33
+
34
+ #include <cuComplex.h>
35
+
36
+ #include <cuda_fp16.h>
37
+ #include "cutlass/cutlass.h"
38
+ #if defined(__CUDACC_RTC__)
39
+ #include CUDA_STD_HEADER(cstdint)
40
+ #else
41
+ #include <cstdint>
42
+ #endif
43
+ #include "cutlass/functional.h"
44
+ #include "cutlass/platform/platform.h"
45
+ #include "cutlass/real.h"
46
+
47
+ #include "cutlass/numeric_types.h"
48
+
49
+ #include "cutlass/fast_math.h"
50
+
51
+ #if !defined(__CUDACC_RTC__)
52
+ #include <iosfwd>
53
+ #endif
54
+
55
+ namespace cutlass {
56
+
57
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Enumerated type describing a transformation applied to a complex value.
enum class ComplexTransform {
  kNone,       ///< use the value unchanged
  kConjugate   ///< use the complex conjugate of the value
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Toggles a ComplexTransform at compile time (kNone <-> kConjugate).
template <ComplexTransform kTransform>
struct InvertComplexTransform;

/// Toggling kNone yields kConjugate.
template <>
struct InvertComplexTransform<ComplexTransform::kNone> {
  static ComplexTransform const transform = ComplexTransform::kConjugate;
};

/// Toggling kConjugate yields kNone.
template <>
struct InvertComplexTransform<ComplexTransform::kConjugate> {
  static ComplexTransform const transform = ComplexTransform::kNone;
};
80
+ /////////////////////////////////////////////////////////////////////////////////////////////////
81
+ //////////////////////////////////////////////////////////////////////////////////////////////////
82
+
83
+ //
84
+ // Accessors for CUDA complex types
85
+ //
86
+
87
+ #if !defined(__CUDACC_RTC__)
88
+ /// Returns the real part of the complex number
89
+ CUTLASS_HOST_DEVICE
90
+ float const &real(cuFloatComplex const &z) { return z.x; }
91
+
92
+ /// Returns the real part of the complex number
93
+ CUTLASS_HOST_DEVICE
94
+ float &real(cuFloatComplex &z) { return z.x; }
95
+
96
+ /// Returns the real part of the complex number
97
+ CUTLASS_HOST_DEVICE
98
+ double const &real(cuDoubleComplex const &z) { return z.x; }
99
+
100
+ /// Returns the real part of the complex number
101
+ CUTLASS_HOST_DEVICE
102
+ double &real(cuDoubleComplex &z) { return z.x; }
103
+
104
+ /// Returns the imaginary part of the complex number
105
+ CUTLASS_HOST_DEVICE
106
+ float const &imag(cuFloatComplex const &z) { return z.y; }
107
+
108
+ /// Returns the imaginary part of the complex number
109
+ CUTLASS_HOST_DEVICE
110
+ float &imag(cuFloatComplex &z) { return z.y; }
111
+
112
+ /// Returns the imaginary part of the complex number
113
+ CUTLASS_HOST_DEVICE
114
+ double const &imag(cuDoubleComplex const &z) { return z.y; }
115
+
116
+ /// Returns the imaginary part of the complex number
117
+ CUTLASS_HOST_DEVICE
118
+ double &imag(cuDoubleComplex &z) { return z.y; }
119
+
120
+ // Returns the conjugate of the complex number
121
+ CUTLASS_HOST_DEVICE cuFloatComplex
122
+ conj(cuFloatComplex const& z) {
123
+ return make_cuFloatComplex(z.x, -z.y);
124
+ }
125
+
126
+ // Returns the conjugate of the complex number
127
+ CUTLASS_HOST_DEVICE cuDoubleComplex
128
+ conj(cuDoubleComplex const& z) {
129
+ return make_cuDoubleComplex(z.x, -z.y);
130
+ }
131
+ #endif
132
+
133
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
134
+
135
+ /// Class for representing and manipulating complex numbers with conversions from built-in CUDA
136
+ /// complex types.
137
+
138
+ template <typename T>
139
+ class complex
140
+ {
141
+ public:
142
+ /// Type alias for scalar type
143
+ using value_type = T;
144
+
145
+ private:
146
+ //
147
+ // Data members
148
+ //
149
+
150
+ /// Real part
151
+ T _real;
152
+
153
+ /// Imaginary part
154
+ T _imag;
155
+
156
+ public:
157
+
158
+ //
159
+ // Methods
160
+ //
161
+
162
+ /// Default constructor
163
+ complex() = default;
164
+
165
+ /// Constructor
166
+ CUTLASS_HOST_DEVICE
167
+ complex(T r) : _real(r), _imag(T(0)) {}
168
+
169
+ /// Constructor
170
+ CUTLASS_HOST_DEVICE
171
+ complex(T r, T i) : _real(r), _imag(i) {}
172
+
173
+ /// Constructor
174
+ template<typename A>
175
+ CUTLASS_HOST_DEVICE
176
+ complex(complex<A> const &z) : _real(static_cast<T>(z.real())), _imag(static_cast<T>(z.imag())) {}
177
+
178
+
179
+ #if !defined(__CUDACC_RTC__)
180
+ /// Conversion from cuFloatComplex
181
+ CUTLASS_HOST_DEVICE
182
+ complex(cuFloatComplex const &z) : _real(static_cast<T>(cuCrealf(z))), _imag(static_cast<T>(cuCimagf(z))) {}
183
+
184
+ /// Conversion from cuDoubleComplex
185
+ CUTLASS_HOST_DEVICE
186
+ complex(cuDoubleComplex const &z) : _real(static_cast<T>(cuCreal(z))), _imag(static_cast<T>(cuCimag(z))) {}
187
+ #endif
188
+
189
+ /// Equality operator
190
+ CUTLASS_HOST_DEVICE bool operator==(complex<T> const &rhs) const {
191
+ return this->real() == rhs.real() && this->imag() == rhs.imag();
192
+ }
193
+
194
+ /// Inequality operator
195
+ CUTLASS_HOST_DEVICE bool operator!=(complex<T> const &rhs) const {
196
+ return !(*this == rhs);
197
+ }
198
+
199
+ /// Addition
200
+ template <typename A>
201
+ CUTLASS_HOST_DEVICE complex<T> operator+(complex<A> const &rhs) const {
202
+ return complex<T>(this->real() + rhs.real(), this->imag() + rhs.imag());
203
+ }
204
+
205
+ /// Reduction into memory address. Components may update out of order.
206
+ template <typename OtherT>
207
+ CUTLASS_DEVICE void red(complex<OtherT> *ptr) const {
208
+ static_assert(platform::is_same<T, OtherT>::value, "Component type must match");
209
+ cutlass::atomic_add<T> reduce;
210
+ reduce(&ptr->_real, _real);
211
+ reduce(&ptr->_imag, _imag);
212
+ }
213
+
214
+ /// Reduction into memory address. Components may update out of order. (Half specialization)
215
+ CUTLASS_DEVICE void red(complex<half_t> *ptr) const {
216
+ static_assert(platform::is_same<T, half_t>::value, "Component type must match");
217
+ half2 *h2_ptr = reinterpret_cast<half2*>(ptr);
218
+ half2 h2_data = reinterpret_cast<half2&>(*this);
219
+ cutlass::atomic_add<half2> reduce;
220
+ reduce(h2_ptr, h2_data);
221
+ }
222
+
223
+ /// Subtraction
224
+ template <typename A>
225
+ CUTLASS_HOST_DEVICE complex<T> operator-(complex<A> const &rhs) const {
226
+ return complex<T>(this->real() - rhs.real(), this->imag() - rhs.imag());
227
+ }
228
+
229
+ /// Multiplication
230
+ template <typename A>
231
+ CUTLASS_HOST_DEVICE complex<T> operator*(complex<A> const &rhs) const {
232
+ return complex<T>(this->real() * rhs.real() - this->imag() * rhs.imag(),
233
+ this->real() * rhs.imag() + this->imag() * rhs.real());
234
+ }
235
+
236
+ /// Scalar Multiplication
237
+ template <typename A>
238
+ CUTLASS_HOST_DEVICE complex<T> operator*(A const &s) const {
239
+ return complex<T>(this->real() * s, this->imag() * s);
240
+ }
241
+
242
+ /// Division
243
+ template <typename A>
244
+ CUTLASS_HOST_DEVICE complex<T> operator/(complex<A> const &rhs) const {
245
+ T d = T(rhs.real() * rhs.real() + rhs.imag() * rhs.imag());
246
+
247
+ return complex<T>(
248
+ (real() * rhs.real() + imag() * rhs.imag()) / d,
249
+ (imag() * rhs.real() - real() * rhs.imag()) / d
250
+ );
251
+ }
252
+
253
+ /// Scalar Division
254
+ template <typename A>
255
+ CUTLASS_HOST_DEVICE complex<T> operator/(A const &s) const {
256
+ return complex<T>(this->real() / s, this->imag() / s);
257
+ }
258
+
259
+ /// Addition
260
+ template <typename A>
261
+ CUTLASS_HOST_DEVICE complex<T> &operator+=(complex<A> const &rhs) {
262
+ *this = *this + rhs;
263
+ return *this;
264
+ }
265
+
266
+ /// Subtraction
267
+ template <typename A>
268
+ CUTLASS_HOST_DEVICE complex<T> &operator-=(complex<A> const &rhs) {
269
+ *this = *this - rhs;
270
+ return *this;
271
+ }
272
+
273
+ /// Multiplication
274
+ template <typename A>
275
+ CUTLASS_HOST_DEVICE complex<T> &operator*=(complex<A> const &rhs) {
276
+ *this = *this * rhs;
277
+ return *this;
278
+ }
279
+
280
+ /// Scalar multiplication
281
+ template <typename A>
282
+ CUTLASS_HOST_DEVICE complex<T> &operator*=(A s) {
283
+ *this = *this * s;
284
+ return *this;
285
+ }
286
+
287
+ /// Division
288
+ template <typename A>
289
+ CUTLASS_HOST_DEVICE complex<T> &operator/=(complex<A> const &rhs) {
290
+ *this = *this / rhs;
291
+ return *this;
292
+ }
293
+
294
+ /// Accesses the real part of the complex number
295
+ CUTLASS_HOST_DEVICE
296
+ T const &real() const { return _real; }
297
+
298
+ /// Accesses the real part of the complex number
299
+ CUTLASS_HOST_DEVICE
300
+ T &real() { return _real; }
301
+
302
+ /// Accesses the imaginary part of the complex number
303
+ CUTLASS_HOST_DEVICE
304
+ T const &imag() const { return _imag; }
305
+
306
+ /// Accesses the imaginary part of the complex number
307
+ CUTLASS_HOST_DEVICE
308
+ T &imag() { return _imag; }
309
+
310
+ /// Set the real part of the complex number
311
+ CUTLASS_HOST_DEVICE
312
+ void real(T real) { _real = real; }
313
+
314
+ /// Set the imaginary part of the complex number
315
+ CUTLASS_HOST_DEVICE
316
+ void imag(T imag) { _imag = imag; }
317
+
318
+ #if !defined(__CUDACC_RTC__)
319
+ /// Converts to cuFloatComplex
320
+ CUTLASS_HOST_DEVICE
321
+ explicit operator cuFloatComplex() const { return make_cuFloatComplex(float(real()), float(imag())); }
322
+
323
+ /// Converts to cuDoubleComplex
324
+ CUTLASS_HOST_DEVICE
325
+ explicit operator cuDoubleComplex() const { return make_cuDoubleComplex(real(), imag()); }
326
+ #endif
327
+ };
328
+
329
+ // Complex conjugate
330
+ template<class T>
331
+ CUTLASS_HOST_DEVICE complex<T> conj(complex<T> const& z) {
332
+ return {z.real(), -z.imag()};
333
+ }
334
+
335
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
336
+
337
+ //
338
+ // Accessors for complex template
339
+ //
340
+
341
+ // Nonmember real and imag need to work for non-complex numbers too.
342
+ // That means cutlass::complex, std::complex, cuda::std::complex, and
343
+ // any user-defined complex number type that looks like std::complex.
344
+ // It's reasonable to assume that a "complex number type" has
345
+ // zero-argument real() and imag() member functions returning
346
+ // non-void. While cuFloatComplex and cuDoubleComplex lack those
347
+ // member functions, one-argument nonmember real and imag overloads
348
+ // for those types are defined above.
349
+
350
+ namespace detail {
351
+
352
+ template <typename T, typename Enable = void>
353
+ struct has_zero_argument_real_member_function :
354
+ cutlass::platform::false_type
355
+ {};
356
+
357
+ template <typename T>
358
+ struct has_zero_argument_real_member_function<T,
359
+ cutlass::platform::enable_if_t<
360
+ ! cutlass::platform::is_void_v<
361
+ decltype(cutlass::platform::declval<T>().real())
362
+ >
363
+ >
364
+ > : cutlass::platform::true_type
365
+ {};
366
+
367
+ template <typename T>
368
+ constexpr bool has_zero_argument_real_member_function_v =
369
+ has_zero_argument_real_member_function<T>::value;
370
+
371
+ template <typename T, typename Enable = void>
372
+ struct has_zero_argument_imag_member_function :
373
+ cutlass::platform::false_type
374
+ {};
375
+
376
+ template <typename T>
377
+ struct has_zero_argument_imag_member_function<T,
378
+ cutlass::platform::enable_if_t<
379
+ ! cutlass::platform::is_void_v<
380
+ decltype(cutlass::platform::declval<T>().imag())
381
+ >
382
+ >
383
+ > : cutlass::platform::true_type
384
+ {};
385
+
386
+ template <typename T>
387
+ constexpr bool has_zero_argument_imag_member_function_v =
388
+ has_zero_argument_imag_member_function<T>::value;
389
+
390
+ } // namespace detail
391
+
392
+ template<typename T>
393
+ CUTLASS_HOST_DEVICE auto real(T z) {
394
+ if constexpr (detail::has_zero_argument_real_member_function_v<T>) {
395
+ return z.real();
396
+ } else {
397
+ return z;
398
+ }
399
+ }
400
+
401
+ template<typename T>
402
+ CUTLASS_HOST_DEVICE auto imag(T z) {
403
+ if constexpr (detail::has_zero_argument_imag_member_function_v<T>) {
404
+ return z.imag();
405
+ } else {
406
+ // Imaginary part of a non-complex input has the same type as the
407
+ // input, and its value is zero. CUTLASS assumes in this case
408
+ // that value-initializing T is well-formed and results in zero.
409
+ return T{};
410
+ }
411
+ }
412
+
413
+ //
414
+ // Output operators
415
+ //
416
+
417
+ #if !defined(__CUDACC_RTC__)
418
+ template <typename T>
419
+ std::ostream &operator<<(std::ostream &out, complex<T> const &z) {
420
+ T _r = real(z);
421
+ T _i = imag(z);
422
+
423
+ if (bool(_i)) {
424
+ return out << _r << "+i" << _i;
425
+ }
426
+ return out << _r;
427
+ }
428
+ #endif
429
+
430
+ //
431
+ // Non-member operators defined for complex types
432
+ //
433
+
434
+
435
+ //
436
+ // Non-member functions defined for complex numbers
437
+ //
438
+
439
+ // abs returns the magnitude of the complex number.
440
+
441
+ CUTLASS_HOST_DEVICE float abs(complex<float> const &z) {
442
+ return ::hypot(z.real(), z.imag());
443
+ }
444
+
445
+ CUTLASS_HOST_DEVICE double abs(complex<double> const &z) {
446
+ return ::hypot(z.real(), z.imag());
447
+ }
448
+
449
+ // In theory, it would make sense to add a complex<long double>
450
+ // specialization of abs here, since hypot works for long double too.
451
+ // In practice, long double doesn't have a portable number of bits or
452
+ // behavior, so users who care about higher-precision floating-point
453
+ // computation should probably insist on an actual FP128 type.
454
+
455
+ template <typename T>
456
+ CUTLASS_HOST_DEVICE T abs(complex<T> const &z) {
457
+ // cutlass::complex permits all kinds of T, including types that
458
+ // don't have NaN. For a generic floating-point type with Inf
459
+ // and/or NaN, LAPACK's DLAPY2 algorithm would make sense, as it
460
+ // would handle issues like avoiding unwarranted overflow if
461
+ // z.real() or z.imag() is slightly bigger than the square root of
462
+ // the max finite number. That could be a future improvement; for
463
+ // now, the code just uses the naive algorithm.
464
+ //
465
+ // Use the "swap two-step" idiom so that argument-dependent lookup
466
+ // can find any CUTLASS-specific overloads.
467
+ using cutlass::sqrt;
468
+ return sqrt(z.real() * z.real() + z.imag() * z.imag());
469
+ }
470
+
471
+ /// Returns the magnitude of the complex number
472
+ template <typename T>
473
+ CUTLASS_HOST_DEVICE T arg(complex<T> const &z) {
474
+ return atan2(imag(z), real(z));
475
+ }
476
+
477
+ /// Returns the squared magnitude of a real number
478
+ template <typename T>
479
+ CUTLASS_HOST_DEVICE T norm(T const &z) {
480
+ return z * z;
481
+ }
482
+
483
+ /// Returns the squared magnitude of a real number
484
+ template <>
485
+ CUTLASS_HOST_DEVICE int8_t norm(int8_t const &z) {
486
+ return static_cast<int8_t>(z * z);
487
+ }
488
+
489
+ /// Returns the squared magnitude of a complex number
490
+ template <typename T>
491
+ CUTLASS_HOST_DEVICE double norm(complex<T> const &z) {
492
+ return real(z) * real(z) + imag(z) * imag(z);
493
+ }
494
+
495
+ /// Norm-accumulate calculation
496
+ template <typename T, typename R>
497
+ CUTLASS_HOST_DEVICE R norm_accumulate(T const &x, R const & accumulator) {
498
+ return accumulator + static_cast<R>(x) * static_cast<R>(x);
499
+ }
500
+
501
+ /// Norm accumulate specialized for complex types
502
+ template <typename T, typename R>
503
+ CUTLASS_HOST_DEVICE R norm_accumulate(complex<T> const &z, R const &accumulator) {
504
+ return accumulator + static_cast<R>(real(z)) * static_cast<R>(real(z)) +
505
+ static_cast<R>(imag(z)) * static_cast<R>(imag(z));
506
+ }
507
+
508
+ namespace detail {
509
+
510
+ template<class T>
511
+ CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::true_type) {
512
+ return conj(z);
513
+ }
514
+
515
+ template<class T>
516
+ CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::false_type) {
517
+ return z;
518
+ }
519
+
520
+ template<class T>
521
+ CUTLASS_HOST_DEVICE T conj_impl(T const& z) {
522
+ constexpr bool use_unqualified_conj =
523
+ ! cutlass::platform::is_arithmetic_v<T> &&
524
+ ! detail::has_cutlass_conj_v<T> &&
525
+ detail::has_unqualified_conj_v<T>;
526
+ return conj_impl(z, cutlass::platform::bool_constant<use_unqualified_conj>{});
527
+ }
528
+
529
+ } // namespace detail
530
+
531
+ // Return the complex conjugate of the input.
532
+ //
533
+ // This MUST be a function and not a function object, because it may
534
+ // be common practice for downstream types to define specifically
535
+ // cutlass::conj overloads, instead of overloads in their namespace.
536
+ //
537
+ // As a result of this being a function and not a function object,
538
+ // CUTLASS code needs to declare "using cutlass::conj;" in scope and
539
+ // then call this function unqualified, just like std::swap.
540
+ //
541
+ // If an overload already exists for cutlass::conj(T), that overload
542
+ // will be called instead of this one. Otherwise:
543
+ //
544
+ // 1. for arithmetic types, return z;
545
+ //
546
+ // 2. for types where (namespace-unqualified) conj(z) is well formed
547
+ // and cutlass::conj(z) is NOT well formed, return conj(z); and,
548
+ //
549
+ // 3. for everything else, return z.
550
+ //
551
+ // Regarding (1), the C++ Standard Library makes std::conj always
552
+ // return std::complex, even for (noncomplex) arithmetic types.
553
+ // cutlass::conj(T t) needs to return type T. This follows the
554
+ // convention of linear algebra software like the BLAS, where
555
+ // "conjugate transpose" means the same thing as "transpose" for a
556
+ // matrix of noncomplex numbers.
557
+ //
558
+ // Case (2) covers std::complex, cuda::std::complex, and non-Standard
559
+ // (including user-defined) complex number types (for which "conj(z)"
560
+ // is findable via argument-dependent lookup, but does not live in the
561
+ // cutlass namespace). It excludes cutlass::conj(z) in order to
562
+ // prevent infinite recursion.
563
+ //
564
+ // Case (3) covers non-Standard non-complex number types.
565
+ template<class T>
566
+ CUTLASS_HOST_DEVICE T conj(T const& z) {
567
+ return detail::conj_impl(z);
568
+ }
569
+
570
+ /// Projects the complex number z onto the Riemann sphere
571
+ template <typename T>
572
+ CUTLASS_HOST_DEVICE complex<T> proj(complex<T> const &z) {
573
+ T d = real(z) * real(z) + imag(z) * imag(z) + T(1);
574
+ return complex<T>((T(2) * real(z)) / d, (T(2) * imag(z)) / d);
575
+ }
576
+
577
+ /// Returns a complex number with magnitude r and phase theta
578
+ template <typename T>
579
+ CUTLASS_HOST_DEVICE complex<T> polar(T const &r, T const &theta = T()) {
580
+ return complex<T>(r * cos(theta), r * sin(theta));
581
+ }
582
+
583
+ /// Computes the complex exponential of z.
584
+ template <typename T>
585
+ CUTLASS_HOST_DEVICE complex<T> exp(complex<T> const &z) {
586
+ return complex<T>(fast_exp(real(z)) * fast_cos(imag(z)), fast_exp(real(z)) * fast_sin(imag(z)));
587
+ }
588
+
589
+ /// Computes the log of z
590
+ template <typename T>
591
+ CUTLASS_HOST_DEVICE complex<T> log(complex<T> const &z) {
592
+ return complex<T>(log(abs(z)), arg(z));
593
+ }
594
+
595
+ /// Computes the log base 10 of z
596
+ template <typename T>
597
+ CUTLASS_HOST_DEVICE complex<T> log10(complex<T> const &z) {
598
+ return log(z) / T(log(T(10)));
599
+ }
600
+
601
+ /// Computes the square root of complex number z
602
+ template <typename T>
603
+ CUTLASS_HOST_DEVICE complex<T> sqrt(complex<T> const &z) {
604
+ return sqrt(T(2)) / T(2) *
605
+ complex<T>(sqrt(sqrt(norm(z)) + real(z)),
606
+ (imag(z) < 0 ? T(-1) : T(1)) * sqrt(sqrt(norm(z)) - real(z)));
607
+ }
608
+
609
+ /// Computes the cosine of complex z.
610
+ template <typename T>
611
+ CUTLASS_HOST_DEVICE complex<T> cos(complex<T> const &z) {
612
+ return (exp(z) + exp(-z)) / T(2);
613
+ }
614
+
615
+ /// Computes the sin of complex z.
616
+ template <typename T>
617
+ CUTLASS_HOST_DEVICE complex<T> sin(complex<T> const &z) {
618
+ return (exp(-z) - exp(z)) * complex<T>(T(0), T(1) / T(2));
619
+ }
620
+
621
+ /// Comparison
622
+ template <typename T>
623
+ CUTLASS_HOST_DEVICE bool operator<(complex<T> const &lhs, complex<T> const &rhs) {
624
+ return true;
625
+ }
626
+
627
+ //////////////////////////////////////////////////////////////////////////////////////////////////
628
+
629
+ /// Partial specialization for complex-valued type.
630
+ template <typename T>
631
+ struct RealType< complex<T> >
632
+ {
633
+ using Type = T;
634
+
635
+ /// Number of elements
636
+ static int const kExtent = 2;
637
+
638
+ CUTLASS_HOST_DEVICE
639
+ static complex<T> from_real(double x) {
640
+ return complex<T>(static_cast<T>(x));
641
+ }
642
+ };
643
+
644
+ /////////////////////////////////////////////////////////////////////////////////////////////////
645
+
646
+ template <>
647
+ CUTLASS_HOST_DEVICE
648
+ cutlass::complex<half_t> from_real<cutlass::complex<half_t> >(double r) {
649
+ return cutlass::complex<half_t>(half_t(r));
650
+ }
651
+
652
+ template <>
653
+ CUTLASS_HOST_DEVICE
654
+ cutlass::complex<float> from_real<cutlass::complex<float> >(double r) {
655
+ return cutlass::complex<float>(float(r));
656
+ }
657
+
658
+ template <>
659
+ CUTLASS_HOST_DEVICE
660
+ cutlass::complex<double> from_real<cutlass::complex<double> >(double r) {
661
+ return cutlass::complex<double>(r);
662
+ }
663
+
664
+ //////////////////////////////////////////////////////////////////////////////////////////////////
665
+
666
+ template <typename T>
667
+ struct is_complex {
668
+ static bool const value = false;
669
+ };
670
+
671
+ template <typename T>
672
+ struct is_complex<complex<T>> {
673
+ static bool const value = true;
674
+ };
675
+
676
+
677
+ /////////////////////////////////////////////////////////////////////////////////////////////////
678
+ // functional.h numeric specializations
679
+ /////////////////////////////////////////////////////////////////////////////////////////////////
680
+
681
+ /// Squares with optional conversion
682
+ template <typename T, typename Output>
683
+ struct magnitude_squared<complex<T>, Output> {
684
+ CUTLASS_HOST_DEVICE
685
+ Output operator()(complex<T> lhs) const {
686
+ multiplies<Output> mul_op;
687
+
688
+ Output y_r = Output(lhs.real());
689
+ Output y_i = Output(lhs.imag());
690
+
691
+ return mul_op(y_r, y_r) + mul_op(y_i, y_i);
692
+ }
693
+ };
694
+
695
+ /// Fused multiply-add
696
+ template <typename T>
697
+ struct multiply_add<complex<T>, complex<T>, complex<T>> {
698
+ CUTLASS_HOST_DEVICE
699
+ complex<T> operator()(
700
+ complex<T> const &a,
701
+ complex<T> const &b,
702
+ complex<T> const &c) const {
703
+
704
+ T real = c.real();
705
+ T imag = c.imag();
706
+
707
+ real += a.real() * b.real();
708
+ real += -a.imag() * b.imag();
709
+ imag += a.real() * b.imag();
710
+ imag += a.imag () * b.real();
711
+
712
+ return complex<T>{
713
+ real,
714
+ imag
715
+ };
716
+ }
717
+ };
718
+
719
+ /// Fused multiply-add
720
+ template <typename T>
721
+ struct multiply_add<complex<T>, T, complex<T>> {
722
+ CUTLASS_HOST_DEVICE
723
+ complex<T> operator()(
724
+ complex<T> const &a,
725
+ T const &b,
726
+ complex<T> const &c) const {
727
+
728
+ T real = c.real();
729
+ T imag = c.imag();
730
+
731
+ real += a.real() * b;
732
+ imag += a.imag () * b;
733
+
734
+ return complex<T>{
735
+ real,
736
+ imag
737
+ };
738
+ }
739
+ };
740
+
741
+ /// Fused multiply-add
742
+ template <typename T>
743
+ struct multiply_add<T, complex<T>, complex<T>> {
744
+ CUTLASS_HOST_DEVICE
745
+ complex<T> operator()(
746
+ T const &a,
747
+ complex<T> const &b,
748
+ complex<T> const &c) const {
749
+
750
+ T real = c.real();
751
+ T imag = c.imag();
752
+
753
+ real += a * b.real();
754
+ imag += a * b.imag();
755
+
756
+ return complex<T>{
757
+ real,
758
+ imag
759
+ };
760
+ }
761
+ };
762
+
763
+ /// Conjugate
764
+ template <typename T>
765
+ struct conjugate<complex<T>> {
766
+ CUTLASS_HOST_DEVICE
767
+ complex<T> operator()(complex<T> const &a) const {
768
+ // Invoke the complex<T> overload specifically, rather than
769
+ // wasting the compiler's effort on overload resolution.
770
+ return cutlass::conj(a);
771
+ }
772
+ };
773
+
774
+ #if ! defined(__CUDACC_RTC__)
775
+ template <>
776
+ struct conjugate<cuFloatComplex> {
777
+ CUTLASS_HOST_DEVICE
778
+ cuFloatComplex operator()(cuFloatComplex const& z) const {
779
+ return make_cuFloatComplex(z.x, -z.y);
780
+ }
781
+ };
782
+
783
+ template <>
784
+ struct conjugate<cuDoubleComplex> {
785
+ CUTLASS_HOST_DEVICE
786
+ cuDoubleComplex operator()(cuDoubleComplex const& z) const {
787
+ return make_cuDoubleComplex(z.x, -z.y);
788
+ }
789
+ };
790
+ #endif
791
+
792
+ /// Computes the square of a difference with optional conversion
793
+ template <typename T, typename Output>
794
+ struct magnitude_squared_difference<complex<T>, Output> {
795
+ CUTLASS_HOST_DEVICE
796
+ Output operator()(complex<T> lhs, complex<T> rhs) const {
797
+ multiplies<Output> mul_op;
798
+
799
+ Output y_r = Output(lhs.real()) - Output(rhs.real());
800
+ Output y_i = Output(lhs.imag()) - Output(rhs.imag());
801
+
802
+ return mul_op(y_r, y_r) + mul_op(y_i, y_i);
803
+ }
804
+ };
805
+
806
+ /// Reduces value into the data pointed to by ptr (complex<T> specialization)
807
+ template <typename T>
808
+ struct atomic_add<complex<T>> {
809
+ CUTLASS_DEVICE
810
+ void operator()(complex<T> *ptr, const complex<T> &data)
811
+ {
812
+ data.red(ptr);
813
+ }
814
+ };
815
+
816
+
817
+ //////////////////////////////////////////////////////////////////////////////////////////////////
818
+
819
+ } // namespace cutlass
820
+
821
+ //////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/constants.h ADDED
@@ -0,0 +1,1239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /* \file
33
+ \brief Boost-style constant definitions for floating-point types.
34
+ */
35
+
36
+ #pragma once
37
+
38
+ #include "cutlass/cutlass.h"
39
+ #include "cutlass/numeric_types.h"
40
+
41
+ #include "cutlass/complex.h"
42
+
43
+ ///////////////////////////////////////////////////////////////////////////////////
44
+
45
+ namespace cutlass {
46
+ namespace constants {
47
+
48
+ ///////////////////////////////////////////////////////////////////////////////////
49
+
50
+ //
51
+ // Primary templates
52
+ //
53
+
54
+ /// Returns 1, the multiplicative identity element
55
+ template <typename T> CUTLASS_HOST_DEVICE T one();
56
+
57
+ /// Returns 0, the additive identity element
58
+ template <typename T> CUTLASS_HOST_DEVICE T zero();
59
+
60
+ /// Returns 2
61
+ template <typename T> CUTLASS_HOST_DEVICE T two();
62
+
63
+ /// Returns pi, approximately 3.141
64
+ template <typename T> CUTLASS_HOST_DEVICE T pi();
65
+
66
+ /// Returns 2 * pi
67
+ template <typename T> CUTLASS_HOST_DEVICE T two_pi();
68
+
69
+ /// Returns pi / 2
70
+ template <typename T> CUTLASS_HOST_DEVICE T half_pi();
71
+
72
+ /// Returns sqrt(pi)
73
+ template <typename T> CUTLASS_HOST_DEVICE T root_pi();
74
+
75
+ /// Returns sqrt(pi / 2)
76
+ template <typename T> CUTLASS_HOST_DEVICE T root_half_pi();
77
+
78
+ /// Returns sqrt(2 * pi)
79
+ template <typename T> CUTLASS_HOST_DEVICE T root_two_pi();
80
+
81
+ /// Returns sqrt(ln(4))
82
+ template <typename T> CUTLASS_HOST_DEVICE T root_ln_four();
83
+
84
+ /// Returns e, approximately 2.718...
85
+ template <typename T> CUTLASS_HOST_DEVICE T e();
86
+
87
+ /// Returns (1/2)
88
+ template <typename T> CUTLASS_HOST_DEVICE T half();
89
+
90
+ /// Returns sqrt(2), approximately 1.414...
91
+ template <typename T> CUTLASS_HOST_DEVICE T root_two();
92
+
93
+ /// Returns sqrt(2)/2, approximately 0.707...
94
+ template <typename T> CUTLASS_HOST_DEVICE T half_root_two();
95
+
96
+ /// Returns ln(2), approximately 0.693...
97
+ template <typename T> CUTLASS_HOST_DEVICE T ln_two();
98
+
99
+ /// Returns ln(ln(2)), approximately -0.3665...
100
+ template <typename T> CUTLASS_HOST_DEVICE T ln_ln_two();
101
+
102
+ /// Returns 1/3, approximately 0.333...
103
+ template <typename T> CUTLASS_HOST_DEVICE T third();
104
+
105
+ /// Returns 2/3, approximately 0.666...
106
+ template <typename T> CUTLASS_HOST_DEVICE T twothirds();
107
+
108
+ /// Returns pi - 3, approximately 0.1416...
109
+ template <typename T> CUTLASS_HOST_DEVICE T pi_minus_three();
110
+
111
+ /// Returns 4 - pi, approximately 0.858...
112
+ template <typename T> CUTLASS_HOST_DEVICE T four_minus_pi();
113
+
114
+
115
+ /////////////////////////////////////////////////////////////////////////////////////
116
+
117
+ // Specialization for double
118
+
119
+ /// Returns 1, the multiplicative identity element (specialization for double)
120
+ template <> CUTLASS_HOST_DEVICE double one<double>() {
121
+ uint64_t bits = 0x3ff0000000000000ull;
122
+ return reinterpret_cast<double const &>(bits);
123
+ }
124
+
125
+ /// Returns 1, the multiplicative identity element (specialization for complex<double>)
126
+ template <> CUTLASS_HOST_DEVICE complex<double> one< complex<double> >() {
127
+ return complex<double>(one<double>(), double());
128
+ }
129
+
130
+ /// Returns 0, the additive identity element (specialization for double)
131
+ template <> CUTLASS_HOST_DEVICE double zero<double>() {
132
+ uint64_t bits = 0x0ull;
133
+ return reinterpret_cast<double const &>(bits);
134
+ }
135
+
136
+ /// Returns 0, the additive identity element (specialization for complex<double>)
137
+ template <> CUTLASS_HOST_DEVICE complex<double> zero< complex<double> >() {
138
+ return complex<double>(zero<double>(), double());
139
+ }
140
+
141
+ /// Returns 2 (specialization for double)
142
+ template <> CUTLASS_HOST_DEVICE double two<double>() {
143
+ uint64_t bits = 0x4000000000000000ull;
144
+ return reinterpret_cast<double const &>(bits);
145
+ }
146
+
147
+ /// Returns 2 (specialization for complex<double>)
148
+ template <> CUTLASS_HOST_DEVICE complex<double> two< complex<double> >() {
149
+ return complex<double>(two<double>(), double());
150
+ }
151
+
152
+ /// Returns pi, approximately 3.141 (specialization for double)
153
+ template <> CUTLASS_HOST_DEVICE double pi<double>() {
154
+ uint64_t bits = 0x400921fb54442d18ull;
155
+ return reinterpret_cast<double const &>(bits);
156
+ }
157
+
158
+ /// Returns pi, approximately 3.141 (specialization for complex<double>)
159
+ template <> CUTLASS_HOST_DEVICE complex<double> pi< complex<double> >() {
160
+ return complex<double>(pi<double>(), double());
161
+ }
162
+
163
+ /// Returns 2 * pi (specialization for double)
164
+ template <> CUTLASS_HOST_DEVICE double two_pi<double>() {
165
+ uint64_t bits = 0x401921fb54442d18ull;
166
+ return reinterpret_cast<double const &>(bits);
167
+ }
168
+
169
+ /// Returns 2 * pi (specialization for complex<double>)
170
+ template <> CUTLASS_HOST_DEVICE complex<double> two_pi< complex<double> >() {
171
+ return complex<double>(two_pi<double>(), double());
172
+ }
173
+
174
+ /// Returns pi / 2 (specialization for double)
175
+ template <> CUTLASS_HOST_DEVICE double half_pi<double>() {
176
+ uint64_t bits = 0x3ff921fb54442d18ull;
177
+ return reinterpret_cast<double const &>(bits);
178
+ }
179
+
180
+ /// Returns pi / 2 (specialization for complex<double>)
181
+ template <> CUTLASS_HOST_DEVICE complex<double> half_pi< complex<double> >() {
182
+ return complex<double>(half_pi<double>(), double());
183
+ }
184
+
185
+ /// Returns sqrt(pi) (specialization for double)
186
+ template <> CUTLASS_HOST_DEVICE double root_pi<double>() {
187
+ uint64_t bits = 0x3ffc5bf891b4ef6aull;
188
+ return reinterpret_cast<double const &>(bits);
189
+ }
190
+
191
+ /// Returns sqrt(pi) (specialization for complex<double>)
192
+ template <> CUTLASS_HOST_DEVICE complex<double> root_pi< complex<double> >() {
193
+ return complex<double>(root_pi<double>(), double());
194
+ }
195
+
196
+ /// Returns sqrt(pi / 2) (specialization for double)
197
+ template <> CUTLASS_HOST_DEVICE double root_half_pi<double>() {
198
+ uint64_t bits = 0x3ff40d931ff62705ull;
199
+ return reinterpret_cast<double const &>(bits);
200
+ }
201
+
202
+ /// Returns sqrt(pi / 2) (specialization for complex<double>)
203
+ template <> CUTLASS_HOST_DEVICE complex<double> root_half_pi< complex<double> >() {
204
+ return complex<double>(root_half_pi<double>(), double());
205
+ }
206
+
207
+ /// Returns sqrt(2 * pi) (specialization for double)
208
+ template <> CUTLASS_HOST_DEVICE double root_two_pi<double>() {
209
+ uint64_t bits = 0x40040d931ff62705ull;
210
+ return reinterpret_cast<double const &>(bits);
211
+ }
212
+
213
+ /// Returns sqrt(2 * pi) (specialization for complex<double>)
214
+ template <> CUTLASS_HOST_DEVICE complex<double> root_two_pi< complex<double> >() {
215
+ return complex<double>(root_two_pi<double>(), double());
216
+ }
217
+
218
+ /// Returns sqrt(ln(4)) (specialization for double)
219
+ template <> CUTLASS_HOST_DEVICE double root_ln_four<double>() {
220
+ uint64_t bits = 0x3ff2d6abe44afc43ull;
221
+ return reinterpret_cast<double const &>(bits);
222
+ }
223
+
224
+ /// Returns sqrt(ln(4)) (specialization for complex<double>)
225
+ template <> CUTLASS_HOST_DEVICE complex<double> root_ln_four< complex<double> >() {
226
+ return complex<double>(root_ln_four<double>(), double());
227
+ }
228
+
229
+ /// Returns e, approximately 2.718... (specialization for double)
230
+ template <> CUTLASS_HOST_DEVICE double e<double>() {
231
+ uint64_t bits = 0x4005bf0a8b145769ull;
232
+ return reinterpret_cast<double const &>(bits);
233
+ }
234
+
235
+ /// Returns e, approximately 2.718... (specialization for complex<double>)
236
+ template <> CUTLASS_HOST_DEVICE complex<double> e< complex<double> >() {
237
+ return complex<double>(e<double>(), double());
238
+ }
239
+
240
+ /// Returns (1/2) (specialization for double)
241
+ template <> CUTLASS_HOST_DEVICE double half<double>() {
242
+ uint64_t bits = 0x3fe0000000000000ull;
243
+ return reinterpret_cast<double const &>(bits);
244
+ }
245
+
246
+ /// Returns (1/2) (specialization for complex<double>)
247
+ template <> CUTLASS_HOST_DEVICE complex<double> half< complex<double> >() {
248
+ return complex<double>(half<double>(), double());
249
+ }
250
+
251
+ /// Returns sqrt(2), approximately 1.414... (specialization for double)
252
+ template <> CUTLASS_HOST_DEVICE double root_two<double>() {
253
+ uint64_t bits = 0x3ff6a09e667f3bcdull;
254
+ return reinterpret_cast<double const &>(bits);
255
+ }
256
+
257
+ /// Returns sqrt(2), approximately 1.414... (specialization for complex<double>)
258
+ template <> CUTLASS_HOST_DEVICE complex<double> root_two< complex<double> >() {
259
+ return complex<double>(root_two<double>(), double());
260
+ }
261
+
262
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for double)
263
+ template <> CUTLASS_HOST_DEVICE double half_root_two<double>() {
264
+ uint64_t bits = 0x3fe6a09e667f3bcdull;
265
+ return reinterpret_cast<double const &>(bits);
266
+ }
267
+
268
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for complex<double>)
269
+ template <> CUTLASS_HOST_DEVICE complex<double> half_root_two< complex<double> >() {
270
+ return complex<double>(half_root_two<double>(), double());
271
+ }
272
+
273
+ /// Returns ln(2), approximately 0.693... (specialization for double)
274
+ template <> CUTLASS_HOST_DEVICE double ln_two<double>() {
275
+ uint64_t bits = 0x3fe62e42fefa39efull;
276
+ return reinterpret_cast<double const &>(bits);
277
+ }
278
+
279
+ /// Returns ln(2), approximately 0.693... (specialization for complex<double>)
280
+ template <> CUTLASS_HOST_DEVICE complex<double> ln_two< complex<double> >() {
281
+ return complex<double>(ln_two<double>(), double());
282
+ }
283
+
284
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for double)
285
+ template <> CUTLASS_HOST_DEVICE double ln_ln_two<double>() {
286
+ uint64_t bits = 0xbfd774f29bdd6b9full;
287
+ return reinterpret_cast<double const &>(bits);
288
+ }
289
+
290
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for complex<double>)
291
+ template <> CUTLASS_HOST_DEVICE complex<double> ln_ln_two< complex<double> >() {
292
+ return complex<double>(ln_ln_two<double>(), double());
293
+ }
294
+
295
+ /// Returns 1/3, approximately 0.333... (specialization for double)
296
+ template <> CUTLASS_HOST_DEVICE double third<double>() {
297
+ uint64_t bits = 0x3fd5555555555555ull;
298
+ return reinterpret_cast<double const &>(bits);
299
+ }
300
+
301
+ /// Returns 1/3, approximately 0.333... (specialization for complex<double>)
302
+ template <> CUTLASS_HOST_DEVICE complex<double> third< complex<double> >() {
303
+ return complex<double>(third<double>(), double());
304
+ }
305
+
306
+ /// Returns 2/3, approximately 0.666... (specialization for double)
307
+ template <> CUTLASS_HOST_DEVICE double twothirds<double>() {
308
+ uint64_t bits = 0x3fe5555555555555ull;
309
+ return reinterpret_cast<double const &>(bits);
310
+ }
311
+
312
+ /// Returns 2/3, approximately 0.666... (specialization for complex<double>)
313
+ template <> CUTLASS_HOST_DEVICE complex<double> twothirds< complex<double> >() {
314
+ return complex<double>(twothirds<double>(), double());
315
+ }
316
+
317
+ /// Returns pi - 3, approximately 0.1416... (specialization for double)
318
+ template <> CUTLASS_HOST_DEVICE double pi_minus_three<double>() {
319
+ uint64_t bits = 0x3fc21fb54442d180ull;
320
+ return reinterpret_cast<double const &>(bits);
321
+ }
322
+
323
+ /// Returns pi - 3, approximately 0.1416... (specialization for complex<double>)
324
+ template <> CUTLASS_HOST_DEVICE complex<double> pi_minus_three< complex<double> >() {
325
+ return complex<double>(pi_minus_three<double>(), double());
326
+ }
327
+
328
+ /// Returns 4 - pi, approximately 0.858... (specialization for double)
329
+ template <> CUTLASS_HOST_DEVICE double four_minus_pi<double>() {
330
+ uint64_t bits = 0x3feb7812aeef4ba0ull;
331
+ return reinterpret_cast<double const &>(bits);
332
+ }
333
+
334
+ /// Returns 4 - pi, approximately 0.858... (specialization for complex<double>)
335
+ template <> CUTLASS_HOST_DEVICE complex<double> four_minus_pi< complex<double> >() {
336
+ return complex<double>(four_minus_pi<double>(), double());
337
+ }
338
+
339
+ /////////////////////////////////////////////////////////////////////////////////////
340
+
341
+ // Specialization for float
342
+
343
+ /// Returns 1, the multiplicative identity element (specialization for float)
344
+ template <> CUTLASS_HOST_DEVICE float one<float>() {
345
+ uint32_t bits = 0x3f800000u;
346
+ return reinterpret_cast<float const &>(bits);
347
+ }
348
+
349
+ /// Returns 1, the multiplicative identity element (specialization for complex<float>)
350
+ template <> CUTLASS_HOST_DEVICE complex<float> one< complex<float> >() {
351
+ return complex<float>(one<float>(), float());
352
+ }
353
+
354
+ /// Returns 0, the additive identity element (specialization for float)
355
+ template <> CUTLASS_HOST_DEVICE float zero<float>() {
356
+ uint32_t bits = 0x0u;
357
+ return reinterpret_cast<float const &>(bits);
358
+ }
359
+
360
+ /// Returns 0, the additive identity element (specialization for complex<float>)
361
+ template <> CUTLASS_HOST_DEVICE complex<float> zero< complex<float> >() {
362
+ return complex<float>(zero<float>(), float());
363
+ }
364
+
365
+ /// Returns 2 (specialization for float)
366
+ template <> CUTLASS_HOST_DEVICE float two<float>() {
367
+ uint32_t bits = 0x40000000u;
368
+ return reinterpret_cast<float const &>(bits);
369
+ }
370
+
371
+ /// Returns 2 (specialization for complex<float>)
372
+ template <> CUTLASS_HOST_DEVICE complex<float> two< complex<float> >() {
373
+ return complex<float>(two<float>(), float());
374
+ }
375
+
376
+ /// Returns pi, approximately 3.141 (specialization for float)
377
+ template <> CUTLASS_HOST_DEVICE float pi<float>() {
378
+ uint32_t bits = 0x40490fdbu;
379
+ return reinterpret_cast<float const &>(bits);
380
+ }
381
+
382
+ /// Returns pi, approximately 3.141 (specialization for complex<float>)
383
+ template <> CUTLASS_HOST_DEVICE complex<float> pi< complex<float> >() {
384
+ return complex<float>(pi<float>(), float());
385
+ }
386
+
387
+ /// Returns 2 * pi (specialization for float)
388
+ template <> CUTLASS_HOST_DEVICE float two_pi<float>() {
389
+ uint32_t bits = 0x40c90fdbu;
390
+ return reinterpret_cast<float const &>(bits);
391
+ }
392
+
393
+ /// Returns 2 * pi (specialization for complex<float>)
394
+ template <> CUTLASS_HOST_DEVICE complex<float> two_pi< complex<float> >() {
395
+ return complex<float>(two_pi<float>(), float());
396
+ }
397
+
398
+ /// Returns pi / 2 (specialization for float)
399
+ template <> CUTLASS_HOST_DEVICE float half_pi<float>() {
400
+ uint32_t bits = 0x3fc90fdbu;
401
+ return reinterpret_cast<float const &>(bits);
402
+ }
403
+
404
+ /// Returns pi / 2 (specialization for complex<float>)
405
+ template <> CUTLASS_HOST_DEVICE complex<float> half_pi< complex<float> >() {
406
+ return complex<float>(half_pi<float>(), float());
407
+ }
408
+
409
+ /// Returns sqrt(pi) (specialization for float)
410
+ template <> CUTLASS_HOST_DEVICE float root_pi<float>() {
411
+ uint32_t bits = 0x3fe2dfc5u;
412
+ return reinterpret_cast<float const &>(bits);
413
+ }
414
+
415
+ /// Returns sqrt(pi) (specialization for complex<float>)
416
+ template <> CUTLASS_HOST_DEVICE complex<float> root_pi< complex<float> >() {
417
+ return complex<float>(root_pi<float>(), float());
418
+ }
419
+
420
+ /// Returns sqrt(pi / 2) (specialization for float)
421
+ template <> CUTLASS_HOST_DEVICE float root_half_pi<float>() {
422
+ uint32_t bits = 0x3fa06c99u;
423
+ return reinterpret_cast<float const &>(bits);
424
+ }
425
+
426
+ /// Returns sqrt(pi / 2) (specialization for complex<float>)
427
+ template <> CUTLASS_HOST_DEVICE complex<float> root_half_pi< complex<float> >() {
428
+ return complex<float>(root_half_pi<float>(), float());
429
+ }
430
+
431
+ /// Returns sqrt(2 * pi) (specialization for float)
432
+ template <> CUTLASS_HOST_DEVICE float root_two_pi<float>() {
433
+ uint32_t bits = 0x40206c99u;
434
+ return reinterpret_cast<float const &>(bits);
435
+ }
436
+
437
+ /// Returns sqrt(2 * pi) (specialization for complex<float>)
438
+ template <> CUTLASS_HOST_DEVICE complex<float> root_two_pi< complex<float> >() {
439
+ return complex<float>(root_two_pi<float>(), float());
440
+ }
441
+
442
+ /// Returns sqrt(ln(4)) (specialization for float)
443
+ template <> CUTLASS_HOST_DEVICE float root_ln_four<float>() {
444
+ uint32_t bits = 0x3f96b55fu;
445
+ return reinterpret_cast<float const &>(bits);
446
+ }
447
+
448
+ /// Returns sqrt(ln(4)) (specialization for complex<float>)
449
+ template <> CUTLASS_HOST_DEVICE complex<float> root_ln_four< complex<float> >() {
450
+ return complex<float>(root_ln_four<float>(), float());
451
+ }
452
+
453
+ /// Returns e, approximately 2.718... (specialization for float)
454
+ template <> CUTLASS_HOST_DEVICE float e<float>() {
455
+ uint32_t bits = 0x402df854u;
456
+ return reinterpret_cast<float const &>(bits);
457
+ }
458
+
459
+ /// Returns e, approximately 2.718... (specialization for complex<float>)
460
+ template <> CUTLASS_HOST_DEVICE complex<float> e< complex<float> >() {
461
+ return complex<float>(e<float>(), float());
462
+ }
463
+
464
+ /// Returns (1/2) (specialization for float)
465
+ template <> CUTLASS_HOST_DEVICE float half<float>() {
466
+ uint32_t bits = 0x3f000000u;
467
+ return reinterpret_cast<float const &>(bits);
468
+ }
469
+
470
+ /// Returns (1/2) (specialization for complex<float>)
471
+ template <> CUTLASS_HOST_DEVICE complex<float> half< complex<float> >() {
472
+ return complex<float>(half<float>(), float());
473
+ }
474
+
475
+ /// Returns sqrt(2), approximately 1.414... (specialization for float)
476
+ template <> CUTLASS_HOST_DEVICE float root_two<float>() {
477
+ uint32_t bits = 0x3fb504f3u;
478
+ return reinterpret_cast<float const &>(bits);
479
+ }
480
+
481
+ /// Returns sqrt(2), approximately 1.414... (specialization for complex<float>)
482
+ template <> CUTLASS_HOST_DEVICE complex<float> root_two< complex<float> >() {
483
+ return complex<float>(root_two<float>(), float());
484
+ }
485
+
486
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for float)
487
+ template <> CUTLASS_HOST_DEVICE float half_root_two<float>() {
488
+ uint32_t bits = 0x3f3504f3u;
489
+ return reinterpret_cast<float const &>(bits);
490
+ }
491
+
492
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for complex<float>)
493
+ template <> CUTLASS_HOST_DEVICE complex<float> half_root_two< complex<float> >() {
494
+ return complex<float>(half_root_two<float>(), float());
495
+ }
496
+
497
+ /// Returns ln(2), approximately 0.693... (specialization for float)
498
+ template <> CUTLASS_HOST_DEVICE float ln_two<float>() {
499
+ uint32_t bits = 0x3f317218u;
500
+ return reinterpret_cast<float const &>(bits);
501
+ }
502
+
503
+ /// Returns ln(2), approximately 0.693... (specialization for complex<float>)
504
+ template <> CUTLASS_HOST_DEVICE complex<float> ln_two< complex<float> >() {
505
+ return complex<float>(ln_two<float>(), float());
506
+ }
507
+
508
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for float)
509
+ template <> CUTLASS_HOST_DEVICE float ln_ln_two<float>() {
510
+ uint32_t bits = 0xbebba795u;
511
+ return reinterpret_cast<float const &>(bits);
512
+ }
513
+
514
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for complex<float>)
515
+ template <> CUTLASS_HOST_DEVICE complex<float> ln_ln_two< complex<float> >() {
516
+ return complex<float>(ln_ln_two<float>(), float());
517
+ }
518
+
519
+ /// Returns 1/3, approximately 0.333... (specialization for float)
520
+ template <> CUTLASS_HOST_DEVICE float third<float>() {
521
+ uint32_t bits = 0x3eaaaaabu;
522
+ return reinterpret_cast<float const &>(bits);
523
+ }
524
+
525
+ /// Returns 1/3, approximately 0.333... (specialization for complex<float>)
526
+ template <> CUTLASS_HOST_DEVICE complex<float> third< complex<float> >() {
527
+ return complex<float>(third<float>(), float());
528
+ }
529
+
530
+ /// Returns 2/3, approximately 0.666... (specialization for float)
531
+ template <> CUTLASS_HOST_DEVICE float twothirds<float>() {
532
+ uint32_t bits = 0x3f2aaaabu;
533
+ return reinterpret_cast<float const &>(bits);
534
+ }
535
+
536
+ /// Returns 2/3, approximately 0.666... (specialization for complex<float>)
537
+ template <> CUTLASS_HOST_DEVICE complex<float> twothirds< complex<float> >() {
538
+ return complex<float>(twothirds<float>(), float());
539
+ }
540
+
541
+ /// Returns pi - 3, approximately 0.1416... (specialization for float)
542
+ template <> CUTLASS_HOST_DEVICE float pi_minus_three<float>() {
543
+ uint32_t bits = 0x3e10fdaau;
544
+ return reinterpret_cast<float const &>(bits);
545
+ }
546
+
547
+ /// Returns pi - 3, approximately 0.1416... (specialization for complex<float>)
548
+ template <> CUTLASS_HOST_DEVICE complex<float> pi_minus_three< complex<float> >() {
549
+ return complex<float>(pi_minus_three<float>(), float());
550
+ }
551
+
552
+ /// Returns 4 - pi, approximately 0.858... (specialization for float)
553
+ template <> CUTLASS_HOST_DEVICE float four_minus_pi<float>() {
554
+ uint32_t bits = 0x3f5bc095u;
555
+ return reinterpret_cast<float const &>(bits);
556
+ }
557
+
558
+ /// Returns 4 - pi, approximately 0.858... (specialization for complex<float>)
559
+ template <> CUTLASS_HOST_DEVICE complex<float> four_minus_pi< complex<float> >() {
560
+ return complex<float>(four_minus_pi<float>(), float());
561
+ }
562
+
563
+ /////////////////////////////////////////////////////////////////////////////////////
564
+
565
+ // Specialization for tfloat32_t
566
+
567
+ /// Returns 1, the multiplicative identity element (specialization for tfloat32_t)
568
+ template <> CUTLASS_HOST_DEVICE tfloat32_t one<tfloat32_t>() {
569
+ uint32_t bits = 0x3f801000u;
570
+ return reinterpret_cast<tfloat32_t const &>(bits);
571
+ }
572
+
573
+ /// Returns 1, the multiplicative identity element (specialization for complex<tfloat32_t>)
574
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> one< complex<tfloat32_t> >() {
575
+ return complex<tfloat32_t>(one<tfloat32_t>(), tfloat32_t());
576
+ }
577
+
578
+ /// Returns 0, the additive identity element (specialization for tfloat32_t)
579
+ template <> CUTLASS_HOST_DEVICE tfloat32_t zero<tfloat32_t>() {
580
+ uint32_t bits = 0x1000u;
581
+ return reinterpret_cast<tfloat32_t const &>(bits);
582
+ }
583
+
584
+ /// Returns 0, the additive identity element (specialization for complex<tfloat32_t>)
585
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> zero< complex<tfloat32_t> >() {
586
+ return complex<tfloat32_t>(zero<tfloat32_t>(), tfloat32_t());
587
+ }
588
+
589
+ /// Returns 2 (specialization for tfloat32_t)
590
+ template <> CUTLASS_HOST_DEVICE tfloat32_t two<tfloat32_t>() {
591
+ uint32_t bits = 0x40001000u;
592
+ return reinterpret_cast<tfloat32_t const &>(bits);
593
+ }
594
+
595
+ /// Returns 2 (specialization for complex<tfloat32_t>)
596
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> two< complex<tfloat32_t> >() {
597
+ return complex<tfloat32_t>(two<tfloat32_t>(), tfloat32_t());
598
+ }
599
+
600
+ /// Returns pi, approximately 3.141 (specialization for tfloat32_t)
601
+ template <> CUTLASS_HOST_DEVICE tfloat32_t pi<tfloat32_t>() {
602
+ uint32_t bits = 0x40491fdbu;
603
+ return reinterpret_cast<tfloat32_t const &>(bits);
604
+ }
605
+
606
+ /// Returns pi, approximately 3.141 (specialization for complex<tfloat32_t>)
607
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> pi< complex<tfloat32_t> >() {
608
+ return complex<tfloat32_t>(pi<tfloat32_t>(), tfloat32_t());
609
+ }
610
+
611
+ /// Returns 2 * pi (specialization for tfloat32_t)
612
+ template <> CUTLASS_HOST_DEVICE tfloat32_t two_pi<tfloat32_t>() {
613
+ uint32_t bits = 0x40c91fdbu;
614
+ return reinterpret_cast<tfloat32_t const &>(bits);
615
+ }
616
+
617
+ /// Returns 2 * pi (specialization for complex<tfloat32_t>)
618
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> two_pi< complex<tfloat32_t> >() {
619
+ return complex<tfloat32_t>(two_pi<tfloat32_t>(), tfloat32_t());
620
+ }
621
+
622
+ /// Returns pi / 2 (specialization for tfloat32_t)
623
+ template <> CUTLASS_HOST_DEVICE tfloat32_t half_pi<tfloat32_t>() {
624
+ uint32_t bits = 0x3fc91fdbu;
625
+ return reinterpret_cast<tfloat32_t const &>(bits);
626
+ }
627
+
628
+ /// Returns pi / 2 (specialization for complex<tfloat32_t>)
629
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half_pi< complex<tfloat32_t> >() {
630
+ return complex<tfloat32_t>(half_pi<tfloat32_t>(), tfloat32_t());
631
+ }
632
+
633
+ /// Returns sqrt(pi) (specialization for tfloat32_t)
634
+ template <> CUTLASS_HOST_DEVICE tfloat32_t root_pi<tfloat32_t>() {
635
+ uint32_t bits = 0x3fe2efc5u;
636
+ return reinterpret_cast<tfloat32_t const &>(bits);
637
+ }
638
+
639
+ /// Returns sqrt(pi) (specialization for complex<tfloat32_t>)
640
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_pi< complex<tfloat32_t> >() {
641
+ return complex<tfloat32_t>(root_pi<tfloat32_t>(), tfloat32_t());
642
+ }
643
+
644
+ /// Returns sqrt(pi / 2) (specialization for tfloat32_t)
645
+ template <> CUTLASS_HOST_DEVICE tfloat32_t root_half_pi<tfloat32_t>() {
646
+ uint32_t bits = 0x3fa07c99u;
647
+ return reinterpret_cast<tfloat32_t const &>(bits);
648
+ }
649
+
650
+ /// Returns sqrt(pi / 2) (specialization for complex<tfloat32_t>)
651
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_half_pi< complex<tfloat32_t> >() {
652
+ return complex<tfloat32_t>(root_half_pi<tfloat32_t>(), tfloat32_t());
653
+ }
654
+
655
+ /// Returns sqrt(2 * pi) (specialization for tfloat32_t)
656
+ template <> CUTLASS_HOST_DEVICE tfloat32_t root_two_pi<tfloat32_t>() {
657
+ uint32_t bits = 0x40207c99u;
658
+ return reinterpret_cast<tfloat32_t const &>(bits);
659
+ }
660
+
661
+ /// Returns sqrt(2 * pi) (specialization for complex<tfloat32_t>)
662
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_two_pi< complex<tfloat32_t> >() {
663
+ return complex<tfloat32_t>(root_two_pi<tfloat32_t>(), tfloat32_t());
664
+ }
665
+
666
+ /// Returns sqrt(ln(4)) (specialization for tfloat32_t)
667
+ template <> CUTLASS_HOST_DEVICE tfloat32_t root_ln_four<tfloat32_t>() {
668
+ uint32_t bits = 0x3f96c55fu;
669
+ return reinterpret_cast<tfloat32_t const &>(bits);
670
+ }
671
+
672
+ /// Returns sqrt(ln(4)) (specialization for complex<tfloat32_t>)
673
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_ln_four< complex<tfloat32_t> >() {
674
+ return complex<tfloat32_t>(root_ln_four<tfloat32_t>(), tfloat32_t());
675
+ }
676
+
677
+ /// Returns e, approximately 2.718... (specialization for tfloat32_t)
678
+ template <> CUTLASS_HOST_DEVICE tfloat32_t e<tfloat32_t>() {
679
+ uint32_t bits = 0x402e0854u;
680
+ return reinterpret_cast<tfloat32_t const &>(bits);
681
+ }
682
+
683
+ /// Returns e, approximately 2.718... (specialization for complex<tfloat32_t>)
684
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> e< complex<tfloat32_t> >() {
685
+ return complex<tfloat32_t>(e<tfloat32_t>(), tfloat32_t());
686
+ }
687
+
688
+ /// Returns (1/2) (specialization for tfloat32_t)
689
+ template <> CUTLASS_HOST_DEVICE tfloat32_t half<tfloat32_t>() {
690
+ uint32_t bits = 0x3f001000u;
691
+ return reinterpret_cast<tfloat32_t const &>(bits);
692
+ }
693
+
694
+ /// Returns (1/2) (specialization for complex<tfloat32_t>)
695
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half< complex<tfloat32_t> >() {
696
+ return complex<tfloat32_t>(half<tfloat32_t>(), tfloat32_t());
697
+ }
698
+
699
+ /// Returns sqrt(2), approximately 1.414... (specialization for tfloat32_t)
700
+ template <> CUTLASS_HOST_DEVICE tfloat32_t root_two<tfloat32_t>() {
701
+ uint32_t bits = 0x3fb514f3u;
702
+ return reinterpret_cast<tfloat32_t const &>(bits);
703
+ }
704
+
705
+ /// Returns sqrt(2), approximately 1.414... (specialization for complex<tfloat32_t>)
706
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> root_two< complex<tfloat32_t> >() {
707
+ return complex<tfloat32_t>(root_two<tfloat32_t>(), tfloat32_t());
708
+ }
709
+
710
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for tfloat32_t)
711
+ template <> CUTLASS_HOST_DEVICE tfloat32_t half_root_two<tfloat32_t>() {
712
+ uint32_t bits = 0x3f3514f3u;
713
+ return reinterpret_cast<tfloat32_t const &>(bits);
714
+ }
715
+
716
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for complex<tfloat32_t>)
717
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> half_root_two< complex<tfloat32_t> >() {
718
+ return complex<tfloat32_t>(half_root_two<tfloat32_t>(), tfloat32_t());
719
+ }
720
+
721
+ /// Returns ln(2), approximately 0.693... (specialization for tfloat32_t)
722
+ template <> CUTLASS_HOST_DEVICE tfloat32_t ln_two<tfloat32_t>() {
723
+ uint32_t bits = 0x3f318218u;
724
+ return reinterpret_cast<tfloat32_t const &>(bits);
725
+ }
726
+
727
+ /// Returns ln(2), approximately 0.693... (specialization for complex<tfloat32_t>)
728
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> ln_two< complex<tfloat32_t> >() {
729
+ return complex<tfloat32_t>(ln_two<tfloat32_t>(), tfloat32_t());
730
+ }
731
+
732
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for tfloat32_t)
733
+ template <> CUTLASS_HOST_DEVICE tfloat32_t ln_ln_two<tfloat32_t>() {
734
+ uint32_t bits = 0xbebbb795u;
735
+ return reinterpret_cast<tfloat32_t const &>(bits);
736
+ }
737
+
738
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for complex<tfloat32_t>)
739
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> ln_ln_two< complex<tfloat32_t> >() {
740
+ return complex<tfloat32_t>(ln_ln_two<tfloat32_t>(), tfloat32_t());
741
+ }
742
+
743
+ /// Returns 1/3, approximately 0.333... (specialization for tfloat32_t)
744
+ template <> CUTLASS_HOST_DEVICE tfloat32_t third<tfloat32_t>() {
745
+ uint32_t bits = 0x3eaabaabu;
746
+ return reinterpret_cast<tfloat32_t const &>(bits);
747
+ }
748
+
749
+ /// Returns 1/3, approximately 0.333... (specialization for complex<tfloat32_t>)
750
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> third< complex<tfloat32_t> >() {
751
+ return complex<tfloat32_t>(third<tfloat32_t>(), tfloat32_t());
752
+ }
753
+
754
+ /// Returns 2/3, approximately 0.666... (specialization for tfloat32_t)
755
+ template <> CUTLASS_HOST_DEVICE tfloat32_t twothirds<tfloat32_t>() {
756
+ uint32_t bits = 0x3f2abaabu;
757
+ return reinterpret_cast<tfloat32_t const &>(bits);
758
+ }
759
+
760
+ /// Returns 2/3, approximately 0.666... (specialization for complex<tfloat32_t>)
761
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> twothirds< complex<tfloat32_t> >() {
762
+ return complex<tfloat32_t>(twothirds<tfloat32_t>(), tfloat32_t());
763
+ }
764
+
765
+ /// Returns pi - 3, approximately 0.1416... (specialization for tfloat32_t)
766
+ template <> CUTLASS_HOST_DEVICE tfloat32_t pi_minus_three<tfloat32_t>() {
767
+ uint32_t bits = 0x3e110daau;
768
+ return reinterpret_cast<tfloat32_t const &>(bits);
769
+ }
770
+
771
+ /// Returns pi - 3, approximately 0.1416... (specialization for complex<tfloat32_t>)
772
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> pi_minus_three< complex<tfloat32_t> >() {
773
+ return complex<tfloat32_t>(pi_minus_three<tfloat32_t>(), tfloat32_t());
774
+ }
775
+
776
+ /// Returns 4 - pi, approximately 0.858... (specialization for tfloat32_t)
777
+ template <> CUTLASS_HOST_DEVICE tfloat32_t four_minus_pi<tfloat32_t>() {
778
+ uint32_t bits = 0x3f5bd095u;
779
+ return reinterpret_cast<tfloat32_t const &>(bits);
780
+ }
781
+
782
+ /// Returns 4 - pi, approximately 0.858... (specialization for complex<tfloat32_t>)
783
+ template <> CUTLASS_HOST_DEVICE complex<tfloat32_t> four_minus_pi< complex<tfloat32_t> >() {
784
+ return complex<tfloat32_t>(four_minus_pi<tfloat32_t>(), tfloat32_t());
785
+ }
786
+
787
+ /////////////////////////////////////////////////////////////////////////////////////
788
+
789
+ // Specialization for half_t
790
+
791
+ /// Returns 1, the multiplicative identity element (specialization for half_t)
792
+ template <> CUTLASS_HOST_DEVICE half_t one<half_t>() {
793
+ uint16_t bits = 0x3c00u;
794
+ return reinterpret_cast<half_t const &>(bits);
795
+ }
796
+
797
+ /// Returns 1, the multiplicative identity element (specialization for complex<half_t>)
798
+ template <> CUTLASS_HOST_DEVICE complex<half_t> one< complex<half_t> >() {
799
+ return complex<half_t>(one<half_t>(), half_t());
800
+ }
801
+
802
+ /// Returns 0, the additive identity element (specialization for half_t)
803
+ template <> CUTLASS_HOST_DEVICE half_t zero<half_t>() {
804
+ uint16_t bits = 0x0u;
805
+ return reinterpret_cast<half_t const &>(bits);
806
+ }
807
+
808
+ /// Returns 0, the additive identity element (specialization for complex<half_t>)
809
+ template <> CUTLASS_HOST_DEVICE complex<half_t> zero< complex<half_t> >() {
810
+ return complex<half_t>(zero<half_t>(), half_t());
811
+ }
812
+
813
+ /// Returns 2 (specialization for half_t)
814
+ template <> CUTLASS_HOST_DEVICE half_t two<half_t>() {
815
+ uint16_t bits = 0x4000u;
816
+ return reinterpret_cast<half_t const &>(bits);
817
+ }
818
+
819
+ /// Returns 2 (specialization for complex<half_t>)
820
+ template <> CUTLASS_HOST_DEVICE complex<half_t> two< complex<half_t> >() {
821
+ return complex<half_t>(two<half_t>(), half_t());
822
+ }
823
+
824
+ /// Returns pi, approximately 3.141 (specialization for half_t)
825
+ template <> CUTLASS_HOST_DEVICE half_t pi<half_t>() {
826
+ uint16_t bits = 0x4248u;
827
+ return reinterpret_cast<half_t const &>(bits);
828
+ }
829
+
830
+ /// Returns pi, approximately 3.141 (specialization for complex<half_t>)
831
+ template <> CUTLASS_HOST_DEVICE complex<half_t> pi< complex<half_t> >() {
832
+ return complex<half_t>(pi<half_t>(), half_t());
833
+ }
834
+
835
+ /// Returns 2 * pi (specialization for half_t)
836
+ template <> CUTLASS_HOST_DEVICE half_t two_pi<half_t>() {
837
+ uint16_t bits = 0x4648u;
838
+ return reinterpret_cast<half_t const &>(bits);
839
+ }
840
+
841
+ /// Returns 2 * pi (specialization for complex<half_t>)
842
+ template <> CUTLASS_HOST_DEVICE complex<half_t> two_pi< complex<half_t> >() {
843
+ return complex<half_t>(two_pi<half_t>(), half_t());
844
+ }
845
+
846
+ /// Returns pi / 2 (specialization for half_t)
847
+ template <> CUTLASS_HOST_DEVICE half_t half_pi<half_t>() {
848
+ uint16_t bits = 0x3e48u;
849
+ return reinterpret_cast<half_t const &>(bits);
850
+ }
851
+
852
+ /// Returns pi / 2 (specialization for complex<half_t>)
853
+ template <> CUTLASS_HOST_DEVICE complex<half_t> half_pi< complex<half_t> >() {
854
+ return complex<half_t>(half_pi<half_t>(), half_t());
855
+ }
856
+
857
+ /// Returns sqrt(pi) (specialization for half_t)
858
+ template <> CUTLASS_HOST_DEVICE half_t root_pi<half_t>() {
859
+ uint16_t bits = 0x3f17u;
860
+ return reinterpret_cast<half_t const &>(bits);
861
+ }
862
+
863
+ /// Returns sqrt(pi) (specialization for complex<half_t>)
864
+ template <> CUTLASS_HOST_DEVICE complex<half_t> root_pi< complex<half_t> >() {
865
+ return complex<half_t>(root_pi<half_t>(), half_t());
866
+ }
867
+
868
+ /// Returns sqrt(pi / 2) (specialization for half_t)
869
+ template <> CUTLASS_HOST_DEVICE half_t root_half_pi<half_t>() {
870
+ uint16_t bits = 0x3d03u;
871
+ return reinterpret_cast<half_t const &>(bits);
872
+ }
873
+
874
+ /// Returns sqrt(pi / 2) (specialization for complex<half_t>)
875
+ template <> CUTLASS_HOST_DEVICE complex<half_t> root_half_pi< complex<half_t> >() {
876
+ return complex<half_t>(root_half_pi<half_t>(), half_t());
877
+ }
878
+
879
+ /// Returns sqrt(2 * pi) (specialization for half_t)
880
+ template <> CUTLASS_HOST_DEVICE half_t root_two_pi<half_t>() {
881
+ uint16_t bits = 0x4103u;
882
+ return reinterpret_cast<half_t const &>(bits);
883
+ }
884
+
885
+ /// Returns sqrt(2 * pi) (specialization for complex<half_t>)
886
+ template <> CUTLASS_HOST_DEVICE complex<half_t> root_two_pi< complex<half_t> >() {
887
+ return complex<half_t>(root_two_pi<half_t>(), half_t());
888
+ }
889
+
890
+ /// Returns sqrt(ln(4)) (specialization for half_t)
891
+ template <> CUTLASS_HOST_DEVICE half_t root_ln_four<half_t>() {
892
+ uint16_t bits = 0x3cb6u;
893
+ return reinterpret_cast<half_t const &>(bits);
894
+ }
895
+
896
+ /// Returns sqrt(ln(4)) (specialization for complex<half_t>)
897
+ template <> CUTLASS_HOST_DEVICE complex<half_t> root_ln_four< complex<half_t> >() {
898
+ return complex<half_t>(root_ln_four<half_t>(), half_t());
899
+ }
900
+
901
+ /// Returns e, approximately 2.718... (specialization for half_t)
902
+ template <> CUTLASS_HOST_DEVICE half_t e<half_t>() {
903
+ uint16_t bits = 0x4170u;
904
+ return reinterpret_cast<half_t const &>(bits);
905
+ }
906
+
907
+ /// Returns e, approximately 2.718... (specialization for complex<half_t>)
908
+ template <> CUTLASS_HOST_DEVICE complex<half_t> e< complex<half_t> >() {
909
+ return complex<half_t>(e<half_t>(), half_t());
910
+ }
911
+
912
+ /// Returns (1/2) (specialization for half_t)
913
+ template <> CUTLASS_HOST_DEVICE half_t half<half_t>() {
914
+ uint16_t bits = 0x3800u;
915
+ return reinterpret_cast<half_t const &>(bits);
916
+ }
917
+
918
+ /// Returns (1/2) (specialization for complex<half_t>)
919
+ template <> CUTLASS_HOST_DEVICE complex<half_t> half< complex<half_t> >() {
920
+ return complex<half_t>(half<half_t>(), half_t());
921
+ }
922
+
923
+ /// Returns sqrt(2), approximately 1.414... (specialization for half_t)
924
+ template <> CUTLASS_HOST_DEVICE half_t root_two<half_t>() {
925
+ uint16_t bits = 0x3da8u;
926
+ return reinterpret_cast<half_t const &>(bits);
927
+ }
928
+
929
+ /// Returns sqrt(2), approximately 1.414... (specialization for complex<half_t>)
930
+ template <> CUTLASS_HOST_DEVICE complex<half_t> root_two< complex<half_t> >() {
931
+ return complex<half_t>(root_two<half_t>(), half_t());
932
+ }
933
+
934
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for half_t)
935
+ template <> CUTLASS_HOST_DEVICE half_t half_root_two<half_t>() {
936
+ uint16_t bits = 0x39a8u;
937
+ return reinterpret_cast<half_t const &>(bits);
938
+ }
939
+
940
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for complex<half_t>)
941
+ template <> CUTLASS_HOST_DEVICE complex<half_t> half_root_two< complex<half_t> >() {
942
+ return complex<half_t>(half_root_two<half_t>(), half_t());
943
+ }
944
+
945
+ /// Returns ln(2), approximately 0.693... (specialization for half_t)
946
+ template <> CUTLASS_HOST_DEVICE half_t ln_two<half_t>() {
947
+ uint16_t bits = 0x398cu;
948
+ return reinterpret_cast<half_t const &>(bits);
949
+ }
950
+
951
+ /// Returns ln(2), approximately 0.693... (specialization for complex<half_t>)
952
+ template <> CUTLASS_HOST_DEVICE complex<half_t> ln_two< complex<half_t> >() {
953
+ return complex<half_t>(ln_two<half_t>(), half_t());
954
+ }
955
+
956
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for half_t)
957
+ template <> CUTLASS_HOST_DEVICE half_t ln_ln_two<half_t>() {
958
+ uint16_t bits = 0xb5ddu;
959
+ return reinterpret_cast<half_t const &>(bits);
960
+ }
961
+
962
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for complex<half_t>)
963
+ template <> CUTLASS_HOST_DEVICE complex<half_t> ln_ln_two< complex<half_t> >() {
964
+ return complex<half_t>(ln_ln_two<half_t>(), half_t());
965
+ }
966
+
967
+ /// Returns 1/3, approximately 0.333... (specialization for half_t)
968
+ template <> CUTLASS_HOST_DEVICE half_t third<half_t>() {
969
+ uint16_t bits = 0x3555u;
970
+ return reinterpret_cast<half_t const &>(bits);
971
+ }
972
+
973
+ /// Returns 1/3, approximately 0.333... (specialization for complex<half_t>)
974
+ template <> CUTLASS_HOST_DEVICE complex<half_t> third< complex<half_t> >() {
975
+ return complex<half_t>(third<half_t>(), half_t());
976
+ }
977
+
978
+ /// Returns 2/3, approximately 0.666... (specialization for half_t)
979
+ template <> CUTLASS_HOST_DEVICE half_t twothirds<half_t>() {
980
+ uint16_t bits = 0x3955u;
981
+ return reinterpret_cast<half_t const &>(bits);
982
+ }
983
+
984
+ /// Returns 2/3, approximately 0.666... (specialization for complex<half_t>)
985
+ template <> CUTLASS_HOST_DEVICE complex<half_t> twothirds< complex<half_t> >() {
986
+ return complex<half_t>(twothirds<half_t>(), half_t());
987
+ }
988
+
989
+ /// Returns pi - 3, approximately 0.1416... (specialization for half_t)
990
+ template <> CUTLASS_HOST_DEVICE half_t pi_minus_three<half_t>() {
991
+ uint16_t bits = 0x3088u;
992
+ return reinterpret_cast<half_t const &>(bits);
993
+ }
994
+
995
+ /// Returns pi - 3, approximately 0.1416... (specialization for complex<half_t>)
996
+ template <> CUTLASS_HOST_DEVICE complex<half_t> pi_minus_three< complex<half_t> >() {
997
+ return complex<half_t>(pi_minus_three<half_t>(), half_t());
998
+ }
999
+
1000
+ /// Returns 4 - pi, approximately 0.858... (specialization for half_t)
1001
+ template <> CUTLASS_HOST_DEVICE half_t four_minus_pi<half_t>() {
1002
+ uint16_t bits = 0x3adeu;
1003
+ return reinterpret_cast<half_t const &>(bits);
1004
+ }
1005
+
1006
+ /// Returns 4 - pi, approximately 0.858... (specialization for complex<half_t>)
1007
+ template <> CUTLASS_HOST_DEVICE complex<half_t> four_minus_pi< complex<half_t> >() {
1008
+ return complex<half_t>(four_minus_pi<half_t>(), half_t());
1009
+ }
1010
+
1011
+ /////////////////////////////////////////////////////////////////////////////////////
1012
+
1013
+ // Specialization for bfloat16_t
1014
+
1015
+ /// Returns 1, the multiplicative identity element (specialization for bfloat16_t)
1016
+ template <> CUTLASS_HOST_DEVICE bfloat16_t one<bfloat16_t>() {
1017
+ uint16_t bits = 0x3f80u;
1018
+ return reinterpret_cast<bfloat16_t const &>(bits);
1019
+ }
1020
+
1021
+ /// Returns 1, the multiplicative identity element (specialization for complex<bfloat16_t>)
1022
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> one< complex<bfloat16_t> >() {
1023
+ return complex<bfloat16_t>(one<bfloat16_t>(), bfloat16_t());
1024
+ }
1025
+
1026
+ /// Returns 0, the additive identity element (specialization for bfloat16_t)
1027
+ template <> CUTLASS_HOST_DEVICE bfloat16_t zero<bfloat16_t>() {
1028
+ uint16_t bits = 0x0u;
1029
+ return reinterpret_cast<bfloat16_t const &>(bits);
1030
+ }
1031
+
1032
+ /// Returns 0, the additive identity element (specialization for complex<bfloat16_t>)
1033
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> zero< complex<bfloat16_t> >() {
1034
+ return complex<bfloat16_t>(zero<bfloat16_t>(), bfloat16_t());
1035
+ }
1036
+
1037
+ /// Returns 2 (specialization for bfloat16_t)
1038
+ template <> CUTLASS_HOST_DEVICE bfloat16_t two<bfloat16_t>() {
1039
+ uint16_t bits = 0x4000u;
1040
+ return reinterpret_cast<bfloat16_t const &>(bits);
1041
+ }
1042
+
1043
+ /// Returns 2 (specialization for complex<bfloat16_t>)
1044
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> two< complex<bfloat16_t> >() {
1045
+ return complex<bfloat16_t>(two<bfloat16_t>(), bfloat16_t());
1046
+ }
1047
+
1048
+ /// Returns pi, approximately 3.141 (specialization for bfloat16_t)
1049
+ template <> CUTLASS_HOST_DEVICE bfloat16_t pi<bfloat16_t>() {
1050
+ uint16_t bits = 0x4049u;
1051
+ return reinterpret_cast<bfloat16_t const &>(bits);
1052
+ }
1053
+
1054
+ /// Returns pi, approximately 3.141 (specialization for complex<bfloat16_t>)
1055
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> pi< complex<bfloat16_t> >() {
1056
+ return complex<bfloat16_t>(pi<bfloat16_t>(), bfloat16_t());
1057
+ }
1058
+
1059
+ /// Returns 2 * pi (specialization for bfloat16_t)
1060
+ template <> CUTLASS_HOST_DEVICE bfloat16_t two_pi<bfloat16_t>() {
1061
+ uint16_t bits = 0x40c9u;
1062
+ return reinterpret_cast<bfloat16_t const &>(bits);
1063
+ }
1064
+
1065
+ /// Returns 2 * pi (specialization for complex<bfloat16_t>)
1066
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> two_pi< complex<bfloat16_t> >() {
1067
+ return complex<bfloat16_t>(two_pi<bfloat16_t>(), bfloat16_t());
1068
+ }
1069
+
1070
+ /// Returns pi / 2 (specialization for bfloat16_t)
1071
+ template <> CUTLASS_HOST_DEVICE bfloat16_t half_pi<bfloat16_t>() {
1072
+ uint16_t bits = 0x3fc9u;
1073
+ return reinterpret_cast<bfloat16_t const &>(bits);
1074
+ }
1075
+
1076
+ /// Returns pi / 2 (specialization for complex<bfloat16_t>)
1077
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half_pi< complex<bfloat16_t> >() {
1078
+ return complex<bfloat16_t>(half_pi<bfloat16_t>(), bfloat16_t());
1079
+ }
1080
+
1081
+ /// Returns sqrt(pi) (specialization for bfloat16_t)
1082
+ template <> CUTLASS_HOST_DEVICE bfloat16_t root_pi<bfloat16_t>() {
1083
+ uint16_t bits = 0x3fe3u;
1084
+ return reinterpret_cast<bfloat16_t const &>(bits);
1085
+ }
1086
+
1087
+ /// Returns sqrt(pi) (specialization for complex<bfloat16_t>)
1088
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_pi< complex<bfloat16_t> >() {
1089
+ return complex<bfloat16_t>(root_pi<bfloat16_t>(), bfloat16_t());
1090
+ }
1091
+
1092
+ /// Returns sqrt(pi / 2) (specialization for bfloat16_t)
1093
+ template <> CUTLASS_HOST_DEVICE bfloat16_t root_half_pi<bfloat16_t>() {
1094
+ uint16_t bits = 0x3fa0u;
1095
+ return reinterpret_cast<bfloat16_t const &>(bits);
1096
+ }
1097
+
1098
+ /// Returns sqrt(pi / 2) (specialization for complex<bfloat16_t>)
1099
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_half_pi< complex<bfloat16_t> >() {
1100
+ return complex<bfloat16_t>(root_half_pi<bfloat16_t>(), bfloat16_t());
1101
+ }
1102
+
1103
+ /// Returns sqrt(2 * pi) (specialization for bfloat16_t)
1104
+ template <> CUTLASS_HOST_DEVICE bfloat16_t root_two_pi<bfloat16_t>() {
1105
+ uint16_t bits = 0x4020u;
1106
+ return reinterpret_cast<bfloat16_t const &>(bits);
1107
+ }
1108
+
1109
+ /// Returns sqrt(2 * pi) (specialization for complex<bfloat16_t>)
1110
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_two_pi< complex<bfloat16_t> >() {
1111
+ return complex<bfloat16_t>(root_two_pi<bfloat16_t>(), bfloat16_t());
1112
+ }
1113
+
1114
+ /// Returns sqrt(ln(4)) (specialization for bfloat16_t)
1115
+ template <> CUTLASS_HOST_DEVICE bfloat16_t root_ln_four<bfloat16_t>() {
1116
+ uint16_t bits = 0x3f97u;
1117
+ return reinterpret_cast<bfloat16_t const &>(bits);
1118
+ }
1119
+
1120
+ /// Returns sqrt(ln(4)) (specialization for complex<bfloat16_t>)
1121
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_ln_four< complex<bfloat16_t> >() {
1122
+ return complex<bfloat16_t>(root_ln_four<bfloat16_t>(), bfloat16_t());
1123
+ }
1124
+
1125
+ /// Returns e, approximately 2.718... (specialization for bfloat16_t)
1126
+ template <> CUTLASS_HOST_DEVICE bfloat16_t e<bfloat16_t>() {
1127
+ uint16_t bits = 0x402eu;
1128
+ return reinterpret_cast<bfloat16_t const &>(bits);
1129
+ }
1130
+
1131
+ /// Returns e, approximately 2.718... (specialization for complex<bfloat16_t>)
1132
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> e< complex<bfloat16_t> >() {
1133
+ return complex<bfloat16_t>(e<bfloat16_t>(), bfloat16_t());
1134
+ }
1135
+
1136
+ /// Returns (1/2) (specialization for bfloat16_t)
1137
+ template <> CUTLASS_HOST_DEVICE bfloat16_t half<bfloat16_t>() {
1138
+ uint16_t bits = 0x3f00u;
1139
+ return reinterpret_cast<bfloat16_t const &>(bits);
1140
+ }
1141
+
1142
+ /// Returns (1/2) (specialization for complex<bfloat16_t>)
1143
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half< complex<bfloat16_t> >() {
1144
+ return complex<bfloat16_t>(half<bfloat16_t>(), bfloat16_t());
1145
+ }
1146
+
1147
+ /// Returns sqrt(2), approximately 1.414... (specialization for bfloat16_t)
1148
+ template <> CUTLASS_HOST_DEVICE bfloat16_t root_two<bfloat16_t>() {
1149
+ uint16_t bits = 0x3fb5u;
1150
+ return reinterpret_cast<bfloat16_t const &>(bits);
1151
+ }
1152
+
1153
+ /// Returns sqrt(2), approximately 1.414... (specialization for complex<bfloat16_t>)
1154
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> root_two< complex<bfloat16_t> >() {
1155
+ return complex<bfloat16_t>(root_two<bfloat16_t>(), bfloat16_t());
1156
+ }
1157
+
1158
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for bfloat16_t)
1159
+ template <> CUTLASS_HOST_DEVICE bfloat16_t half_root_two<bfloat16_t>() {
1160
+ uint16_t bits = 0x3f35u;
1161
+ return reinterpret_cast<bfloat16_t const &>(bits);
1162
+ }
1163
+
1164
+ /// Returns sqrt(2)/2, approximately 0.707... (specialization for complex<bfloat16_t>)
1165
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> half_root_two< complex<bfloat16_t> >() {
1166
+ return complex<bfloat16_t>(half_root_two<bfloat16_t>(), bfloat16_t());
1167
+ }
1168
+
1169
+ /// Returns ln(2), approximately 0.693... (specialization for bfloat16_t)
1170
+ template <> CUTLASS_HOST_DEVICE bfloat16_t ln_two<bfloat16_t>() {
1171
+ uint16_t bits = 0x3f31u;
1172
+ return reinterpret_cast<bfloat16_t const &>(bits);
1173
+ }
1174
+
1175
+ /// Returns ln(2), approximately 0.693... (specialization for complex<bfloat16_t>)
1176
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> ln_two< complex<bfloat16_t> >() {
1177
+ return complex<bfloat16_t>(ln_two<bfloat16_t>(), bfloat16_t());
1178
+ }
1179
+
1180
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for bfloat16_t)
1181
+ template <> CUTLASS_HOST_DEVICE bfloat16_t ln_ln_two<bfloat16_t>() {
1182
+ uint16_t bits = 0xbebcu;
1183
+ return reinterpret_cast<bfloat16_t const &>(bits);
1184
+ }
1185
+
1186
+ /// Returns ln(ln(2)), approximately -0.3665... (specialization for complex<bfloat16_t>)
1187
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> ln_ln_two< complex<bfloat16_t> >() {
1188
+ return complex<bfloat16_t>(ln_ln_two<bfloat16_t>(), bfloat16_t());
1189
+ }
1190
+
1191
+ /// Returns 1/3, approximately 0.333... (specialization for bfloat16_t)
1192
+ template <> CUTLASS_HOST_DEVICE bfloat16_t third<bfloat16_t>() {
1193
+ uint16_t bits = 0x3eabu;
1194
+ return reinterpret_cast<bfloat16_t const &>(bits);
1195
+ }
1196
+
1197
+ /// Returns 1/3, approximately 0.333... (specialization for complex<bfloat16_t>)
1198
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> third< complex<bfloat16_t> >() {
1199
+ return complex<bfloat16_t>(third<bfloat16_t>(), bfloat16_t());
1200
+ }
1201
+
1202
+ /// Returns 2/3, approximately 0.666... (specialization for bfloat16_t)
1203
+ template <> CUTLASS_HOST_DEVICE bfloat16_t twothirds<bfloat16_t>() {
1204
+ uint16_t bits = 0x3f2bu;
1205
+ return reinterpret_cast<bfloat16_t const &>(bits);
1206
+ }
1207
+
1208
+ /// Returns 2/3, approximately 0.666... (specialization for complex<bfloat16_t>)
1209
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> twothirds< complex<bfloat16_t> >() {
1210
+ return complex<bfloat16_t>(twothirds<bfloat16_t>(), bfloat16_t());
1211
+ }
1212
+
1213
+ /// Returns pi - 3, approximately 0.1416... (specialization for bfloat16_t)
1214
+ template <> CUTLASS_HOST_DEVICE bfloat16_t pi_minus_three<bfloat16_t>() {
1215
+ uint16_t bits = 0x3e11u;
1216
+ return reinterpret_cast<bfloat16_t const &>(bits);
1217
+ }
1218
+
1219
+ /// Returns pi - 3, approximately 0.1416... (specialization for complex<bfloat16_t>)
1220
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> pi_minus_three< complex<bfloat16_t> >() {
1221
+ return complex<bfloat16_t>(pi_minus_three<bfloat16_t>(), bfloat16_t());
1222
+ }
1223
+
1224
+ /// Returns 4 - pi, approximately 0.858... (specialization for bfloat16_t)
1225
+ template <> CUTLASS_HOST_DEVICE bfloat16_t four_minus_pi<bfloat16_t>() {
1226
+ uint16_t bits = 0x3f5cu;
1227
+ return reinterpret_cast<bfloat16_t const &>(bits);
1228
+ }
1229
+
1230
+ /// Returns 4 - pi, approximately 0.858... (specialization for complex<bfloat16_t>)
1231
+ template <> CUTLASS_HOST_DEVICE complex<bfloat16_t> four_minus_pi< complex<bfloat16_t> >() {
1232
+ return complex<bfloat16_t>(four_minus_pi<bfloat16_t>(), bfloat16_t());
1233
+ }
1234
+ ///////////////////////////////////////////////////////////////////////////////////
1235
+
1236
+ } // namespace constants
1237
+ } // namespace cutlass
1238
+
1239
+ ///////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_builder.hpp ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include "cutlass/detail/dependent_false.hpp"
34
+ #include "cutlass/conv/collective/collective_conv.hpp"
35
+
36
+ /////////////////////////////////////////////////////////////////////////////////////////////////
37
+
38
+ namespace cutlass::conv::collective {
39
+
40
+ /////////////////////////////////////////////////////////////////////////////////////////////////
41
+
42
+ // Used to specify stage counts or dispatch to automatic computation of stage count
43
+ template<int num_stages>
44
+ struct StageCount {
45
+ static constexpr int value = num_stages;
46
+
47
+ StageCount() = default;
48
+ explicit StageCount(cute::Int<num_stages>) {}
49
+ };
50
+
51
+ template<int carveout_bytes>
52
+ struct StageCountAutoCarveout {
53
+ static constexpr int bytes = carveout_bytes;
54
+
55
+ StageCountAutoCarveout() = default;
56
+ explicit StageCountAutoCarveout(cute::Int<carveout_bytes>) {}
57
+ };
58
+
59
+ // Used to automatically let the builder pick the kernel schedule.
60
+ // Can be overridden with kernel schedule tags in cutlass/conv/dispatch_policy.hpp
61
+ struct KernelScheduleAuto {};
62
+
63
+ /////////////////////////////////////////////////////////////////////////////////////////////////
64
+
65
+ template <
66
+ class ArchTag,
67
+ class OpClass,
68
+ conv::Operator,
69
+ class ElementA,
70
+ class GmemLayoutA,
71
+ int AlignmentA,
72
+ class ElementB,
73
+ class GmemLayoutB,
74
+ int AlignmentB,
75
+ class ElementAccumulator,
76
+ class TileShape_MNK,
77
+ class ClusterShape_MNK,
78
+ class StageCountType,
79
+ class KernelScheduleType,
80
+ class Enable = void
81
+ >
82
+ struct CollectiveBuilder {
83
+ static_assert(cutlass::detail::dependent_false<ElementA>, "Could not build a collective for given parameters.");
84
+ };
85
+
86
+ /////////////////////////////////////////////////////////////////////////////////////////////////
87
+
88
+ } // namespace cutlass::conv::collective
89
+
90
+ /////////////////////////////////////////////////////////////////////////////////////////////////
91
+
92
+ #include "builders/sm90_gmma_builder.inl"
93
+ #include "builders/sm100_umma_builder.inl"
94
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_conv.hpp ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include "cutlass/detail/dependent_false.hpp"
34
+ #include "cutlass/conv/collective/detail.hpp"
35
+
36
+ /////////////////////////////////////////////////////////////////////////////////////////////////
37
+
38
+ namespace cutlass::conv::collective {
39
+
40
+ /////////////////////////////////////////////////////////////////////////////////////////////////
41
+
42
+ template <
43
+ class DispatchPolicy,
44
+ class TileShape,
45
+ class ElementA,
46
+ class ElementB,
47
+ class TiledMma,
48
+ class TileTraitsA,
49
+ class TileTraitsB
50
+ >
51
+ struct CollectiveConv {
52
+ static_assert(cutlass::detail::dependent_false<ElementA>, "Could not find a mainloop specialization.");
53
+ };
54
+
55
+ /////////////////////////////////////////////////////////////////////////////////////////////////
56
+
57
+ } // namespace cutlass::conv::collective
58
+
59
+ /////////////////////////////////////////////////////////////////////////////////////////////////
60
+
61
+ #include "sm90_implicit_gemm_gmma_ss_warpspecialized.hpp"
62
+ #include "sm100_implicit_gemm_umma_warpspecialized.hpp"
63
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/detail.hpp ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include "cutlass/conv/convnd_problem_shape.hpp"
34
+
35
+ /////////////////////////////////////////////////////////////////////////////////////////////////
36
+
37
+ namespace cutlass::conv::collective::detail {
38
+
39
+ /////////////////////////////////////////////////////////////////////////////////////////////////
40
+
41
+ // Construct the stride types for conv collectives based on the dispatch policy, strides 64b by default
42
+ template <class DispatchPolicy>
43
+ constexpr auto
44
+ sm90_dispatch_policy_to_stride_A() {
45
+ if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) {
46
+ // Maps to modes ((w,n), C)
47
+ if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
48
+ return cute::Stride<cute::Stride<int64_t, int64_t>,
49
+ cute::Int<1>>{};
50
+ }
51
+ // Maps to modes ((w,h,n), C)
52
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
53
+ return cute::Stride<cute::Stride<int64_t, int64_t, int64_t>,
54
+ cute::Int<1>>{};
55
+ }
56
+ // Maps to modes ((w,h,d,n), C)
57
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
58
+ return cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>,
59
+ cute::Int<1>>{};
60
+ }
61
+ // error dims assert
62
+ else {
63
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
64
+ }
65
+ }
66
+ else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) {
67
+ // Maps to modes (k, nq/npq/nzpq)
68
+ if constexpr (DispatchPolicy::NumSpatialDimensions == 1 ||
69
+ DispatchPolicy::NumSpatialDimensions == 2 ||
70
+ DispatchPolicy::NumSpatialDimensions == 3) {
71
+ return cute::Stride<cute::Int<1>, int64_t>{};
72
+ }
73
+ // error dims assert
74
+ else {
75
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
76
+ }
77
+ }
78
+ else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) {
79
+ // Maps to modes ((q,n), K)
80
+ if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
81
+ return cute::Stride<cute::Stride<int64_t, int64_t>,
82
+ cute::Int<1>>{};
83
+ }
84
+ // Maps to modes ((q,p,n), K)
85
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
86
+ return cute::Stride<cute::Stride<int64_t, int64_t, int64_t>,
87
+ cute::Int<1>>{};
88
+ }
89
+ // Maps to modes ((q,p,z,n), K)
90
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
91
+ return cute::Stride<cute::Stride<int64_t, int64_t, int64_t, int64_t>,
92
+ cute::Int<1>>{};
93
+ }
94
+ // error dims assert
95
+ else {
96
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
97
+ }
98
+ }
99
+ else {
100
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported ConvOp.");
101
+ }
102
+ }
103
+
104
+ // Construct the stirde types for conv collectives based on the dispatch policy, strides 64b by default
105
+ template <class DispatchPolicy>
106
+ constexpr auto
107
+ sm90_dispatch_policy_to_stride_B() {
108
+ if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) {
109
+ // Maps to modes (k, (C,s))
110
+ if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
111
+ return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t>>{};
112
+ }
113
+ // Maps to modes (k, (C,s,r))
114
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
115
+ return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t>>{};
116
+ }
117
+ // Maps to modes (k, (C,s,r,t))
118
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
119
+ return cute::Stride<int64_t, cute::Stride<cute::Int<1>, int64_t, int64_t, int64_t>>{};
120
+ }
121
+ // error dims assert
122
+ else {
123
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
124
+ }
125
+ }
126
+ else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) {
127
+ // Maps to modes (C, (w,n))
128
+ if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
129
+ return cute::Stride<cute::Int<1>,
130
+ cute::Stride<int64_t, int64_t>>{};
131
+ }
132
+ // Maps to modes (C, (w,h,n))
133
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
134
+ return cute::Stride<cute::Int<1>,
135
+ cute::Stride<int64_t, int64_t, int64_t>>{};
136
+ }
137
+ // Maps to modes (C, (w,h,d,n))
138
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
139
+ return cute::Stride<cute::Int<1>,
140
+ cute::Stride<int64_t, int64_t, int64_t, int64_t>>{};
141
+ }
142
+ // error dims assert
143
+ else {
144
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
145
+ }
146
+ }
147
+ else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) {
148
+ // Maps to modes (C, (k,s))
149
+ if constexpr (DispatchPolicy::NumSpatialDimensions == 1) {
150
+ return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t>>{};
151
+ }
152
+ // Maps to modes (C, (k,s,r))
153
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) {
154
+ return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t, int64_t>>{};
155
+ }
156
+ // Maps to modes (C, (k,s,r,t))
157
+ else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) {
158
+ return cute::Stride<cute::Int<1>, cute::Stride<int64_t, int64_t, int64_t, int64_t>>{};
159
+ }
160
+ // error dims assert
161
+ else {
162
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported spatial dim count.");
163
+ }
164
+ }
165
+ else {
166
+ static_assert(cutlass::detail::dependent_false<DispatchPolicy>, "Unsupported ConvOp.");
167
+ }
168
+ }
169
+
170
+
171
+ template <class DispatchPolicy>
172
+ constexpr auto
173
+ sm100_dispatch_policy_to_stride_A() {
174
+ return sm90_dispatch_policy_to_stride_A<DispatchPolicy>();
175
+ }
176
+
177
+ template <class DispatchPolicy>
178
+ constexpr auto
179
+ sm100_dispatch_policy_to_stride_B() {
180
+ return sm90_dispatch_policy_to_stride_B<DispatchPolicy>();
181
+ }
182
+
183
+
184
+ /////////////////////////////////////////////////////////////////////////////////////////////////
185
+
186
+ // Compute the lower/near corner, returning it as a cute::array in [W,H,D] order
187
+ template <conv::Operator ConvOp, int NumSpatialDimensions>
188
+ CUTLASS_HOST_DEVICE
189
+ constexpr auto
190
+ compute_lower_corner_whd(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
191
+ using cute::for_each;
192
+ using cute::make_seq;
193
+
194
+ cute::array<int, NumSpatialDimensions> lower{};
195
+ if constexpr (ConvOp == conv::Operator::kFprop ||
196
+ ConvOp == conv::Operator::kWgrad) {
197
+ for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
198
+ lower[NumSpatialDimensions-1-i] = -1 * problem_shape.lower_padding[i];
199
+ });
200
+ }
201
+ else if constexpr (ConvOp == conv::Operator::kDgrad) {
202
+ for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
203
+ lower[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] -
204
+ (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];
205
+ });
206
+ }
207
+ return lower;
208
+ }
209
+
210
+ // Computes the upper/far corner, returning it as a cute::array in [W,H,D] order
211
+ template <conv::Operator ConvOp, int NumSpatialDimensions>
212
+ CUTLASS_HOST_DEVICE
213
+ constexpr auto
214
+ compute_upper_corner_whd(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
215
+ using cute::for_each;
216
+ using cute::make_seq;
217
+
218
+ cute::array<int, NumSpatialDimensions> upper{};
219
+ if constexpr (ConvOp == conv::Operator::kFprop) {
220
+ for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
221
+ upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] -
222
+ (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];
223
+ });
224
+ }
225
+ else if constexpr (ConvOp == conv::Operator::kWgrad) {
226
+ for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
227
+ upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] -
228
+ (problem_shape.shape_C[i+1] - 1) * problem_shape.dilation[i];
229
+ });
230
+ }
231
+ else if constexpr (ConvOp == conv::Operator::kDgrad) {
232
+ for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
233
+ upper[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] -
234
+ (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i] + problem_shape.shape_C[i+1] - problem_shape.shape_A[i+1];
235
+ });
236
+ }
237
+ return upper;
238
+ }
239
+
240
+ // Compute the lower/near corner of (t,r,s), returning it as a cute::array in [S,R,T] order
241
+ template <conv::Operator ConvOp, int NumSpatialDimensions>
242
+ CUTLASS_HOST_DEVICE
243
+ constexpr auto
244
+ compute_lower_srt(ConvProblemShape<ConvOp, NumSpatialDimensions> const& problem_shape) {
245
+ using cute::for_each;
246
+ using cute::make_seq;
247
+
248
+ cute::array<int, NumSpatialDimensions> lower{};
249
+ if constexpr (ConvOp == conv::Operator::kFprop ||
250
+ ConvOp == conv::Operator::kWgrad) {
251
+ for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
252
+ lower[NumSpatialDimensions-1-i] = 0;
253
+ });
254
+ }
255
+ else if constexpr (ConvOp == conv::Operator::kDgrad) {
256
+ for_each(make_seq<NumSpatialDimensions>{}, [&](auto i) {
257
+ lower[NumSpatialDimensions-1-i] = (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i];
258
+ });
259
+ }
260
+ return lower;
261
+ }
262
+
263
+ template <class CopyOp> struct is_im2col_load { static constexpr bool value = false; };
264
+ template <> struct is_im2col_load<cute::SM90_TMA_LOAD_IM2COL > { static constexpr bool value = true; };
265
+ template <> struct is_im2col_load<cute::SM90_TMA_LOAD_IM2COL_MULTICAST> { static constexpr bool value = true; };
266
+ template <> struct is_im2col_load<cute::SM100_TMA_2SM_LOAD_IM2COL > { static constexpr bool value = true; };
267
+ template <> struct is_im2col_load<cute::SM100_TMA_2SM_LOAD_IM2COL_MULTICAST> { static constexpr bool value = true; };
268
+
269
+ /////////////////////////////////////////////////////////////////////////////////////////////////
270
+
271
+ } // namespace cutlass::conv::collective::detail
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp ADDED
@@ -0,0 +1,917 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+
33
+ #pragma once
34
+
35
+ #include "cutlass/cutlass.h"
36
+ #include "cutlass/gemm/dispatch_policy.hpp"
37
+ #include "cutlass/pipeline/pipeline.hpp"
38
+ #include "cutlass/gemm/gemm.h"
39
+ #include "cutlass/detail/cluster.hpp"
40
+
41
+ #include "cutlass/conv/detail.hpp"
42
+ #include "cute/algorithm/functional.hpp"
43
+ #include "cute/arch/cluster_sm90.hpp"
44
+ #include "cute/atom/mma_atom.hpp"
45
+ #include "cute/algorithm/gemm.hpp"
46
+ #include "cute/numeric/arithmetic_tuple.hpp"
47
+ #include "cutlass/trace.h"
48
+
49
+ #if (! defined(__CUDA_ARCH__)) && (CUTLASS_DEBUG_TRACE_LEVEL > 0)
50
+ # include <sstream>
51
+ #endif
52
+
53
+ /////////////////////////////////////////////////////////////////////////////////////////////////
54
+
55
+ namespace cutlass::conv::collective {
56
+ using namespace cute;
57
+
58
+ /////////////////////////////////////////////////////////////////////////////////////////////////
59
+
60
+ // WarpSpecialized Mainloop
61
+ // Both DMA Load and MMA methods of this class must be run by a single thread that's picked by elect_one
62
+ template <
63
+ conv::Operator ConvOp,
64
+ int Stages,
65
+ int NumSpatialDims,
66
+ int SchedulerPipelineStageCount,
67
+ int AccumulatorPipelineStageCount,
68
+ class ClusterShape, // Static cluster shape or dynamic (int, int, _1)
69
+ class TileShapeMNKL_, // (MmaAtomShapeM, MmaAtomShapeN, TileK, optional: TileL)
70
+ class ElementA_,
71
+ class ElementB_,
72
+ class TiledMma_,
73
+ class TileTraitsA_,
74
+ class TileTraitsB_>
75
+ struct CollectiveConv<
76
+ MainloopSm100TmaUmmaWarpSpecializedImplicitGemm<
77
+ ConvOp,
78
+ Stages,
79
+ NumSpatialDims,
80
+ SchedulerPipelineStageCount,
81
+ AccumulatorPipelineStageCount,
82
+ ClusterShape>,
83
+ TileShapeMNKL_,
84
+ ElementA_,
85
+ ElementB_,
86
+ TiledMma_,
87
+ TileTraitsA_,
88
+ TileTraitsB_>
89
+ {
90
+ //
91
+ // Type Aliases
92
+ //
93
+ using DispatchPolicy = MainloopSm100TmaUmmaWarpSpecializedImplicitGemm<
94
+ ConvOp,
95
+ Stages,
96
+ NumSpatialDims,
97
+ SchedulerPipelineStageCount,
98
+ AccumulatorPipelineStageCount,
99
+ ClusterShape>;
100
+ using TileShape = decltype(cute::take<0,3>(TileShapeMNKL_{})); // (MmaAtomShapeM, MmaAtomShapeN, TileK)
101
+ using ElementA = ElementA_;
102
+ using ElementB = ElementB_;
103
+ using TiledMma = TiledMma_;
104
+ using ElementAccumulator = typename TiledMma::ValTypeC;
105
+ using GmemTiledCopyA = typename TileTraitsA_::GmemTiledCopy;
106
+ using GmemTiledCopyB = typename TileTraitsB_::GmemTiledCopy;
107
+ using SmemLayoutAtomA = typename TileTraitsA_::SmemLayoutAtom;
108
+ using SmemLayoutAtomB = typename TileTraitsB_::SmemLayoutAtom;
109
+ using ArchTag = typename DispatchPolicy::ArchTag;
110
+ static constexpr int NumSpatialDimensions = DispatchPolicy::NumSpatialDimensions;
111
+ static constexpr int NumTensorDimensions = NumSpatialDimensions + 2;
112
+ // deduce the kernel-facing stride tuple types based on the dispatch policy (spatial dim, algo, etc.)
113
+ using StrideA = decltype(detail::sm100_dispatch_policy_to_stride_A<DispatchPolicy>());
114
+ using StrideB = decltype(detail::sm100_dispatch_policy_to_stride_B<DispatchPolicy>());
115
+
116
+ static constexpr bool IsDynamicCluster = not cute::is_static_v<ClusterShape>;
117
+ static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
118
+ static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
119
+ using TmaInternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, cute::uint_bit_t<cute::sizeof_bits_v<ElementA>>>;
120
+ using TmaInternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, cute::uint_bit_t<cute::sizeof_bits_v<ElementB>>>;
121
+
122
+ using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
123
+ using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
124
+
125
+ // Determine MMA type: MMA_1SM vs MMA_2SM
126
+ using AtomThrShapeMNK = Shape<decltype(shape<0>(typename TiledMma_::ThrLayoutVMNK{})), _1, _1>;
127
+
128
+ using MainloopPipeline = cutlass::PipelineTmaUmmaAsync<
129
+ DispatchPolicy::Stages,
130
+ ClusterShape,
131
+ AtomThrShapeMNK>;
132
+ using MainloopPipelineState = typename MainloopPipeline::PipelineState;
133
+
134
+ using ProblemShape = ConvProblemShape<ConvOp, NumSpatialDimensions>;
135
+
136
+ CUTE_STATIC_ASSERT_V(evenly_divides(shape<0>(TileShape{}), tile_size<0>(TiledMma{})), "TileShape_M should be evenly divided by TiledMma_M");
137
+ CUTE_STATIC_ASSERT_V(evenly_divides(shape<1>(TileShape{}), tile_size<1>(TiledMma{})) || (ConvOp == conv::Operator::kWgrad), "TileShape_N should be evenly divided by TiledMma_N");
138
+
139
+ using CtaShape_MNK = decltype(shape_div(TileShape{}, AtomThrShapeMNK{}));
140
+
141
+ // Define A and B block shapes for reduced size TMA_LOADs
142
+ using MmaShapeA_MK = decltype(partition_shape_A(TiledMma{}, make_shape(size<0>(TileShape{}), size<2>(TileShape{}))));
143
+ using MmaShapeB_NK = decltype(partition_shape_B(TiledMma{}, make_shape(size<1>(TileShape{}), size<2>(TileShape{}))));
144
+
145
+ static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
146
+ static_assert(((size<0,0>(MmaShapeA_MK{}) * size<1>(MmaShapeA_MK{})) % size<0>(SmemLayoutAtomA{})) == 0,
147
+ "SmemLayoutAtom must evenly divide tile shape.");
148
+ static_assert(((size<0,1>(MmaShapeA_MK{}) * size<2>(MmaShapeA_MK{})) % size<1>(SmemLayoutAtomA{})) == 0,
149
+ "SmemLayoutAtom must evenly divide tile shape.");
150
+
151
+ static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
152
+ static_assert(((size<0,0>(MmaShapeB_NK{}) * size<1>(MmaShapeB_NK{})) % size<0>(SmemLayoutAtomB{})) == 0,
153
+ "SmemLayoutAtom must evenly divide tile shape.");
154
+ static_assert(((size<0,1>(MmaShapeB_NK{}) * size<2>(MmaShapeB_NK{})) % size<1>(SmemLayoutAtomB{})) == 0,
155
+ "SmemLayoutAtom must evenly divide tile shape.");
156
+
157
+ // Tile along K mode first before tiling over MN. PIPE mode last as usual.
158
+ // This maximizes TMA boxes due to better smem-K vectorization, reducing total issued TMAs.
159
+ using SmemLayoutA = decltype(UMMA::tile_to_mma_shape(
160
+ SmemLayoutAtomA{},
161
+ append(MmaShapeA_MK{}, Int<DispatchPolicy::Stages>{}),
162
+ Step<_2,_1,_3>{}));
163
+ using SmemLayoutB = decltype(UMMA::tile_to_mma_shape(
164
+ SmemLayoutAtomB{},
165
+ append(MmaShapeB_NK{}, Int<DispatchPolicy::Stages>{}),
166
+ Step<_2,_1,_3>{}));
167
+
168
+ static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
169
+ static_assert(cute::is_base_of<cute::UMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
170
+ cute::is_base_of<cute::UMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
171
+ "MMA atom must source both A and B operand from smem_desc for this mainloop.");
172
+
173
+ static constexpr bool is_im2col_A = detail::is_im2col_load<GmemTiledCopyA>::value;
174
+ static constexpr bool is_im2col_B = detail::is_im2col_load<GmemTiledCopyB>::value;
175
+ static constexpr bool is_strided_dgrad = ConvOp == conv::Operator::kDgrad && not is_im2col_A && not is_im2col_B;
176
+
177
+ static constexpr int TileShapeMNKLRank = rank(TileShapeMNKL_{});
178
+ // If rank > 3, TileL exists and it is GroupsPerTile. The kernel is grouped conv now.
179
+ static constexpr bool is_grouped_wgrad = ConvOp == conv::Operator::kWgrad && TileShapeMNKLRank > 3;
180
+
181
+ struct SharedStorage {
182
+ struct TensorStorage : cute::aligned_struct<128, _0> {
183
+ cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
184
+ cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
185
+ } tensors;
186
+
187
+ using PipelineStorage = typename MainloopPipeline::SharedStorage;
188
+ PipelineStorage pipeline;
189
+ };
190
+
191
+ using TensorStorage = typename SharedStorage::TensorStorage;
192
+ using PipelineStorage = typename SharedStorage::PipelineStorage;
193
+
194
+ // Only one thread issues the TMA and updates the barriers in a 2SM MMA, adjust bytes accordingly
195
+ static constexpr uint32_t TmaTransactionBytes =
196
+ size(AtomThrShapeMNK{}) * (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * size<2>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof(ElementA))) +
197
+ size(AtomThrShapeMNK{}) * (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * size<2>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof(ElementB)));
198
+
199
+ // Host side kernel arguments
200
+ struct Arguments {
201
+ ElementA const* ptr_A{nullptr};
202
+ ElementB const* ptr_B{nullptr};
203
+ };
204
+
205
+ private:
206
+
207
+ // Note that for fprop and non-strided dgrad kernel, the tma load mode is im2col for tensor A and tiled for
208
+ // tensor B while for wgrad kernel, the tma load mode is tiled for tensor A and im2col for tensor
209
+ // B since operand A, B is swapped.
210
+ // For strided dgrad A and B are both tma tiled and not im2col
211
+
212
+ template <class TensorA, class ClusterShapeVMNK>
213
+ static constexpr auto
214
+ get_tma_load_a_instance(
215
+ TensorA const& tensor_a,
216
+ ProblemShape const& problem_shape,
217
+ ClusterShapeVMNK const& cluster_shape_vmnk) {
218
+
219
+ if constexpr (is_im2col_A) {
220
+ // compute the upper and lower corners based on the conv padding
221
+ auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
222
+ auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
223
+ auto lower_srt = detail::compute_lower_srt(problem_shape);
224
+
225
+ // gbasis strides for dgrad kernel need to be negated
226
+ cute::array<int32_t, NumSpatialDimensions> stride_srt{};
227
+ for (int i = 0; i < NumSpatialDimensions; ++i) {
228
+ stride_srt[i] = ConvOp == conv::Operator::kDgrad ?
229
+ -problem_shape.dilation[NumSpatialDimensions-1-i] :
230
+ problem_shape.dilation[NumSpatialDimensions-1-i];
231
+ }
232
+
233
+ return make_im2col_tma_atom_A_sm100(
234
+ GmemTiledCopyA{},
235
+ tensor_a,
236
+ SmemLayoutA{}(_,_,_,cute::Int<0>{}),
237
+ TileShape{},
238
+ TiledMma{},
239
+ cluster_shape_vmnk,
240
+ shape(lower_corner_whd),
241
+ shape(upper_corner_whd),
242
+ cute::reverse(shape(problem_shape.lower_padding)),
243
+ cute::reverse(shape(problem_shape.upper_padding)),
244
+ cute::reverse(shape(problem_shape.traversal_stride)),
245
+ shape(lower_srt),
246
+ shape(stride_srt));
247
+ }
248
+ // TMA tiled mode for tensor A in wgrad and strided dgrad
249
+ else {
250
+ return make_tma_atom_A_sm100<TmaInternalElementA>(
251
+ GmemTiledCopyA{},
252
+ tensor_a,
253
+ SmemLayoutA{}(_,_,_,cute::Int<0>{}),
254
+ TileShape{},
255
+ TiledMma{},
256
+ cluster_shape_vmnk);
257
+ }
258
+ }
259
+
260
+ template <class TensorB, class ClusterShapeVMNK>
261
+ static constexpr auto
262
+ get_tma_load_b_instance(
263
+ TensorB const& tensor_b,
264
+ ProblemShape const& problem_shape,
265
+ ClusterShapeVMNK const& cluster_shape_vmnk) {
266
+
267
+ if constexpr (is_im2col_B) {
268
+ // compute the upper and lower corners based on the conv padding
269
+ auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
270
+ auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
271
+ auto lower_srt = detail::compute_lower_srt(problem_shape);
272
+
273
+ return make_im2col_tma_atom_B_sm100(
274
+ GmemTiledCopyB{},
275
+ tensor_b,
276
+ SmemLayoutB{}(_,_,_,cute::Int<0>{}),
277
+ TileShape{},
278
+ TiledMma{},
279
+ cluster_shape_vmnk,
280
+ shape(lower_corner_whd),
281
+ shape(upper_corner_whd),
282
+ cute::reverse(shape(problem_shape.lower_padding)),
283
+ cute::reverse(shape(problem_shape.upper_padding)),
284
+ cute::reverse(shape(problem_shape.traversal_stride)),
285
+ shape(lower_srt),
286
+ cute::reverse(shape(problem_shape.dilation)));
287
+ }
288
+ else {
289
+ return make_tma_atom_B_sm100<TmaInternalElementB>(
290
+ GmemTiledCopyB{},
291
+ tensor_b,
292
+ SmemLayoutB{}(_,_,_,cute::Int<0>{}),
293
+ TileShape{},
294
+ TiledMma{},
295
+ cluster_shape_vmnk);
296
+ }
297
+ }
298
+
299
+ public:
300
+
301
+ // Performs im2col transformations on the input of type ConvProblemShape
302
+ static constexpr auto
303
+ get_problem_shape_MNKL(ProblemShape const& problem_shape) {
304
+ if constexpr (is_im2col_A || is_im2col_B) {
305
+ // transformation + im2col linearization
306
+ return cutlass::conv::detail::get_linearized_problem_shape_MNKL(problem_shape);
307
+ }
308
+ else {
309
+ // transformation
310
+ return cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape);
311
+ }
312
+ }
313
+
314
+ // Device-side kernel params
315
+ //
316
+ // Arguments has the untransformed problem shape from the user.
317
+ // Params will have the transformed problem shape.
318
+ struct Params {
319
+ using _Submode = decltype(take<0,NumTensorDimensions-1>(typename ProblemShape::TensorExtent{}));
320
+
321
+ using ClusterLayout_VMNK = decltype(tiled_divide(make_layout(conditional_return<IsDynamicCluster>(make_shape(uint32_t(0), uint32_t(0), Int<1>{}), ClusterShape{})),
322
+ make_tile(typename TiledMma::AtomThrID{})));
323
+
324
+ // Assumption: StrideA is congruent with Problem_MK
325
+ // Select TMA load type according to convolution operator.
326
+ using TensorShapeA = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
327
+ decltype(repeat_like(StrideA{}, int32_t(0))),
328
+ decltype(make_shape(_Submode{}, int32_t(0)))>;
329
+
330
+ using TensorShapeB = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
331
+ decltype(make_shape(int32_t(0), _Submode{})),
332
+ decltype(repeat_like(StrideB{}, int32_t(0)))>;
333
+
334
+ using TMA_A = decltype(get_tma_load_a_instance(
335
+ make_tensor(
336
+ make_gmem_ptr(recast_ptr<TmaInternalElementA>(nullptr)),
337
+ make_layout(TensorShapeA{}, StrideA{})),
338
+ ConvProblemShape<ConvOp, NumSpatialDimensions>{},
339
+ ClusterLayout_VMNK{}));
340
+
341
+ using TMA_B = decltype(get_tma_load_b_instance(
342
+ make_tensor(
343
+ make_gmem_ptr(recast_ptr<TmaInternalElementB>(nullptr)),
344
+ make_layout(TensorShapeB{}, StrideB{})),
345
+ ConvProblemShape<ConvOp, NumSpatialDimensions>{},
346
+ ClusterLayout_VMNK{}));
347
+
348
+ // Members
349
+ TMA_A tma_load_a;
350
+ TMA_B tma_load_b;
351
+ TMA_A tma_load_a_fallback;
352
+ TMA_B tma_load_b_fallback;
353
+ dim3 cluster_shape_fallback;
354
+ };
355
+
356
+ //
357
+ // Constructor
358
+ //
359
+ CUTLASS_DEVICE
360
+ CollectiveConv(Params const& params, ClusterShape cluster_shape, uint32_t block_rank_in_cluster)
361
+ : cluster_shape_(cluster_shape)
362
+ , block_rank_in_cluster_(block_rank_in_cluster) {
363
+ if constexpr (IsDynamicCluster) {
364
+ const bool is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x &&
365
+ cute::size<1>(cluster_shape_) == params.cluster_shape_fallback.y);
366
+ observed_tma_load_a_ = is_fallback_cluster ? &params.tma_load_a_fallback : &params.tma_load_a;
367
+ observed_tma_load_b_ = is_fallback_cluster ? &params.tma_load_b_fallback : &params.tma_load_b;
368
+ }
369
+ else {
370
+ observed_tma_load_a_ = &params.tma_load_a;
371
+ observed_tma_load_b_ = &params.tma_load_b;
372
+ }
373
+ }
374
+
375
+ //
376
+ // Methods
377
+ //
378
+
379
+ static constexpr Params
380
+ to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cutlass::KernelHardwareInfo const& hw_info = cutlass::KernelHardwareInfo{}) {
381
+ (void) workspace;
382
+
383
+ // from the flat problem shape arrays of ConvProblemShape<N>, create a rank-3 MNK problem shape tuple
384
+ // tma desc creation depends on the original untransformed domain.
385
+
386
+ // A extents.
387
+ auto shape_A_orig = problem_shape.get_shape_A();
388
+ // B extents.
389
+ auto shape_B_orig = problem_shape.get_shape_B();
390
+
391
+ // Fill inferred cute strides from flat stride arrays
392
+ auto dA = make_cute_packed_stride(StrideA{}, problem_shape.stride_A, ConvOp);
393
+ auto dB = make_cute_packed_stride(StrideB{}, problem_shape.stride_B, ConvOp);
394
+
395
+ auto ptr_A = recast_ptr<TmaInternalElementA>(args.ptr_A);
396
+ auto ptr_B = recast_ptr<TmaInternalElementB>(args.ptr_B);
397
+
398
+ Tensor tensor_a = make_tensor(make_gmem_ptr(ptr_A), make_layout(shape_A_orig, dA));
399
+ Tensor tensor_b = make_tensor(make_gmem_ptr(ptr_B), make_layout(shape_B_orig, dB));
400
+
401
+ auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape);
402
+ // Cluster layout for TMA construction
403
+ auto cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape), make_tile(typename TiledMma::AtomThrID{}));
404
+ auto cluster_shape_fallback = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape_fallback);
405
+
406
+ // Cluster layout for TMA construction
407
+ auto cluster_layout_vmnk_fallback = tiled_divide(make_layout(cluster_shape_fallback), make_tile(typename TiledMma::AtomThrID{}));
408
+
409
+ auto tma_load_a = get_tma_load_a_instance(tensor_a, problem_shape, cluster_layout_vmnk);
410
+ auto tma_load_b = get_tma_load_b_instance(tensor_b, problem_shape, cluster_layout_vmnk);
411
+ auto tma_load_a_fallback = get_tma_load_a_instance(tensor_a, problem_shape, cluster_layout_vmnk_fallback);
412
+ auto tma_load_b_fallback = get_tma_load_b_instance(tensor_b, problem_shape, cluster_layout_vmnk_fallback);
413
+
414
+ static_assert(size(typename decltype(tma_load_a)::ThrID{}) == size(AtomThrShapeMNK{}));
415
+ static_assert(size(typename decltype(tma_load_b)::ThrID{}) == size(AtomThrShapeMNK{}));
416
+
417
+ return {
418
+ tma_load_a,
419
+ tma_load_b,
420
+ tma_load_a_fallback,
421
+ tma_load_b_fallback,
422
+ hw_info.cluster_shape_fallback
423
+ };
424
+ }
425
+
426
+ template<class ProblemShape>
427
+ static bool
428
+ can_implement(
429
+ ProblemShape const& problem_shape,
430
+ Arguments const& args) {
431
+ // Activation and Filter channel mode extents must match
432
+ bool implementable = true;
433
+ // channel mode is major
434
+ {
435
+ const bool check = problem_shape.stride_A[NumTensorDimensions-1] == 1;
436
+ #if (! defined(__CUDA_ARCH__)) && (CUTLASS_DEBUG_TRACE_LEVEL > 0)
437
+ if (not check) {
438
+ const auto offending_stride =
439
+ problem_shape.stride_A[NumTensorDimensions-1];
440
+ std::ostringstream os;
441
+ os << "CollectiveConv::can_implement: "
442
+ "problem_shape.stride_A[NumTensorDimensions-1 = "
443
+ << (NumTensorDimensions-1) << "] = "
444
+ << offending_stride << " != 1";
445
+ CUTLASS_TRACE_HOST( os.str() );
446
+ }
447
+ #endif
448
+ implementable &= check;
449
+ }
450
+
451
+ {
452
+ const bool check = problem_shape.stride_B[NumTensorDimensions-1] == 1;
453
+ #if (! defined(__CUDA_ARCH__)) && (CUTLASS_DEBUG_TRACE_LEVEL > 0)
454
+ if (not check) {
455
+ const auto offending_stride =
456
+ problem_shape.stride_B[NumTensorDimensions-1];
457
+ std::ostringstream os;
458
+ os << "CollectiveConv::can_implement: "
459
+ "problem_shape.stride_B[NumTensorDimensions-1 = "
460
+ << (NumTensorDimensions-1) << "] = "
461
+ << offending_stride << " != 1\n";
462
+ CUTLASS_TRACE_HOST( os.str() );
463
+ }
464
+ #endif
465
+ implementable &= check;
466
+ }
467
+
468
+ {
469
+ const auto & traversal_stride = problem_shape.traversal_stride;
470
+ for (auto stride: traversal_stride) {
471
+ implementable &= (stride >= 1 && stride <= 8);
472
+ }
473
+ }
474
+
475
+ if constexpr (ConvOp == conv::Operator::kDgrad && not is_strided_dgrad) {
476
+ const auto & traversal_stride = problem_shape.traversal_stride;
477
+ for (auto stride: traversal_stride) {
478
+ implementable &= (stride == 1);
479
+ }
480
+ }
481
+
482
+ constexpr int tma_alignment_bits = 128;
483
+ // A extents.
484
+ auto shape_A_orig = problem_shape.get_shape_A();
485
+ // B extents.
486
+ auto shape_B_orig = problem_shape.get_shape_B();
487
+
488
+ constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
489
+ {
490
+ const bool check = cutlass::detail::check_alignment<min_tma_aligned_elements_A>(shape_A_orig, StrideA{});
491
+ if (not check) {
492
+ CUTLASS_TRACE_HOST("A shape and/or strides have alignment issue.");
493
+ }
494
+ implementable &= check;
495
+ }
496
+
497
+ constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
498
+ {
499
+ const bool check = cutlass::detail::check_alignment<min_tma_aligned_elements_B>(shape_B_orig, StrideB{});
500
+ if (not check) {
501
+ CUTLASS_TRACE_HOST("B shape and/or strides have alignment issue.");
502
+ }
503
+ implementable &= check;
504
+ }
505
+
506
+ if (not implementable) {
507
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
508
+ return false;
509
+ }
510
+
511
+ if (is_im2col_A || is_im2col_B) {
512
+ // Check valid corner values for TMA_LOAD_IM2COL, signed int ranging from [-corner_limit, corner_limit - 1]
513
+ constexpr int32_t corner_limit = 1 << (16 / NumSpatialDimensions - 1);
514
+ auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
515
+ for (int i = 0; i < problem_shape.RankS; ++i) {
516
+ implementable = implementable && lower_corner_whd[i] >= -corner_limit && lower_corner_whd[i] <= (corner_limit - 1);
517
+ }
518
+ auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
519
+ for (int i = 0; i < problem_shape.RankS; ++i) {
520
+ implementable = implementable && upper_corner_whd[i] >= -corner_limit && upper_corner_whd[i] <= (corner_limit - 1);
521
+ }
522
+
523
+ if (!implementable) {
524
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n");
525
+ return false;
526
+ }
527
+ }
528
+
529
+ if (is_im2col_A || is_im2col_B) {
530
+ // Check valid filter offsets for TMA_LOAD_IM2COL, unsigned int ranging from [0, offset_limit]
531
+ constexpr int32_t offset_limit = (1 << (16 / NumSpatialDimensions)) - 1;
532
+ auto flt_data = (ConvOp == conv::Operator::kWgrad) ? problem_shape.shape_C : problem_shape.shape_B;
533
+ for (int i = 0; i < problem_shape.RankS; ++i) {
534
+ // flt_data array contains [K, T, R, S, C], so pure filter [T, R, S] starts from the second position in the array
535
+ implementable = implementable && ((flt_data[i+1] - 1) * problem_shape.dilation[i] >= 0)
536
+ && ((flt_data[i+1] - 1) * problem_shape.dilation[i] <= offset_limit);
537
+ }
538
+
539
+ if (!implementable) {
540
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: tensor coordinate offset values don't meet requirements for TMA LOAD IM2COL.\n");
541
+ return false;
542
+ }
543
+ }
544
+
545
+ // Wgrad kernels don't support non-packed output strides, non-packed tensor A stride (linearized)
546
+ if constexpr (ConvOp == conv::Operator::kWgrad) {
547
+
548
+ const auto & input_shape = problem_shape.shape_A;
549
+ const auto & input_stride = problem_shape.stride_A;
550
+
551
+ implementable &= input_stride[ProblemShape::RankT - 1] == 1;
552
+ int64_t input_shape_size = 1;
553
+ for (int i = ProblemShape::RankT - 2; i >= 0; --i) {
554
+ input_shape_size *= input_shape[i + 1];
555
+ implementable &= input_stride[i] == input_shape_size;
556
+ }
557
+
558
+ const auto & output_shape = problem_shape.shape_C;
559
+ const auto & output_stride = problem_shape.stride_C;
560
+
561
+ implementable &= output_stride[ProblemShape::RankT - 1] == 1;
562
+ int64_t output_shape_size = 1;
563
+ for (int i = ProblemShape::RankT - 2; i >= 0; --i) {
564
+ output_shape_size *= output_shape[i + 1];
565
+ implementable &= output_stride[i] == output_shape_size;
566
+ }
567
+
568
+ if (!implementable) {
569
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed output strides.\n");
570
+ return false;
571
+ }
572
+ }
573
+
574
+ // Conv kernels only support cross correlation mode currently.
575
+ {
576
+ implementable &= problem_shape.mode == cutlass::conv::Mode::kCrossCorrelation;
577
+
578
+ if (!implementable) {
579
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Conv kernels only support cross correlation mode currently.\n");
580
+ return false;
581
+ }
582
+ }
583
+
584
+ // When groups > 1, it should be a Grouped Conv.
585
+ if (problem_shape.groups > 1) {
586
+ implementable &= TileShapeMNKLRank > 3;
587
+
588
+ if (!implementable) {
589
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Only Grouped Conv can support groups > 1.\n");
590
+ return false;
591
+ }
592
+ }
593
+
594
+ // Only support Grouped Wgrad currently.
595
+ if constexpr (TileShapeMNKLRank > 3) {
596
+ implementable &= ConvOp == conv::Operator::kWgrad;
597
+
598
+ if (!implementable) {
599
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Conv Only support Grouped Wgrad currently.\n");
600
+ return false;
601
+ }
602
+ }
603
+
604
+ // Grouped Wgrad channel check.
605
+ if constexpr (is_grouped_wgrad) {
606
+
607
+ int input_K = size<0>(problem_shape.get_shape_A());
608
+ int input_C = size<0>(problem_shape.get_shape_B());
609
+
610
+ implementable &= input_K == input_C;
611
+
612
+ if (!implementable) {
613
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Conv's input K and input C do not match.\n");
614
+ return false;
615
+ }
616
+
617
+ int output_K = size<0>(problem_shape.get_shape_C());
618
+ int output_C = size<1,0>(problem_shape.get_shape_C());
619
+
620
+ implementable &= input_K == output_K;
621
+ implementable &= input_C == output_C * problem_shape.groups;
622
+
623
+ if (!implementable) {
624
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Wgrad's input and output K,C and groups do not match\n");
625
+ return false;
626
+ }
627
+
628
+ constexpr int Tile_N = size<1>(TileShape{});
629
+ constexpr int GroupsPerTile = size<3>(TileShapeMNKL_{});
630
+
631
+ implementable &= Tile_N / GroupsPerTile == input_C / problem_shape.groups;
632
+
633
+ if (!implementable) {
634
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Wgrad's Tile_N, GroupsPerTile and input_C, groups do not match.\n");
635
+ return false;
636
+ }
637
+ }
638
+
639
+ // The extents of linearized problem shape should be int32_t type(maximum is 2^31-1).
640
+ if constexpr (is_im2col_A || is_im2col_B) {
641
+ auto [M, N, K, L] = cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape);
642
+ auto to_64b = [](auto S) { return transform_leaf(S, [](auto s) { return static_cast<int64_t>(s); }); };
643
+
644
+ if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) {
645
+ implementable &= (cute::product(to_64b(M)) <= cutlass::platform::numeric_limits<int32_t>::max()) &
646
+ (cute::product(to_64b(L)) <= cutlass::platform::numeric_limits<int32_t>::max());
647
+ }
648
+ else if constexpr (ConvOp == conv::Operator::kWgrad) {
649
+ implementable &= (cute::product(to_64b(K)) <= cutlass::platform::numeric_limits<int32_t>::max());
650
+ }
651
+
652
+ if (!implementable) {
653
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: the extents exceed the maximum number.\n");
654
+ return false;
655
+ }
656
+ }
657
+
658
+ return true;
659
+ }
660
+
661
+ /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
662
+ CUTLASS_DEVICE void
663
+ prefetch_tma_descriptors() {
664
+ cute::prefetch_tma_descriptor(observed_tma_load_a_->get_tma_descriptor());
665
+ cute::prefetch_tma_descriptor(observed_tma_load_b_->get_tma_descriptor());
666
+ }
667
+
668
+ /// Construct A Single Stage's Accumulator Shape
669
+ CUTLASS_DEVICE static auto
670
+ partition_accumulator_shape() {
671
+ auto acc_shape = partition_shape_C(TiledMma{}, take<0,2>(TileShape{})); // ((MMA_TILE_M,MMA_TILE_N),MMA_M,MMA_N)
672
+
673
+ return acc_shape;
674
+ }
675
+
676
+ /// Perform a collective-scoped matrix multiply-accumulate
677
+ /// Producer Perspective
678
+ template <
679
+ class GTensorA, class GTensorB,
680
+ class GTensorPartitionedA, class GTensorPartitionedB,
681
+ class STensorA, class STensorB,
682
+ class TileCoordMNKL,
683
+ class KTileIterator
684
+ >
685
+ CUTLASS_DEVICE auto
686
+ load(
687
+ Params const& params,
688
+ MainloopPipeline pipeline,
689
+ MainloopPipelineState mainloop_pipe_producer_state,
690
+ cute::tuple<GTensorA, GTensorB,
691
+ GTensorPartitionedA, GTensorPartitionedB,
692
+ STensorA, STensorB,
693
+ uint16_t, uint16_t> const& load_inputs,
694
+ TileCoordMNKL const& cta_coord_mnkl,
695
+ KTileIterator k_tile_iter, int k_tile_count) {
696
+
697
+ auto [unused_gA, unused_gB,
698
+ tAgA_mk, tBgB_nk, tAsA, tBsB,
699
+ mcast_mask_a, mcast_mask_b] = load_inputs;
700
+
701
+ // slice out the work coord from partitioned tensors
702
+ Tensor tAgA = tAgA_mk(_, get<0>(cta_coord_mnkl) / size(typename TiledMma::AtomThrID{}), _);
703
+ auto tensor_b_coord = get<1>(cta_coord_mnkl);
704
+ if constexpr (is_grouped_wgrad) {
705
+ // in grouped wgrad, tensor A = NZPQK, tensor B = NDHWC, tensor C = KTRSc, where C = G*c, c = channel_per_group = 8,16,32.
706
+ // CTA Tiling follows output tensor KTRSc. So cta_size_m = K/CTA_TILE_M. cta_size_n = T*R*S*ceil(c/CTA_TILE_N) = T*R*S*1 = T*R*S.
707
+ // tensor_a_coord = K_idx = cta_coord_m.
708
+ // tensor_b_coord = TRS_idx * C/CTA_TILE_N + C_idx = cta_coord_n * get<1,0>(shape(tBgB_nk) + cta_coord_m,
709
+ // because K == C and CTA_TILE_M == CTA_TILE_N => C_idx = K_idx = cta_coord_m.
710
+ tensor_b_coord = get<0>(cta_coord_mnkl) + get<1>(cta_coord_mnkl) * get<1,0>(shape(tBgB_nk));
711
+ }
712
+ Tensor tBgB = tBgB_nk(_, tensor_b_coord, _);
713
+
714
+ auto barrier_token = pipeline.producer_try_acquire(mainloop_pipe_producer_state);
715
+
716
+ // Issue the Mainloop loads
717
+ CUTLASS_PRAGMA_NO_UNROLL
718
+ while (k_tile_count > 0) {
719
+ // LOCK mainloop_pipe_producer_state for _writing_
720
+ pipeline.producer_acquire(mainloop_pipe_producer_state, barrier_token);
721
+
722
+ using BarrierType = typename MainloopPipeline::ProducerBarrierType;
723
+ BarrierType* tma_barrier = pipeline.producer_get_barrier(mainloop_pipe_producer_state);
724
+
725
+ int write_stage = mainloop_pipe_producer_state.index();
726
+ ++mainloop_pipe_producer_state;
727
+ barrier_token = pipeline.producer_try_acquire(mainloop_pipe_producer_state);
728
+
729
+ if constexpr (is_strided_dgrad) {
730
+ // construct gemm-k tile coord for gB
731
+ auto [conv_k, flt_coord, out_coord] = *k_tile_iter;
732
+ auto gemm_k_tile = prepend(flt_coord, conv_k); // (k,s,r,t)
733
+
734
+ // gA doesn't have a gemm-k (k,s,r,t) iterator mode because it's not an im2col tensor
735
+ auto offset_kqpzn = append(prepend(out_coord, _0{}),_0{}); // (k,q,p,z,n)
736
+ auto tAgA_offset = make_tensor(tAgA.data() + offset_kqpzn, tAgA.layout()); // (TMA, k)
737
+
738
+ if (cute::elect_one_sync()) {
739
+ copy(observed_tma_load_a_->with(*tma_barrier, mcast_mask_a), tAgA_offset(_,conv_k), tAsA(_,write_stage));
740
+ copy(observed_tma_load_b_->with(*tma_barrier, mcast_mask_b), tBgB(_,gemm_k_tile) , tBsB(_,write_stage));
741
+ }
742
+ }
743
+ else {
744
+ if (cute::elect_one_sync()) {
745
+ copy(observed_tma_load_a_->with(*tma_barrier, mcast_mask_a), tAgA(_,*k_tile_iter), tAsA(_,write_stage));
746
+ copy(observed_tma_load_b_->with(*tma_barrier, mcast_mask_b), tBgB(_,*k_tile_iter), tBsB(_,write_stage));
747
+ }
748
+ }
749
+
750
+ --k_tile_count;
751
+ ++k_tile_iter;
752
+ }
753
+
754
+ return cute::make_tuple(mainloop_pipe_producer_state, k_tile_iter);
755
+ }
756
+
757
+ /// Set up the data needed by this collective for load.
758
+ /// Return tuple element contain
759
+ /// gA_mk - The tiled tma tensor for input A
760
+ /// gB_nk - The tiled tma tensor for input B
761
+ /// tAsA - partitioned smem tensor for A
762
+ /// tBsB - partitioned smem tensor for B
763
+ /// mcast_mask_a - tma multicast mask for A
764
+ /// mcast_mask_b - tma multicast mask for B
765
+ template <class ProblemShape_MNKL>
766
+ CUTLASS_DEVICE auto
767
+ load_init(
768
+ ProblemShape_MNKL const& problem_shape_MNKL,
769
+ Params const& params,
770
+ TensorStorage& shared_tensors) const {
771
+ using X = Underscore;
772
+
773
+ // Separate out problem shape for convenience
774
+ auto [M,N,K,L] = problem_shape_MNKL;
775
+
776
+ // Represent the full tensors -- get these from TMA
777
+ auto K_A = conditional_return<is_strided_dgrad>(get<0>(K), K);
778
+ Tensor mA_mk = observed_tma_load_a_->get_tma_tensor(make_shape(M, K_A));
779
+ Tensor mB_nk = observed_tma_load_b_->get_tma_tensor(make_shape(N, K));
780
+
781
+ // Tile the tensors and defer the slice
782
+ Tensor gA_mk = local_tile(mA_mk, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M, BLK_K, m, k)
783
+ Tensor gB_nk = local_tile(mB_nk, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N, BLK_K, n, k)
784
+
785
+ // Partition for this CTA
786
+ ThrMMA cta_mma = TiledMma{}.get_slice(blockIdx.x % size(typename TiledMma::AtomThrID{}));
787
+
788
+ Tensor tCgA_mk = cta_mma.partition_A(gA_mk); // (MMA, MMA_M, MMA_K, m, k)
789
+ Tensor tCgB_nk = cta_mma.partition_B(gB_nk); // (MMA, MMA_N, MMA_K, n, k)
790
+
791
+ Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{}); // (MMA,MMA_M,MMA_K,PIPE)
792
+ Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{}); // (MMA,MMA_N,MMA_K,PIPE)
793
+
794
+ // Define the CTA-in-cluster Layout and Coord
795
+ Layout cta_layout_mnk = make_layout(cluster_shape_);
796
+ Layout cta_layout_vmnk = tiled_divide(cta_layout_mnk, make_tile(typename TiledMma::AtomThrID{}));
797
+ auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster_);
798
+
799
+ // Project the cta_layout for tma_a along the n-modes
800
+ auto [tAgA_mk, tAsA] = tma_partition(*observed_tma_load_a_,
801
+ get<2>(cta_coord_vmnk), make_layout(size<2>(cta_layout_vmnk)),
802
+ group_modes<0,3>(sA), group_modes<0,3>(tCgA_mk));
803
+
804
+ // Project the cta_layout for tma_b along the m-modes
805
+ auto [tBgB_nk, tBsB] = tma_partition(*observed_tma_load_b_,
806
+ get<1>(cta_coord_vmnk), make_layout(size<1>(cta_layout_vmnk)),
807
+ group_modes<0,3>(sB), group_modes<0,3>(tCgB_nk));
808
+
809
+ // TMA Multicast Masks
810
+ uint16_t mcast_mask_a = create_tma_multicast_mask<2>(cta_layout_vmnk, cta_coord_vmnk);
811
+ uint16_t mcast_mask_b = create_tma_multicast_mask<1>(cta_layout_vmnk, cta_coord_vmnk);
812
+
813
+ return cute::make_tuple(
814
+ gA_mk, gB_nk, // for scheduler
815
+ tAgA_mk, tBgB_nk, tAsA, tBsB, // for input tensor values
816
+ mcast_mask_a, mcast_mask_b); // multicast masks
817
+ }
818
+
819
+ /// Perform a Producer Epilogue to prevent early exit of ctas in a Cluster
820
+ CUTLASS_DEVICE void
821
+ load_tail(MainloopPipeline pipeline, MainloopPipelineState mainloop_pipe_producer_state) {
822
+ // Issue the epilogue waits
823
+ /* This helps avoid early exit of ctas in Cluster
824
+ * Waits for all stages to either be released (all
825
+ * Consumer UNLOCKs), or if the stage was never used
826
+ * then would just be acquired since the phase was
827
+ * still inverted from make_producer_start_state
828
+ */
829
+ pipeline.producer_tail(mainloop_pipe_producer_state);
830
+ }
831
+
832
+ /// Perform a collective-scoped matrix multiply-accumulate
833
+ /// Consumer Perspective
834
+ template <
835
+ class FrgEngine, class FrgLayout,
836
+ class FragmentA, class FragmentB
837
+ >
838
+ CUTLASS_DEVICE auto
839
+ mma(MainloopPipeline pipeline,
840
+ MainloopPipelineState mainloop_pipe_consumer_state,
841
+ cute::Tensor<FrgEngine, FrgLayout>& accumulators,
842
+ cute::tuple<TiledMma, FragmentA, FragmentB> const& mma_inputs,
843
+ int k_tile_count)
844
+ {
845
+ static_assert(is_tmem<FrgEngine>::value, "Accumulator must be tmem resident.");
846
+ static_assert(rank(FrgLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA, MMA_M, MMA_N)");
847
+
848
+ auto [tiled_mma, tCrA, tCrB] = mma_inputs;
849
+
850
+ uint32_t skip_wait = k_tile_count <= 0;
851
+ auto barrier_token = pipeline.consumer_try_wait(mainloop_pipe_consumer_state, skip_wait);
852
+
853
+ //
854
+ // PIPELINED MAIN LOOP
855
+ //
856
+ tiled_mma.accumulate_ = UMMA::ScaleOut::Zero;
857
+
858
+ CUTLASS_PRAGMA_NO_UNROLL
859
+ while (k_tile_count > 0) {
860
+ // WAIT on mainloop_pipe_consumer_state until its data are available (phase bit flips from mainloop_pipe_consumer_state.phase() value)
861
+ pipeline.consumer_wait(mainloop_pipe_consumer_state, barrier_token);
862
+
863
+ // Compute on k_tile
864
+ int read_stage = mainloop_pipe_consumer_state.index();
865
+ // Save current mainlop pipeline read state
866
+ auto curr_mainloop_pipe_consumer_state = mainloop_pipe_consumer_state;
867
+
868
+ // Advance mainloop_pipe
869
+ ++mainloop_pipe_consumer_state;
870
+ --k_tile_count;
871
+ skip_wait = k_tile_count <= 0;
872
+ // Peek at next iteration
873
+ barrier_token = pipeline.consumer_try_wait(mainloop_pipe_consumer_state, skip_wait);
874
+
875
+ // Unroll the K mode manually so we can set scale C to 1
876
+ CUTLASS_PRAGMA_UNROLL
877
+ for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
878
+ // (V,M,K) x (V,N,K) => (V,M,N)
879
+ cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulators);
880
+ tiled_mma.accumulate_ = UMMA::ScaleOut::One;
881
+ }
882
+ pipeline.consumer_release(curr_mainloop_pipe_consumer_state);
883
+ }
884
+
885
+ return mainloop_pipe_consumer_state;
886
+ }
887
+
888
+ CUTLASS_DEVICE auto
889
+ mma_init(TensorStorage& shared_tensors) const {
890
+ Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
891
+ Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
892
+
893
+ TiledMma tiled_mma;
894
+
895
+ // Allocate "fragments/descriptors" for A and B matrices
896
+ Tensor tCrA = tiled_mma.make_fragment_A(sA); // (MMA,MMA_M,MMA_K,PIPE)
897
+ Tensor tCrB = tiled_mma.make_fragment_B(sB); // (MMA,MMA_N,MMA_K,PIPE)
898
+
899
+ CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<3>(sA)); // PIPE
900
+ CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<3>(sB)); // PIPE
901
+ return cute::make_tuple(tiled_mma, tCrA, tCrB);
902
+ }
903
+
904
+ private:
905
+
906
+ typename Params::TMA_A const* observed_tma_load_a_ = nullptr;
907
+ typename Params::TMA_B const* observed_tma_load_b_ = nullptr;
908
+
909
+ ClusterShape cluster_shape_;
910
+ uint32_t block_rank_in_cluster_;
911
+ };
912
+
913
+ /////////////////////////////////////////////////////////////////////////////////////////////////
914
+
915
+ } // namespace cutlass::conv::collective
916
+
917
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp ADDED
@@ -0,0 +1,785 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include "cutlass/cutlass.h"
34
+
35
+ #include "cute/arch/cluster_sm90.hpp"
36
+ #include "cute/arch/copy_sm90.hpp"
37
+ #include "cute/atom/mma_atom.hpp"
38
+ #include "cute/atom/copy_traits_sm90_im2col.hpp"
39
+ #include "cute/numeric/arithmetic_tuple.hpp"
40
+ #include "cute/algorithm/functional.hpp"
41
+ #include "cute/algorithm/gemm.hpp"
42
+
43
+ #include "cutlass/conv/detail.hpp"
44
+ #include "cutlass/conv/convolution.h"
45
+ #include "cutlass/conv/dispatch_policy.hpp"
46
+ #include "cutlass/pipeline/pipeline.hpp"
47
+ #include "cutlass/util/packed_stride.hpp"
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
+ namespace cutlass::conv::collective {
52
+ using namespace cute;
53
+
54
+ /////////////////////////////////////////////////////////////////////////////////////////////////
55
+
56
+ template <
57
+ conv::Operator ConvOp,
58
+ int Stages,
59
+ int NumSpatialDims,
60
+ class ClusterShape,
61
+ class KernelSchedule,
62
+ int PipelineAsyncMmaStages,
63
+ class TileShape_,
64
+ class ElementA_,
65
+ class ElementB_,
66
+ class TiledMma_,
67
+ class TileTraitsA_,
68
+ class TileTraitsB_>
69
+ struct CollectiveConv<
70
+ MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
71
+ ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>,
72
+ TileShape_,
73
+ ElementA_,
74
+ ElementB_,
75
+ TiledMma_,
76
+ TileTraitsA_,
77
+ TileTraitsB_>
78
+ {
79
+ //
80
+ // Type Aliases
81
+ //
82
+ using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm<
83
+ ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>;
84
+ using TileShape = TileShape_;
85
+ using ElementA = ElementA_;
86
+ using ElementB = ElementB_;
87
+ using TiledMma = TiledMma_;
88
+ using ElementAccumulator = typename TiledMma::ValTypeC;
89
+ using GmemTiledCopyA = typename TileTraitsA_::GmemTiledCopy;
90
+ using GmemTiledCopyB = typename TileTraitsB_::GmemTiledCopy;
91
+ using SmemLayoutA = typename TileTraitsA_::SmemLayout;
92
+ using SmemLayoutB = typename TileTraitsB_::SmemLayout;
93
+ using ArchTag = typename DispatchPolicy::ArchTag;
94
+ static constexpr int NumSpatialDimensions = DispatchPolicy::NumSpatialDimensions;
95
+ static constexpr int NumTensorDimensions = NumSpatialDimensions + 2;
96
+ // Deduce the kernel-facing stride tuple types based on the dispatch policy
97
+ // (which is a function of the number of spatial dimensions, the algorithm, etc.)
98
+ using StrideA = decltype(detail::sm90_dispatch_policy_to_stride_A<DispatchPolicy>());
99
+ using StrideB = decltype(detail::sm90_dispatch_policy_to_stride_B<DispatchPolicy>());
100
+
101
+ using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
102
+
103
+ using PipelineParams = typename MainloopPipeline::Params;
104
+ using PipelineState = typename cutlass::PipelineState<DispatchPolicy::Stages>;
105
+
106
+ using ProblemShape = ConvProblemShape<ConvOp, NumSpatialDimensions>;
107
+
108
+ static_assert(rank(SmemLayoutA{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)");
109
+ static_assert((size<0>(TileShape{}) == size<0>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape.");
110
+ static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape.");
111
+
112
+ static_assert(rank(SmemLayoutB{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)");
113
+ static_assert((size<1>(TileShape{}) == size<0>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape.");
114
+ static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape.");
115
+
116
+ static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
117
+ static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
118
+ cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
119
+ "MMA atom must source both A and B operand from smem_desc for this mainloop.");
120
+
121
+ // The tma load mode of wgrad is tiled for tensor A and im2col for tensor B while the tma load mode of fprop and dgrad
122
+ // kernel is im2col for tensor A and tiled for tensor B.
123
+ static_assert((ConvOp == conv::Operator::kWgrad
124
+ && (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>))
125
+ || (ConvOp != conv::Operator::kWgrad
126
+ && (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL_MULTICAST>)),
127
+ "GmemTiledCopyA - invalid SM90 TMA copy atom specified.");
128
+ static_assert((ConvOp == conv::Operator::kWgrad
129
+ && (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL_MULTICAST>))
130
+ || (ConvOp != conv::Operator::kWgrad
131
+ && (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>)),
132
+ "GmemTiledCopyB - invalid SM90 TMA copy atom specified.");
133
+
134
+ static constexpr bool is_im2col_A = detail::is_im2col_load<GmemTiledCopyA>::value;
135
+ static constexpr bool is_im2col_B = detail::is_im2col_load<GmemTiledCopyB>::value;
136
+
137
+ // TMA converts f32 input to tf32 when copying from GMEM to SMEM
138
+ // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
139
+ static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
140
+ static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
141
+ using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
142
+ using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
143
+
144
+ struct SharedStorage
145
+ {
146
+ struct TensorStorage : cute::aligned_struct<128, _0> {
147
+ cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
148
+ cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
149
+ } tensors;
150
+
151
+ using PipelineStorage = typename MainloopPipeline::SharedStorage;
152
+ PipelineStorage pipeline;
153
+ };
154
+ using TensorStorage = typename SharedStorage::TensorStorage;
155
+ using PipelineStorage = typename SharedStorage::PipelineStorage;
156
+
157
+ static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
158
+ static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
159
+ static constexpr uint32_t TmaTransactionBytes =
160
+ (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof(InternalElementA)))+
161
+ (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof(InternalElementB)));
162
+
163
+ // Host side kernel arguments
164
+ struct Arguments {
165
+ ElementA const* ptr_A{nullptr};
166
+ ElementB const* ptr_B{nullptr};
167
+ };
168
+
169
+ private:
170
+ // Note that for fprop and dgrad kernel, the tma load mode is im2col for tensor A and tiled for
171
+ // tensor B while for wgrad kernel, the tma load mode is tiled for tensor A and im2col for tensor
172
+ // B since operand A, B is swapped.
173
+ // Get tma_load_a instantce.
174
+ template <class TensorA>
175
+ static constexpr auto
176
+ get_tma_load_a_instance(TensorA const& tensor_a, ProblemShape const& problem_shape) {
177
+ if constexpr (is_im2col_A) {
178
+ // compute the upper and lower corners based on the conv padding
179
+ auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
180
+ auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
181
+ auto lower_srt = detail::compute_lower_srt(problem_shape);
182
+
183
+ // The calculation of gbasis strides for dgrad kernel needs perform negate for dilation values.
184
+ cute::array<int32_t, NumSpatialDimensions> stride_srt{};
185
+ for (int i = 0; i < NumSpatialDimensions; ++i) {
186
+ stride_srt[i] = ConvOp == conv::Operator::kDgrad ?
187
+ -problem_shape.dilation[NumSpatialDimensions-1-i] :
188
+ problem_shape.dilation[NumSpatialDimensions-1-i];
189
+ }
190
+
191
+ return make_im2col_tma_copy(
192
+ GmemTiledCopyA{},
193
+ tensor_a,
194
+ SmemLayoutA{}(_,_,_0{}),
195
+ product_each(shape(SmemLayoutA{}(_,_,_0{}))),
196
+ size<1>(ClusterShape{}),
197
+ shape(lower_corner_whd),
198
+ shape(upper_corner_whd),
199
+ cute::reverse(shape(problem_shape.lower_padding)),
200
+ cute::reverse(shape(problem_shape.upper_padding)),
201
+ cute::reverse(shape(problem_shape.traversal_stride)),
202
+ shape(lower_srt),
203
+ shape(stride_srt));
204
+ }
205
+ // TMA tiled mode for tensor A in wgrad kernel.
206
+ else {
207
+ return make_tma_copy(
208
+ GmemTiledCopyA{},
209
+ tensor_a,
210
+ SmemLayoutA{}(_,_,_0{}),
211
+ make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
212
+ size<1>(ClusterShape{}));
213
+ }
214
+ }
215
+
216
+ // Get tma_load_b instantce.
217
+ template <class TensorB>
218
+ static constexpr auto
219
+ get_tma_load_b_instance(TensorB const& tensor_b, ProblemShape const& problem_shape) {
220
+ // TMA im2col mode for tensor B in wgrad kernel.
221
+ if constexpr (is_im2col_B) {
222
+ // compute the upper and lower corners based on the conv padding
223
+ auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
224
+ auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
225
+ auto lower_srt = detail::compute_lower_srt(problem_shape);
226
+
227
+ return make_im2col_tma_copy(
228
+ GmemTiledCopyB{},
229
+ tensor_b,
230
+ SmemLayoutB{}(_,_,_0{}),
231
+ product_each(shape(SmemLayoutB{}(_,_,_0{}))),
232
+ size<0>(ClusterShape{}),
233
+ shape(lower_corner_whd),
234
+ shape(upper_corner_whd),
235
+ cute::reverse(shape(problem_shape.lower_padding)),
236
+ cute::reverse(shape(problem_shape.upper_padding)),
237
+ cute::reverse(shape(problem_shape.traversal_stride)),
238
+ shape(lower_srt),
239
+ cute::reverse(shape(problem_shape.dilation)));
240
+ }
241
+ else {
242
+ return make_tma_copy(
243
+ GmemTiledCopyB{},
244
+ tensor_b,
245
+ SmemLayoutB{}(_,_,_0{}),
246
+ make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
247
+ size<0>(ClusterShape{}));
248
+ }
249
+ }
250
+
251
+ public:
252
+
253
+ // Performs im2col transformations on the input of type ConvProblemShape
254
+ static constexpr auto
255
+ get_problem_shape_MNKL(ProblemShape const& problem_shape) {
256
+
257
+ if constexpr (is_im2col_A || is_im2col_B) {
258
+ // transformation + im2col linearization
259
+ return cutlass::conv::detail::get_linearized_problem_shape_MNKL(problem_shape);
260
+ }
261
+ else {
262
+ // transformation
263
+ return cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape);
264
+ }
265
+ }
266
+
267
  // Device side kernel params
  struct Params {
    // Tensor extent with the last (channel) mode dropped; used below to build
    // the GEMM-facing tensor shapes.
    using _Submode = decltype(take<0,NumTensorDimensions-1>(typename ProblemShape::TensorExtent{}));

    // Assumption: StrideA is congruent with Problem_MK
    // Select TMA load type according to convolution operator.
    // For Wgrad, A keeps one extent per mode; otherwise the non-channel modes
    // are collapsed into (submodes..., int) for the im2col-style load.
    using TensorShapeA = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
        decltype(repeat_like(StrideA{}, int32_t(0))),
        decltype(make_shape(_Submode{}, int(0)))>;

    // Mirror of TensorShapeA with the roles of the two operands swapped.
    using TensorShapeB = cute::conditional_t<ConvOp == conv::Operator::kWgrad,
        decltype(make_shape(int(0), _Submode{})),
        decltype(repeat_like(StrideB{}, int32_t(0)))>;

    // TMA copy descriptor types, deduced by invoking the factory functions on
    // null-pointer tensors of the shapes declared above.
    using TMA_A = decltype(get_tma_load_a_instance(
        make_tensor(
            make_gmem_ptr(static_cast<InternalElementA const*>(nullptr)),
            make_layout(TensorShapeA{}, StrideA{})),
        ConvProblemShape<ConvOp, NumSpatialDimensions>{}));

    using TMA_B = decltype(get_tma_load_b_instance(
        make_tensor(
            make_gmem_ptr(static_cast<InternalElementB const*>(nullptr)),
            make_layout(TensorShapeB{}, StrideB{})),
        ConvProblemShape<ConvOp, NumSpatialDimensions>{}));

    // Members
    TMA_A tma_load_a;
    TMA_B tma_load_b;
    // Bytes each TMA transaction delivers; the mainloop barrier expects this count.
    uint32_t tma_transaction_bytes = TmaTransactionBytes;
  };
298
+
299
+ //
300
+ // Methods
301
+ //
302
+
303
+ // Lowers the host side user facing arguments to the kernel facing lauch params
304
+ static constexpr Params
305
+ to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
306
+ (void) workspace;
307
+ // from the flat problem shape arrays of ConvProblemShape<ConvOp, N>, create a rank-3 MNK problem shape tuple
308
+ // tma desc creation depends on the original untransformed domain.
309
+
310
+ // A extents.
311
+ auto shape_A_orig = problem_shape.get_shape_A();
312
+ // B extents.
313
+ auto shape_B_orig = problem_shape.get_shape_B();
314
+
315
+ // Fill inferred cute strides from flat stride arrays
316
+ auto dA = make_cute_packed_stride(StrideA{}, problem_shape.stride_A, ConvOp);
317
+ auto dB = make_cute_packed_stride(StrideB{}, problem_shape.stride_B, ConvOp);
318
+
319
+ auto ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
320
+ auto ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
321
+
322
+ Tensor tensor_a = make_tensor(make_gmem_ptr(ptr_A), make_layout(shape_A_orig, dA));
323
+ Tensor tensor_b = make_tensor(make_gmem_ptr(ptr_B), make_layout(shape_B_orig, dB));
324
+
325
+ auto tma_load_a = get_tma_load_a_instance(tensor_a, problem_shape);
326
+ auto tma_load_b = get_tma_load_b_instance(tensor_b, problem_shape);
327
+
328
+ return {
329
+ tma_load_a,
330
+ tma_load_b,
331
+ TmaTransactionBytes
332
+ };
333
+ }
334
+
335
+ template <class ProblemShape>
336
+ static bool
337
+ can_implement(
338
+ ProblemShape const& problem_shape,
339
+ Arguments const& args) {
340
+ // Activation and Filter channel mode extents much match
341
+ bool implementable = true;
342
+ // channel mode is major
343
+ implementable &= problem_shape.stride_A[NumTensorDimensions-1] == 1;
344
+ implementable &= problem_shape.stride_B[NumTensorDimensions-1] == 1;
345
+
346
+ constexpr int tma_alignment_bits = 128;
347
+ // A extents.
348
+ auto shape_A_orig = problem_shape.get_shape_A();
349
+ // B extents.
350
+ auto shape_B_orig = problem_shape.get_shape_B();
351
+ constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
352
+ implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(shape_A_orig, StrideA{});
353
+ constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
354
+ implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(shape_B_orig, StrideB{});
355
+
356
+ if (!implementable) {
357
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
358
+ return false;
359
+ }
360
+
361
+ // Check valid padding values for TMA_LOAD_IM2COL
362
+ constexpr int padding_limit = (ProblemShape::RankS == 1) ? 65536 : (ProblemShape::RankS == 2 ? 256 : 16);
363
+ for (int i = 0; i < problem_shape.RankS; ++i) {
364
+ implementable = implementable && problem_shape.lower_padding[i] <= padding_limit && problem_shape.lower_padding[i] >= 0;
365
+ implementable = implementable && problem_shape.upper_padding[i] <= padding_limit && problem_shape.upper_padding[i] >= 0;
366
+ }
367
+
368
+ if (!implementable) {
369
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n");
370
+ return false;
371
+ }
372
+
373
+ if (is_im2col_A || is_im2col_B) {
374
+ // Check valid corner values for TMA_LOAD_IM2COL, signed int ranging from [-corner_limit, corner_limit - 1]
375
+ constexpr int32_t corner_limit = 1 << (16 / NumSpatialDimensions - 1);
376
+ auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape);
377
+ for (int i = 0; i < problem_shape.RankS; ++i) {
378
+ implementable = implementable && lower_corner_whd[i] >= -corner_limit && lower_corner_whd[i] <= (corner_limit - 1);
379
+ }
380
+ auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape);
381
+ for (int i = 0; i < problem_shape.RankS; ++i) {
382
+ implementable = implementable && upper_corner_whd[i] >= -corner_limit && upper_corner_whd[i] <= (corner_limit - 1);
383
+ }
384
+
385
+ if (!implementable) {
386
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n");
387
+ return false;
388
+ }
389
+ }
390
+
391
+ if (is_im2col_A || is_im2col_B) {
392
+ // Check valid filter offsets for TMA_LOAD_IM2COL, unsigned int ranging from [0, offset_limit - 1]
393
+ constexpr int32_t offset_limit = (1 << (16 / NumSpatialDimensions)) - 1;
394
+ auto flt_data = (ConvOp == conv::Operator::kWgrad) ? problem_shape.shape_C : problem_shape.shape_B;
395
+ for (int i = 0; i < problem_shape.RankS; ++i) {
396
+ // flt_data array contains [K, T, R, S, C], so pure filter [T, R, S] starts from the second position in the array
397
+ implementable = implementable && ((flt_data[i+1] - 1) * problem_shape.dilation[i] >= 0)
398
+ && ((flt_data[i+1] - 1) * problem_shape.dilation[i] < offset_limit);
399
+ }
400
+
401
+ if (!implementable) {
402
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: tensor coordinate offset values don't meet requirements for TMA LOAD IM2COL.\n");
403
+ return false;
404
+ }
405
+ }
406
+
407
+ // Wgrad kernels don't support non-packed output strides, non-packed tensor A stride (linearized)
408
+ if constexpr (ConvOp == conv::Operator::kWgrad) {
409
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
410
+ std::ostringstream os;
411
+ #endif
412
+ const auto & input_shape = problem_shape.shape_A;
413
+ const auto & input_stride = problem_shape.stride_A;
414
+
415
+ implementable &= input_stride[ProblemShape::RankT - 1] == 1;
416
+ int64_t input_shape_size = 1;
417
+ for (int i = ProblemShape::RankT - 2; i >= 0; --i) {
418
+ input_shape_size *= input_shape[i + 1];
419
+ implementable &= input_stride[i] == input_shape_size;
420
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
421
+ if (input_stride[i] != input_shape_size) {
422
+ os << "\n *** input_stride[" << i << "] = " << input_stride[i] << " != input_shape_size = " << input_shape_size << " ***";
423
+ }
424
+ #endif
425
+ }
426
+
427
+ if (!implementable) {
428
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
429
+ os << "\n input_shape_size: " << input_shape_size
430
+ << "\n input_shape: " << input_shape
431
+ << "\n input_stride: " << input_stride
432
+ << "\n";
433
+ #endif
434
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed input strides.\n");
435
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
436
+ CUTLASS_TRACE_HOST(os.str());
437
+ #endif
438
+ return false;
439
+ }
440
+
441
+ const auto & output_shape = problem_shape.shape_C;
442
+ const auto & output_stride = problem_shape.stride_C;
443
+
444
+ implementable &= output_stride[ProblemShape::RankT - 1] == 1;
445
+ int64_t output_shape_size = 1;
446
+ for (int i = ProblemShape::RankT - 2; i >= 0; --i) {
447
+ output_shape_size *= output_shape[i + 1];
448
+ implementable &= output_stride[i] == output_shape_size;
449
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
450
+ if (output_stride[i] != output_shape_size) {
451
+ os << "\n *** output_stride[" << i << "] = " << output_stride[i] << " != output_shape_size = " << output_shape_size << " ***";
452
+ }
453
+ #endif
454
+ }
455
+
456
+ if (!implementable) {
457
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
458
+ os << "\n output_shape_size: " << input_shape_size
459
+ << "\n output_shape: " << input_shape
460
+ << "\n output_stride: " << input_stride
461
+ << "\n";
462
+ #endif
463
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed output strides.\n");
464
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
465
+ CUTLASS_TRACE_HOST(os.str());
466
+ #endif
467
+ return false;
468
+ }
469
+ }
470
+
471
+ // Conv kernels only support cross correlation mode currently.
472
+ implementable &= problem_shape.mode == cutlass::conv::Mode::kCrossCorrelation;
473
+
474
+ if (!implementable) {
475
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Conv kernels only support cross correlation mode currently.\n");
476
+ return false;
477
+ }
478
+
479
+ if (problem_shape.groups > 1) {
480
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: This kernel does not support conv groups > 1.\n");
481
+ return false;
482
+ }
483
+
484
+ if constexpr (is_im2col_A || is_im2col_B) {
485
+ auto [M, N, K, L] = cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape);
486
+ auto to_64b = [](auto S) { return transform_leaf(S, [](auto s) { return static_cast<int64_t>(s); }); };
487
+
488
+ if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) {
489
+ implementable &= (cute::product(to_64b(M)) <= cutlass::platform::numeric_limits<int32_t>::max()) &
490
+ (cute::product(to_64b(L)) <= cutlass::platform::numeric_limits<int32_t>::max());
491
+ }
492
+ else if constexpr (ConvOp == conv::Operator::kWgrad) {
493
+ implementable &= (cute::product(to_64b(K)) <= cutlass::platform::numeric_limits<int32_t>::max());
494
+ }
495
+
496
+ if (!implementable) {
497
+ CUTLASS_TRACE_HOST(" CAN IMPLEMENT: the extents exceed the maximum number.\n");
498
+ return false;
499
+ }
500
+ }
501
+
502
+ return true;
503
+ }
504
+
505
+ /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
506
+ CUTLASS_DEVICE
507
+ static void prefetch_tma_descriptors(Params const& mainloop_params) {
508
+ cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
509
+ cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
510
+ }
511
+
512
+ /// Set up the data needed by this collective for load and mma.
513
+ /// Returns a tuple of tensors. The collective and the kernel layer have the contract
514
+ /// Returned tuple must contain at least two elements, with the first two elements being:
515
+ /// gA_mk - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k)
516
+ /// gB_nk - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k)
517
+ /// The rest of the tensors can be specified as needed by this collective.
518
+ /// The dimensions of gA_mk and gA_nk do not contain L to maintain consistency with
519
+ /// StrideA and StrideB set up for TMA
520
+ template <class ProblemShapeMNKL>
521
+ CUTLASS_DEVICE auto
522
+ load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params){
523
+ //load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
524
+ using X = Underscore;
525
+ // Separate out problem shape for convenience
526
+ auto [M, N, K, L] = problem_shape_MNKL;
527
+
528
+ // TMA requires special handling of strides to deal with coord codomain mapping
529
+ // Represent the full tensors -- get these from TMA
530
+ Tensor mA_mk = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K)); // (m,k)
531
+ Tensor mB_nk = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K)); // (n,k)
532
+
533
+ // Make tiled views, defer the slice
534
+ Tensor gA_mk = local_tile(mA_mk, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k)
535
+ Tensor gB_nk = local_tile(mB_nk, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k)
536
+
537
+ return cute::make_tuple(gA_mk, gB_nk);
538
+ }
539
+
540
  /// Perform a collective-scoped matrix multiply-accumulate
  /// Producer Perspective
  template <
    class TensorA, class TensorB,
    class KTileIterator, class BlockCoord
  >
  CUTLASS_DEVICE void
  load(
      Params const& mainloop_params,
      MainloopPipeline pipeline,
      PipelineState smem_pipe_producer_state,
      cute::tuple<TensorA, TensorB> const& load_inputs,
      BlockCoord const& blk_coord,
      KTileIterator k_tile_iter, int k_tile_count,
      int thread_idx,
      uint32_t block_rank_in_cluster,
      TensorStorage& shared_tensors) {

    // Only the single elected lane issues TMA loads.
    int lane_predicate = cute::elect_one_sync();
    if (lane_predicate) {
      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)

      //
      // Prepare the TMA loads for A and B
      //
      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());

      // Decompose the linear cluster rank into an (x,y) pair used to slice the copies.
      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);

      auto [gA_mk, gB_nk] = load_inputs;

      // Partition the inputs based on the current block coordinates.
      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;

      Tensor gA = gA_mk(_,_,m_coord,_);  // (BLK_M,BLK_K,k)
      Tensor gB = gB_nk(_,_,n_coord,_);  // (BLK_N,BLK_K,k)

      // Applies the mapping from block_tma_a
      Tensor tAgA = block_tma_a.partition_S(gA);  // (TMA,TMA_M,TMA_K,k)
      Tensor tAsA = block_tma_a.partition_D(sA);  // (TMA,TMA_M,TMA_K,PIPE)

      Tensor tBgB = block_tma_b.partition_S(gB);  // (TMA,TMA_N,TMA_K,k)
      Tensor tBsB = block_tma_b.partition_D(sB);  // (TMA,TMA_N,TMA_K,PIPE)

      uint16_t mcast_mask_a = 0;
      uint16_t mcast_mask_b = 0;

      // Issue TmaLoads
      // Maps the tile -> block, value
      // For a multicast A load, set one mask bit per block along the cluster's
      // second mode (fixed x, varying n).
      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_IM2COL_MULTICAST> ||
                    cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
        for (int n = 0; n < size<1>(block_layout); ++n) {
          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
        }
      }

      // Likewise for a multicast B load, along the cluster's first mode
      // (varying m, fixed y).
      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_IM2COL_MULTICAST> ||
                    cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
        for (int m = 0; m < size<0>(block_layout); ++m) {
          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
        }
      }

      // Mainloop
      CUTLASS_PRAGMA_NO_UNROLL
      for ( ; k_tile_count > 0; --k_tile_count) {
        // LOCK smem_pipe_producer_state for _writing_
        pipeline.producer_acquire(smem_pipe_producer_state);

        //
        // Copy gmem to smem for *k_tile_iter
        //

        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_producer_state);

        int write_stage = smem_pipe_producer_state.index();

        // Issue both TMA copies into the current pipeline stage; the producer
        // barrier tracks arrival of the transaction bytes for the consumer.
        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
        ++k_tile_iter;

        // Advance smem_pipe_producer_state
        ++smem_pipe_producer_state;
      }
    }
  }
632
+
633
+ /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
634
+ CUTLASS_DEVICE void
635
+ load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_producer_state) {
636
+ int lane_predicate = cute::elect_one_sync();
637
+
638
+ // Issue the epilogue waits
639
+ if (lane_predicate) {
640
+ /* This helps avoid early exit of blocks in Cluster
641
+ * Waits for all stages to either be released (all
642
+ * Consumer UNLOCKs), or if the stage was never used
643
+ * then would just be acquired since the phase was
644
+ * still inverted from make_producer_start_state
645
+ */
646
+ pipeline.producer_tail(smem_pipe_producer_state);
647
+ }
648
+ }
649
+
650
  /// Perform a collective-scoped matrix multiply-accumulate
  /// Consumer Perspective
  template <class FrgTensorC>
  CUTLASS_DEVICE void
  mma(MainloopPipeline pipeline,
      PipelineState smem_pipe_consumer_state,
      FrgTensorC& accum,
      int k_tile_count,
      int thread_idx,
      TensorStorage& shared_tensors,
      Params const& mainloop_params) {
    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");

    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)

    //
    // Define C accumulators and A/B partitioning
    //

    TiledMma tiled_mma;
    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);

    Tensor tCsA = thread_mma.partition_A(sA);  // (MMA,MMA_M,MMA_K,PIPE)
    Tensor tCsB = thread_mma.partition_B(sB);  // (MMA,MMA_N,MMA_K,PIPE)

    // Allocate "fragments/descriptors"
    Tensor tCrA = thread_mma.make_fragment_A(tCsA);  // (MMA,MMA_M,MMA_K,PIPE)
    Tensor tCrB = thread_mma.make_fragment_B(tCsB);  // (MMA,MMA_N,MMA_K,PIPE)

    // Sanity-check that partitioned modes agree across A, B and the accumulators.
    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));               // M
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));               // N
    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                // K
    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                // PIPE
    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));  // PIPE
    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));  // PIPE

    //
    // PIPELINED MAIN LOOP
    //
    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX),
        "ERROR : Incorrect number of MMAs in flight");

    // We release buffers to producer warps(dma load) with some mmas in flight
    PipelineState smem_pipe_release = smem_pipe_consumer_state;

    // Prologue GMMAs
    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);

    // First GMMA writes (not accumulates) into the accumulators.
    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;

    warpgroup_fence_operand(accum);
    CUTLASS_PRAGMA_UNROLL
    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) {
      // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value)
      pipeline.consumer_wait(smem_pipe_consumer_state);

      int read_stage = smem_pipe_consumer_state.index();
      warpgroup_arrive();
      // Unroll the K mode manually to set scale D to 1
      CUTLASS_PRAGMA_UNROLL
      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
        // (V,M,K) x (V,N,K) => (V,M,N)
        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
      }

      warpgroup_commit_batch();

      // Note: the prologue does NOT release stages; releases start in the
      // mainloop, keeping K_PIPE_MMAS GMMAs in flight.
      ++smem_pipe_consumer_state;
    }

    warpgroup_fence_operand(accum);
    // Mainloop GMMAs
    k_tile_count -= prologue_mma_count;

    CUTLASS_PRAGMA_NO_UNROLL
    for ( ; k_tile_count > 0; --k_tile_count) {
      // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value)
      pipeline.consumer_wait(smem_pipe_consumer_state);

      //
      // Compute on k_tile
      //

      int read_stage = smem_pipe_consumer_state.index();
      warpgroup_fence_operand(accum);
      warpgroup_arrive();
      // Unroll the K mode manually to set scale D to 1
      CUTLASS_PRAGMA_UNROLL
      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
        // (V,M) x (V,N) => (V,M,N)
        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
      }
      warpgroup_commit_batch();

      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_producer_state is consumed
      warpgroup_wait<K_PIPE_MMAS>();
      warpgroup_fence_operand(accum);

      // UNLOCK smem_pipe_release, done _computing_ on it
      pipeline.consumer_release(smem_pipe_release);

      // Advance smem_pipe_consumer_state and smem_pipe_release
      ++smem_pipe_consumer_state;
      ++smem_pipe_release;
    }

    warpgroup_fence_operand(accum);
  }
761
+
762
+ /// Perform a Consumer Epilogue to release all buffers
763
+ CUTLASS_DEVICE void
764
+ mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
765
+ // Prologue GMMAs
766
+ int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
767
+ k_tile_count -= prologue_mma_count;
768
+
769
+ smem_pipe_release.advance(k_tile_count);
770
+
771
+ // Wait on all GMMAs to complete
772
+ warpgroup_wait<0>();
773
+
774
+ for (int count = 0; count < prologue_mma_count; ++count) {
775
+ pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it
776
+ ++smem_pipe_release;
777
+ }
778
+ }
779
+ };
780
+
781
+ /////////////////////////////////////////////////////////////////////////////////////////////////
782
+
783
+ } // namespace cutlass::conv::collective
784
+
785
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv2d_problem_size.h ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief This file contains definitions and utility functions for describing convolution problem sizes.
33
+
34
+ Conv2dProblem description:
35
+ activation (NHWC),
36
+ filter (KRSC),
37
+ output (NPQK),
38
+ padding (pad_h, pad_w),
39
+ stride (stride_h, stride_w),
40
+ dilation (dilation_h, dilation_w).
41
+
42
+ Free functions to map:
43
+ Map tensor extents (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator)
44
+ Map tensor sizes (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator)
45
+ Map tensor problem sizes (Conv2d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator)
46
+ */
47
+
48
+ #pragma once
49
+
50
+ #include "cutlass/cutlass.h"
51
+ #include "cutlass/tensor_coord.h"
52
+ #include "cutlass/fast_math.h"
53
+ #include "cutlass/gemm/gemm_enumerated_types.h"
54
+ #include "cutlass/matrix_coord.h"
55
+ #include "cutlass/conv/convolution.h"
56
+ #include "cutlass/functional.h"
57
+
58
+ namespace cutlass {
59
+ namespace conv {
60
+
61
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
62
+
63
+ /// Problem size structure
64
+ struct Conv2dProblemSize {
65
+
66
+ // Conv2d strictly problem size parameters
67
+ int N, H, W, C, P, Q, K, R, S;
68
+ int pad_h, pad_w;
69
+ int stride_h, stride_w;
70
+ int dilation_h, dilation_w;
71
+ Mode mode;
72
+
73
+ // Conv2d implementation-related parameters
74
+ int split_k_slices;
75
+ int groups;
76
+
77
+ //
78
+ // Methods
79
+ //
80
+
81
+ public:
82
  /// Default constructor: zero extents, no padding, unit stride/dilation,
  /// convolution mode, single split-K slice and a single group.
  CUTLASS_HOST_DEVICE
  Conv2dProblemSize():
    N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0),
    pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1),
    mode(Mode::kConvolution), split_k_slices(1), groups(1) { }
87
+
88
  /// Constructor for default padding, stride, dilation, and split-K
  /// (padding defaults to R/2 and S/2; stride and dilation default to 1).
  CUTLASS_HOST_DEVICE
  Conv2dProblemSize(
    int N,
    int H,
    int W,
    int C,
    int P,
    int Q,
    int K,
    int R,
    int S,
    Mode mode
  ):
    N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S),
    pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1),
    mode(mode), split_k_slices(1), groups (1) { }
106
+ /// Constructor
107
+ CUTLASS_HOST_DEVICE
108
+ Conv2dProblemSize(
109
+ int N,
110
+ int H,
111
+ int W,
112
+ int C,
113
+ int K,
114
+ int R,
115
+ int S,
116
+ int P,
117
+ int Q,
118
+ int pad_h,
119
+ int pad_w,
120
+ int stride_h,
121
+ int stride_w,
122
+ int dilation_h,
123
+ int dilation_w,
124
+ Mode mode,
125
+ int split_k_slices = 1,
126
+ int groups = 1
127
+ ):
128
+ N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S),
129
+ pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w),
130
+ dilation_h(dilation_h), dilation_w(dilation_w),
131
+ mode(mode), split_k_slices(split_k_slices), groups (groups) { }
132
+
133
  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord
  // set user-defined output size and sets P and Q (include all data members in ctor)
  // Only padding[0] and padding[2] (the lower pads) are stored; the upper pads
  // are implied by the caller-supplied output extent.
  CUTLASS_HOST_DEVICE
  Conv2dProblemSize(
    cutlass::Tensor4DCoord input_size,    // NHWC
    cutlass::Tensor4DCoord filter_size,   // KRSC
    cutlass::Tensor4DCoord padding,       // pad_h, _, pad_w, _
    cutlass::MatrixCoord stride,          // stride_h, stride_w
    cutlass::MatrixCoord dilation,        // dilation_h, dilation_w
    cutlass::Tensor4DCoord output_size,   // NPQK
    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
    int split_k_slices = 1,
    int groups = 1
  ):
    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
    P(output_size.h()), Q(output_size.w()),
    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
    pad_h(padding[0]), pad_w(padding[2]),
    stride_h(stride.row()), stride_w(stride.column()),
    dilation_h(dilation.row()), dilation_w(dilation.column()),
    mode(mode), split_k_slices(split_k_slices), groups(groups) {}
154
+
155
  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord
  // computes output size and sets P and Q (skip output from ctor arguments)
  CUTLASS_HOST_DEVICE
  Conv2dProblemSize(
    cutlass::Tensor4DCoord input_size,    // NHWC
    cutlass::Tensor4DCoord filter_size,   // KRSC
    cutlass::Tensor4DCoord padding,       // pad_h, upper_pad_h, pad_w, upper_pad_w
    cutlass::MatrixCoord stride,          // stride_h, stride_w
    cutlass::MatrixCoord dilation,        // dilation_h, dilation_w
    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
    int split_k_slices = 1,
    int groups = 1
  ):
    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
    pad_h(padding[0]), pad_w(padding[2]),
    stride_h(stride.row()), stride_w(stride.column()),
    dilation_h(dilation.row()), dilation_w(dilation.column()),
    mode(mode), split_k_slices(split_k_slices), groups(groups) {
    // set output P and Q
    // NOTE(review): uses (R * dilation_h) rather than the more common
    // ((R - 1) * dilation_h + 1); the two agree only for unit dilation --
    // assumed to be the intended library convention, TODO confirm.
    P = ((H + pad_h + padding[1] - R * dilation_h) / stride_h) + 1;
    Q = ((W + pad_w + padding[3] - S * dilation_w) / stride_w) + 1;
  }
178
+
179
  /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord
  // set user-defined output size and sets P and Q (skip padding, striding, and dilation)
  // Padding defaults to R/2 and S/2; stride and dilation default to 1.
  CUTLASS_HOST_DEVICE
  Conv2dProblemSize(
    cutlass::Tensor4DCoord input_size,   // NHWC
    cutlass::Tensor4DCoord filter_size,  // KRSC
    cutlass::Tensor4DCoord output_size,  // NPQK
    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
    int split_k_slices = 1,
    int groups = 1
  ):
    N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()),
    P(output_size.h()), Q(output_size.w()),
    K(filter_size.n()), R(filter_size.h()), S(filter_size.w()),
    pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1),
    dilation_h(1), dilation_w(1),
    mode(mode), split_k_slices(split_k_slices), groups(groups) {}
196
+
197
  // Returns a copy of this problem with the convolution mode replaced.
  CUTLASS_HOST_DEVICE
  Conv2dProblemSize reset_mode(cutlass::conv::Mode mode_) {
    Conv2dProblemSize tmp(*this);
    tmp.mode = mode_;
    return tmp;
  }
204
+
205
  // Returns a copy of this problem with split_k_slices replaced.
  CUTLASS_HOST_DEVICE
  Conv2dProblemSize reset_split_k_slices(int split_k_slices_) {
    Conv2dProblemSize tmp(*this);
    tmp.split_k_slices = split_k_slices_;
    return tmp;
  }
212
+
213
+ /// Equality operator (ignores mode and split_k_slice)
214
+ CUTLASS_HOST_DEVICE
215
+ bool operator==(Conv2dProblemSize const &conv) const {
216
+ return (
217
+ (N == conv.N) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
218
+ (K == conv.K) && (R == conv.R) && (S == conv.S) &&
219
+ (P == conv.P) && (Q == conv.Q) &&
220
+ (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
221
+ (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
222
+ (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
223
+ );
224
+ }
225
+
226
+ /// Inequality operator
227
+ CUTLASS_HOST_DEVICE
228
+ bool operator!=(Conv2dProblemSize const &rhs) const {
229
+ return !(*this == rhs);
230
+ }
231
+
232
+ /// Returns activation extent as Tensor4DCoord
233
+ CUTLASS_HOST_DEVICE
234
+ cutlass::Tensor4DCoord activation_extent() const {
235
+
236
+ return cutlass::Tensor4DCoord ({N, H, W, C});
237
+ }
238
+
239
+ /// Returns filter extent as Tensor4DCoord
240
+ CUTLASS_HOST_DEVICE
241
+ cutlass::Tensor4DCoord filter_extent(bool is_deconv = false) const {
242
+
243
+ return is_deconv ? cutlass::Tensor4DCoord ({C, R, S, K / groups})
244
+ : cutlass::Tensor4DCoord ({K, R, S, C / groups});
245
+ }
246
+
247
+ /// Returns output extent as Tensor4DCoord
248
+ CUTLASS_HOST_DEVICE
249
+ cutlass::Tensor4DCoord output_extent() const {
250
+
251
+ return cutlass::Tensor4DCoord ({N, P, Q, K});
252
+ }
253
+
254
+ /// Returns activation size in number of elements
255
+ CUTLASS_HOST_DEVICE
256
+ int64_t activation_size() const {
257
+
258
+ return static_cast<int64_t>(N) * static_cast<int64_t>(H) *
259
+ static_cast<int64_t>(W) * static_cast<int64_t>(C);
260
+ }
261
+
262
+ /// Returns filter size in number of elements
263
+ CUTLASS_HOST_DEVICE
264
+ int64_t filter_size() const {
265
+
266
+ return static_cast<int64_t>(K) * static_cast<int64_t>(R) *
267
+ static_cast<int64_t>(S) * static_cast<int64_t>(C) /
268
+ static_cast<int64_t>(groups);
269
+ }
270
+
271
+ /// Returns output size in number of elements
272
+ CUTLASS_HOST_DEVICE
273
+ int64_t output_size() const {
274
+
275
+ return static_cast<int64_t>(N) * static_cast<int64_t>(P) *
276
+ static_cast<int64_t>(Q) * static_cast<int64_t>(K);
277
+ }
278
+
279
+ /// Returns padding as Tensor4DCoord
280
+ CUTLASS_HOST_DEVICE
281
+ cutlass::Tensor4DCoord padding() const {
282
+
283
+ return cutlass::Tensor4DCoord ({pad_h, pad_h, pad_w, pad_w});
284
+ }
285
+
286
+ /// Returns stride as MatrixCoord
287
+ CUTLASS_HOST_DEVICE
288
+ cutlass::MatrixCoord stride() const {
289
+
290
+ return cutlass::MatrixCoord ({stride_h, stride_w});
291
+ }
292
+
293
+ /// Returns dilation as MatrixCoord
294
+ CUTLASS_HOST_DEVICE
295
+ cutlass::MatrixCoord dilation() const {
296
+
297
+ return cutlass::MatrixCoord ({dilation_h, dilation_w});
298
+ }
299
+
300
+ /////////////////////////////////////////////////////////////////
301
+ // Methods used for strided dgrad implementation
302
+ /////////////////////////////////////////////////////////////////
303
+ /// Number of filter r positions to accumulate in gemm-k dim
304
+ CUTLASS_HOST_DEVICE
305
+ int num_gemm_k_filter_r(int r) const {
306
+ return ((R - r + stride_h - 1) / stride_h);
307
+ }
308
+
309
+ /// Number of filter s positions to accumulate in gemm-k dim
310
+ CUTLASS_HOST_DEVICE
311
+ int num_gemm_k_filter_s(int s) const {
312
+ return ((S - s + stride_w - 1) / stride_w);
313
+ }
314
+
315
+ /// Number of filter positions to accumulate in gemm-k dim
316
+ CUTLASS_HOST_DEVICE
317
+ int num_gemm_k_filter_positions(int r, int s) const {
318
+ return num_gemm_k_filter_r(r) * num_gemm_k_filter_s(s);
319
+ }
320
+ };
321
+
322
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
323
+ // ImplicitGemm helper functions //
324
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
325
+
326
+ /// Determine the problem size of the implicit GEMM operation
327
+ CUTLASS_HOST_DEVICE
328
+ cutlass::gemm::GemmCoord implicit_gemm_problem_size(
329
+ Operator conv_operator,
330
+ Conv2dProblemSize const &problem_size) {
331
+ // Compute problem size
332
+ switch (conv_operator) {
333
+ case Operator::kFprop:
334
+ return gemm::GemmCoord(
335
+ problem_size.N * problem_size.P * problem_size.Q,
336
+ problem_size.K,
337
+ problem_size.R * problem_size.S * problem_size.C / problem_size.groups
338
+ );
339
+ case Operator::kDeconv:
340
+ case Operator::kDgrad:
341
+ return gemm::GemmCoord(
342
+ problem_size.N * problem_size.H * problem_size.W,
343
+ problem_size.C,
344
+ problem_size.R * problem_size.S * problem_size.K
345
+ );
346
+ case Operator::kWgrad:
347
+ return gemm::GemmCoord(
348
+ problem_size.K,
349
+ problem_size.R * problem_size.S * problem_size.C,
350
+ problem_size.N * problem_size.P * problem_size.Q
351
+ );
352
+ default:
353
+ break;
354
+ }
355
+ return gemm::GemmCoord();
356
+ }
357
+
358
+ // Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm
359
+ CUTLASS_HOST_DEVICE
360
+ int implicit_gemm_k_iterations(
361
+ Operator conv_operator,
362
+ int threadblock_K,
363
+ Conv2dProblemSize const &problem_size,
364
+ IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
365
+ GroupMode group_mode = GroupMode::kNone,
366
+ int threadblock_N = 0) {
367
+
368
+ int iterations = 0;
369
+
370
+ if (group_mode == GroupMode::kNone) {
371
+
372
+ if (algorithm == IteratorAlgorithm::kFixedChannels) {
373
+
374
+ int positions_per_iteration = threadblock_K / problem_size.C;
375
+ switch (conv_operator) {
376
+ case Operator::kFprop:
377
+ iterations = (problem_size.R * problem_size.S + positions_per_iteration - 1 ) / positions_per_iteration;
378
+ break;
379
+
380
+ default:
381
+ break;
382
+ }
383
+ }
384
+ else if (algorithm == IteratorAlgorithm::kFewChannels) {
385
+
386
+ switch (conv_operator) {
387
+ case Operator::kFprop:
388
+ iterations = (problem_size.R * problem_size.S * problem_size.C + threadblock_K - 1 ) / threadblock_K;
389
+ break;
390
+
391
+ default:
392
+ break;
393
+ }
394
+ }
395
+ else {
396
+ int elements_per_split_k_slice = 0;
397
+
398
+ switch (conv_operator) {
399
+ case Operator::kFprop:
400
+ elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
401
+ iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
402
+ break;
403
+
404
+ case Operator::kDeconv:
405
+ case Operator::kDgrad:
406
+ elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
407
+ iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
408
+ break;
409
+
410
+ case Operator::kWgrad:
411
+ elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
412
+ iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
413
+ break;
414
+
415
+ default:
416
+ break;
417
+ }
418
+ }
419
+
420
+ } else if (group_mode == GroupMode::kDepthwise) {
421
+ int channels_per_cta = threadblock_N;
422
+
423
+ if (algorithm == IteratorAlgorithm::kAnalytic) {
424
+ switch (conv_operator) {
425
+ case Operator::kFprop:
426
+ iterations = problem_size.R * problem_size.S *
427
+ ((channels_per_cta + threadblock_K - 1) / threadblock_K);
428
+ break;
429
+
430
+ default:
431
+ break;
432
+ }
433
+ }
434
+ } else { // Group conv
435
+
436
+ int channels_per_group = problem_size.C / problem_size.groups;
437
+ int k_per_group = problem_size.K / problem_size.groups;
438
+
439
+ if (algorithm == IteratorAlgorithm::kAnalytic) {
440
+ switch (conv_operator) {
441
+ case Operator::kFprop:
442
+ iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K);
443
+ // In group conv, if k_per_group < threadblock_N, one Threadblock will calculate multiple groups
444
+ if (problem_size.groups != 1) {
445
+ if (k_per_group < threadblock_N) {
446
+ iterations *= threadblock_N / k_per_group;
447
+ }
448
+ }
449
+ break;
450
+
451
+ default:
452
+ break;
453
+ }
454
+ } else if (algorithm == IteratorAlgorithm::kOptimized) {
455
+ // Current optimized iterator only support GroupMode::kSingleGroup
456
+ if (group_mode == GroupMode::kSingleGroup) {
457
+ switch (conv_operator) {
458
+ case Operator::kFprop:
459
+ iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K);
460
+ break;
461
+
462
+ default:
463
+ break;
464
+ }
465
+ }
466
+ }
467
+
468
+ }
469
+
470
+ return iterations;
471
+ }
472
+
473
+
474
+ template <int N = 1, int Output_P = 1, int Output_Q = 1>
475
+ CUTLASS_HOST_DEVICE
476
+ int depthwise_gemm_k_iterations(
477
+ Operator conv_operator,
478
+ int threadblock_K,
479
+ Conv2dProblemSize const &problem_size,
480
+ IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
481
+ GroupMode group_mode = GroupMode::kNone,
482
+ int threadblock_N = 0) {
483
+
484
+ int n = problem_size.N;
485
+ int p = (problem_size.P + Output_P - 1) / Output_P;
486
+ int q = (problem_size.Q + Output_Q - 1) / Output_Q;
487
+
488
+ int iterations = (n * p * q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
489
+ return iterations;
490
+ }
491
+
492
+
493
+ CUTLASS_HOST_DEVICE
494
+ int implicit_gemm_k_iterations_per_channel(
495
+ Operator conv_operator,
496
+ Conv2dProblemSize const &problem_size,
497
+ IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) {
498
+
499
+ int iterations = 0; //0 means not applicable
500
+ if (algorithm == IteratorAlgorithm::kAnalytic || algorithm == IteratorAlgorithm::kOptimized) {
501
+ switch (conv_operator) {
502
+ case Operator::kFprop:
503
+ iterations = problem_size.R * problem_size.S;
504
+ break;
505
+
506
+ case Operator::kDeconv:
507
+ case Operator::kDgrad:
508
+ iterations = problem_size.R * problem_size.S;
509
+ break;
510
+
511
+ default:
512
+ break;
513
+ }
514
+ }
515
+ return iterations;
516
+ }
517
+
518
+ ////////////////////////////////////////////////////////////////////////////////
519
+ // Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output)
520
+ ////////////////////////////////////////////////////////////////////////////////
521
+ /// Returns ImplicitGemm tensor A extent as Tensor4DCoord
522
+ CUTLASS_HOST_DEVICE
523
+ cutlass::Tensor4DCoord implicit_gemm_tensor_a_extent(
524
+ Operator conv_operator,
525
+ Conv2dProblemSize const &problem_size) {
526
+ switch (conv_operator) {
527
+ case cutlass::conv::Operator::kFprop: return problem_size.activation_extent();
528
+ case cutlass::conv::Operator::kDeconv:
529
+ case cutlass::conv::Operator::kDgrad: return problem_size.output_extent();
530
+ case cutlass::conv::Operator::kWgrad: return problem_size.output_extent();
531
+ default : break;
532
+ }
533
+ return cutlass::Tensor4DCoord();
534
+ }
535
+
536
+ /// Returns ImplicitGemm tensor B extent as Tensor4DCoord
537
+ CUTLASS_HOST_DEVICE
538
+ cutlass::Tensor4DCoord implicit_gemm_tensor_b_extent(
539
+ Operator conv_operator,
540
+ Conv2dProblemSize const &problem_size) {
541
+ switch (conv_operator) {
542
+ case cutlass::conv::Operator::kFprop: return problem_size.filter_extent();
543
+ case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true);
544
+ case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent();
545
+ case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent();
546
+ default : break;
547
+ }
548
+ return cutlass::Tensor4DCoord();
549
+ }
550
+
551
+ /// Returns ImplicitGemm tensor C extent as Tensor4DCoord
552
+ CUTLASS_HOST_DEVICE
553
+ cutlass::Tensor4DCoord implicit_gemm_tensor_c_extent(
554
+ Operator conv_operator,
555
+ Conv2dProblemSize const &problem_size) {
556
+ switch (conv_operator) {
557
+ case cutlass::conv::Operator::kFprop: return problem_size.output_extent();
558
+ case cutlass::conv::Operator::kDeconv:
559
+ case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent();
560
+ case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent();
561
+ default : break;
562
+ }
563
+ return cutlass::Tensor4DCoord();
564
+ }
565
+
566
+ /// Returns ImplicitGemm tensor A size in number of elements
567
+ CUTLASS_HOST_DEVICE
568
+ int64_t implicit_gemm_tensor_a_size(
569
+ Operator conv_operator,
570
+ Conv2dProblemSize const &problem_size) {
571
+ switch (conv_operator) {
572
+ case cutlass::conv::Operator::kFprop: return problem_size.activation_size();
573
+ case cutlass::conv::Operator::kDeconv:
574
+ case cutlass::conv::Operator::kDgrad: return problem_size.output_size();
575
+ case cutlass::conv::Operator::kWgrad: return problem_size.output_size();
576
+ default : break;
577
+ }
578
+ return 0;
579
+ }
580
+
581
+ /// Returns ImplicitGemm tensor B size in number of elements
582
+ CUTLASS_HOST_DEVICE
583
+ int64_t implicit_gemm_tensor_b_size(
584
+ Operator conv_operator,
585
+ Conv2dProblemSize const &problem_size) {
586
+ switch (conv_operator) {
587
+ case cutlass::conv::Operator::kFprop: return problem_size.filter_size();
588
+ case cutlass::conv::Operator::kDeconv:
589
+ case cutlass::conv::Operator::kDgrad: return problem_size.filter_size();
590
+ case cutlass::conv::Operator::kWgrad: return problem_size.activation_size();
591
+ default : break;
592
+ }
593
+ return 0;
594
+ }
595
+
596
+ /// Returns ImplicitGemm tensor C size in number of elements
597
+ CUTLASS_HOST_DEVICE
598
+ int64_t implicit_gemm_tensor_c_size(
599
+ Operator conv_operator,
600
+ Conv2dProblemSize const &problem_size) {
601
+ switch (conv_operator) {
602
+ case cutlass::conv::Operator::kFprop: return problem_size.output_size();
603
+ case cutlass::conv::Operator::kDeconv:
604
+ case cutlass::conv::Operator::kDgrad: return problem_size.activation_size();
605
+ case cutlass::conv::Operator::kWgrad: return problem_size.filter_size();
606
+ default : break;
607
+ }
608
+ return 0;
609
+ }
610
+
611
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
612
+
613
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
614
+ // Strided dgrad helper functions //
615
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
616
+ // Returns number of CTAs tile M to cover valid MMAs per starting filter postion
617
+ CUTLASS_HOST_DEVICE
618
+ int strided_dgrad_tile_m_per_filter(
619
+ Conv2dProblemSize const &problem_size,
620
+ int tile_size_m) {
621
+
622
+ // Compute NHW rows in Dx output that needs MMA per starting filter position
623
+ int rows_h_per_filter = (problem_size.H + problem_size.stride_h - 1) / problem_size.stride_h;
624
+ int rows_w_per_filter = (problem_size.W + problem_size.stride_w - 1) / problem_size.stride_w;
625
+ int rows_nhw_per_filter = problem_size.N * rows_h_per_filter * rows_w_per_filter;
626
+
627
+ // Number of CTAs tile M to cover valid MMAs per starting filter postion
628
+ int tile_m_per_filter = (rows_nhw_per_filter + tile_size_m - 1) / tile_size_m;
629
+
630
+ return tile_m_per_filter;
631
+ }
632
+
633
+ // Computes starting Dx coord (h, w) for given starting filter postion
634
+ CUTLASS_HOST_DEVICE
635
+ void strided_dgrad_starting_coords(
636
+ Conv2dProblemSize const &problem_size,
637
+ FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod,
638
+ int r, int s,
639
+ int &start_h, int &start_w) {
640
+
641
+ // function locals for remainder by fast divmod
642
+ int pad_h_rem_, pad_w_rem_;
643
+
644
+ // start_h = std::abs(problem_size.stride_h - ((problem_size.pad_h % problem_size.stride_h) - r)) % problem_size.stride_h;
645
+ stride_h_divmod.divmod(pad_h_rem_, problem_size.pad_h);
646
+ int r_ = absolute_value(problem_size.stride_h - (pad_h_rem_ - r));
647
+ stride_h_divmod.divmod(start_h, r_);
648
+
649
+ //start_w = std::abs(problem_size.stride_w - ((problem_size.pad_w % problem_size.stride_w) - s)) % problem_size.stride_w;
650
+ stride_w_divmod.divmod(pad_w_rem_, problem_size.pad_w);
651
+ int s_ = absolute_value(problem_size.stride_w - (pad_w_rem_ - s));
652
+ stride_w_divmod.divmod(start_w, s_);
653
+ }
654
+
655
+ } // namespace conv
656
+ } // namespace cutlass
657
+
658
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv3d_problem_size.h ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief This file contains definitions and utility functions for describing convolution problem sizes.
33
+
34
+ Conv3dProblem desciption:
35
+ activation (NDHWC),
36
+ filter (KTRSC),
37
+ output (NZPQK),
38
+ pading (pad_d, pad_h, pad_w),
39
+ stride (stride_d, stride_h, stride_w),
40
+ dilation (dilation_d, dilation_h, dilation_w).
41
+
42
+ Free functions to map:
43
+ Map tensor extents (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator)
44
+ Map tensor sizes (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator)
45
+ Map tensor problem sizes (Conv3d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator)
46
+ */
47
+
48
+ #pragma once
49
+
50
+ #include "cutlass/conv/convolution.h"
51
+ #include "cutlass/conv/conv2d_problem_size.h"
52
+
53
+ namespace cutlass {
54
+ namespace conv {
55
+
56
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
57
+
58
+ /// Problem size structure
59
+ struct Conv3dProblemSize : public Conv2dProblemSize {
60
+ //
61
+ // Type definitions
62
+ //
63
+
64
+ // 3D coordinate for padding, stride, and dilation in (d, h, w) dimensions
65
+ using Coord3D = Coord<3>;
66
+
67
+ //
68
+ // Data members
69
+ //
70
+
71
+ // Conv3d strictly problem size parameters
72
+ int D, T, Z; // input depth, filter depth, output depth
73
+ int pad_d; // padding in depth dimension
74
+ int stride_d; // stride in depth dimension
75
+ int dilation_d; // dilation in depth dimension
76
+
77
+ //
78
+ // Methods
79
+ //
80
+ public:
81
+ CUTLASS_HOST_DEVICE
82
+ Conv3dProblemSize():
83
+ Conv2dProblemSize(),
84
+ D(0), T(0), Z(0),
85
+ pad_d(0),
86
+ stride_d(1),
87
+ dilation_d(1) { }
88
+
89
+ /// Constructor for default padding, stride, dilation, and split-K
90
+ CUTLASS_HOST_DEVICE
91
+ Conv3dProblemSize(
92
+ int N,
93
+ int D,
94
+ int H,
95
+ int W,
96
+ int C,
97
+ int Z,
98
+ int P,
99
+ int Q,
100
+ int K,
101
+ int T,
102
+ int R,
103
+ int S,
104
+ Mode mode
105
+ ):
106
+ Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode),
107
+ D(D), T(T), Z(Z),
108
+ pad_d(T / 2), stride_d(1), dilation_d(1) { }
109
+
110
+ /// Constructor
111
+ CUTLASS_HOST_DEVICE
112
+ Conv3dProblemSize(
113
+ int N,
114
+ int D,
115
+ int H,
116
+ int W,
117
+ int C,
118
+ int K,
119
+ int T,
120
+ int R,
121
+ int S,
122
+ int Z,
123
+ int P,
124
+ int Q,
125
+ int pad_d,
126
+ int pad_h,
127
+ int pad_w,
128
+ int stride_d,
129
+ int stride_h,
130
+ int stride_w,
131
+ int dilation_d,
132
+ int dilation_h,
133
+ int dilation_w,
134
+ Mode mode,
135
+ int split_k_slices = 1,
136
+ int groups = 1
137
+ ):
138
+ Conv2dProblemSize(
139
+ N, H, W, C, K, R, S, P, Q,
140
+ pad_h, pad_w,
141
+ stride_h, stride_w,
142
+ dilation_h, dilation_w,
143
+ mode, split_k_slices, groups),
144
+ D(D), T(T), Z(Z),
145
+ pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d) { }
146
+
147
+ /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D
148
+ // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor)
149
+ CUTLASS_HOST_DEVICE
150
+ Conv3dProblemSize(
151
+ cutlass::Tensor5DCoord input_size, // NDHWC
152
+ cutlass::Tensor5DCoord filter_size, // KTRSC
153
+ Coord3D padding, // pad_d, pad_h, pad_w
154
+ Coord3D stride, // stride_d, stride_h, stride_w
155
+ Coord3D dilation, // dilation_d, dilation_h, dilation_w
156
+ cutlass::Tensor5DCoord output_size, // NZPQK
157
+ cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
158
+ int split_k_slices = 1,
159
+ int groups = 1
160
+ ):
161
+ Conv2dProblemSize(
162
+ {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
163
+ {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
164
+ {padding[1], padding[1], padding[2], padding[2]},
165
+ {stride[1], stride[2]},
166
+ {dilation[1], dilation[2]},
167
+ {output_size.n(), output_size.h(), output_size.w(), output_size.c()},
168
+ mode, split_k_slices, groups),
169
+ D(input_size.d()), T(filter_size.d()), Z(output_size.d()),
170
+ pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) { }
171
+
172
+ /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D
173
+ // *computes* output size and sets Z, P and Q (include all data members in ctor)
174
+ CUTLASS_HOST_DEVICE
175
+ Conv3dProblemSize(
176
+ cutlass::Tensor5DCoord input_size, // NDHWC
177
+ cutlass::Tensor5DCoord filter_size, // KTRSC
178
+ Coord3D padding, // pad_d, pad_h, pad_w
179
+ Coord3D stride, // stride_d, stride_h, stride_w
180
+ Coord3D dilation, // dilation_d, dilation_h, dilation_w
181
+ cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
182
+ int split_k_slices = 1,
183
+ int groups = 1
184
+ ):
185
+ Conv2dProblemSize(
186
+ {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
187
+ {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
188
+ {padding[1], padding[1], padding[2], padding[2]},
189
+ {stride[1], stride[2]},
190
+ {dilation[1], dilation[2]},
191
+ mode, split_k_slices, groups),
192
+ D(input_size.d()), T(filter_size.d()),
193
+ pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0])
194
+ {
195
+ // set output Z
196
+ Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1;
197
+ }
198
+
199
+ /// Constructs convolution problem size from cutlass Tensor5DCoord, Coord3D
200
+ // *computes* output size and sets Z, P and Q (include all data members in ctor)
201
+ CUTLASS_HOST_DEVICE
202
+ Conv3dProblemSize(
203
+ cutlass::Tensor5DCoord input_size, // NDHWC
204
+ cutlass::Tensor5DCoord filter_size, // KTRSC
205
+ CUTLASS_STL_NAMESPACE::tuple<Coord3D, Coord3D> padding, // Coord3D {pad_d, pad_h, pad_w} & Coord3D {far pad_d, pad_h, pad_w} to calculate o/p/q
206
+ Coord3D stride, // stride_d, stride_h, stride_w
207
+ Coord3D dilation, // dilation_d, dilation_h, dilation_w
208
+ cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
209
+ int split_k_slices = 1,
210
+ int groups = 1
211
+ ):
212
+ Conv2dProblemSize(
213
+ {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
214
+ {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
215
+ {CUTLASS_STL_NAMESPACE::get<0>(padding)[1], CUTLASS_STL_NAMESPACE::get<1>(padding)[1],
216
+ CUTLASS_STL_NAMESPACE::get<0>(padding)[2], CUTLASS_STL_NAMESPACE::get<1>(padding)[2]},
217
+ {stride[1], stride[2]},
218
+ {dilation[1], dilation[2]},
219
+ mode, split_k_slices, groups),
220
+ D(input_size.d()), T(filter_size.d()),
221
+ pad_d(CUTLASS_STL_NAMESPACE::get<0>(padding)[0]), stride_d(stride[0]), dilation_d(dilation[0])
222
+ {
223
+ // set output Z
224
+ Z = ((D + pad_d + CUTLASS_STL_NAMESPACE::get<1>(padding)[0] - T * dilation_d) / stride_d) + 1;
225
+ }
226
+
227
+ /// Equality operator (ignores mode and split_k_slice)
228
+ CUTLASS_HOST_DEVICE
229
+ bool operator==(Conv3dProblemSize const &conv) const {
230
+ return (
231
+ (N == conv.N) && (D == conv.D) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
232
+ (K == conv.K) && (T == conv.T) && (R == conv.R) && (S == conv.S) &&
233
+ (Z == conv.Z) &&(P == conv.P) && (Q == conv.Q) &&
234
+ (pad_d == conv.pad_d) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
235
+ (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
236
+ (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
237
+ );
238
+ }
239
+
240
+ /// Inequality operator
241
+ CUTLASS_HOST_DEVICE
242
+ bool operator!=(Conv3dProblemSize const &rhs) const {
243
+ return !(*this == rhs);
244
+ }
245
+
246
+ // Reset covolution mode in the problem
247
+ CUTLASS_HOST_DEVICE
248
+ Conv3dProblemSize reset_mode(cutlass::conv::Mode mode_) {
249
+ Conv3dProblemSize tmp(*this);
250
+ tmp.mode = mode_;
251
+ return tmp;
252
+ }
253
+
254
+ // Reset covolution mode in the problem
255
+ CUTLASS_HOST_DEVICE
256
+ Conv3dProblemSize reset_split_k_slices(int split_k_slices_) {
257
+ Conv3dProblemSize tmp(*this);
258
+ tmp.split_k_slices = split_k_slices_;
259
+ return tmp;
260
+ }
261
+
262
+ /// Returns activation extent as Tensor5DCoord
263
+ CUTLASS_HOST_DEVICE
264
+ cutlass::Tensor5DCoord activation_extent() const {
265
+
266
+ return cutlass::Tensor5DCoord ({N, D, H, W, C});
267
+ }
268
+
269
+ /// Returns filter extent as Tensor5DCoord
270
+ CUTLASS_HOST_DEVICE
271
+ cutlass::Tensor5DCoord filter_extent(bool is_deconv = false) const {
272
+
273
+ return is_deconv ? cutlass::Tensor5DCoord ({C, T, R, S, K})
274
+ : cutlass::Tensor5DCoord ({K, T, R, S, C});
275
+ }
276
+
277
+ /// Returns output extent as Tensor5DCoord
278
+ CUTLASS_HOST_DEVICE
279
+ cutlass::Tensor5DCoord output_extent() const {
280
+
281
+ return cutlass::Tensor5DCoord ({N, Z, P, Q, K});
282
+ }
283
+
284
+ /// Returns activation size in number of elements
285
+ CUTLASS_HOST_DEVICE
286
+ int64_t activation_size() const {
287
+
288
+ return static_cast<int64_t>(N) * static_cast<int64_t>(D) *
289
+ static_cast<int64_t>(H) * static_cast<int64_t>(W) *
290
+ static_cast<int64_t>(C);
291
+ }
292
+
293
+ /// Returns filter size in number of elements
294
+ CUTLASS_HOST_DEVICE
295
+ int64_t filter_size() const {
296
+
297
+ return static_cast<int64_t>(K) * static_cast<int64_t>(T) *
298
+ static_cast<int64_t>(R) * static_cast<int64_t>(S) *
299
+ static_cast<int64_t>(C);
300
+ }
301
+
302
+ /// Returns output size in number of elements
303
+ CUTLASS_HOST_DEVICE
304
+ int64_t output_size() const {
305
+
306
+ return static_cast<int64_t>(N) * static_cast<int64_t>(Z) *
307
+ static_cast<int64_t>(P) * static_cast<int64_t>(Q) *
308
+ static_cast<int64_t>(K);
309
+ }
310
+
311
+ /// Returns padding as Coord3D
312
+ CUTLASS_HOST_DEVICE
313
+ Coord3D padding() const {
314
+
315
+ return Coord3D ({pad_d, pad_h, pad_w});
316
+ }
317
+
318
+ /// Returns stride as MatrixCoord
319
+ CUTLASS_HOST_DEVICE
320
+ Coord3D stride() const {
321
+
322
+ return Coord3D ({stride_d, stride_h, stride_w});
323
+ }
324
+
325
+ /// Returns dilation as MatrixCoord
326
+ CUTLASS_HOST_DEVICE
327
+ Coord3D dilation() const {
328
+
329
+ return Coord3D ({dilation_d, dilation_h, dilation_w});
330
+ }
331
+
332
+ };
333
+
334
+
335
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
336
+ // ImplicitGemm helper functions //
337
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
338
+
339
+ /// Determine the problem size of the implicit GEMM operation
340
+ CUTLASS_HOST_DEVICE
341
+ cutlass::gemm::GemmCoord implicit_gemm_problem_size(
342
+ Operator conv_operator,
343
+ Conv3dProblemSize const &problem_size) {
344
+ // Compute problem size
345
+ switch (conv_operator) {
346
+ case Operator::kFprop:
347
+ return gemm::GemmCoord(
348
+ problem_size.N * problem_size.Z * problem_size.P * problem_size.Q,
349
+ problem_size.K,
350
+ problem_size.T * problem_size.R * problem_size.S * problem_size.C
351
+ );
352
+ case Operator::kDeconv:
353
+ case Operator::kDgrad:
354
+ return gemm::GemmCoord(
355
+ problem_size.N * problem_size.D * problem_size.H * problem_size.W,
356
+ problem_size.C,
357
+ problem_size.T * problem_size.R * problem_size.S * problem_size.K
358
+ );
359
+ case Operator::kWgrad:
360
+ return gemm::GemmCoord(
361
+ problem_size.K,
362
+ problem_size.T * problem_size.R * problem_size.S * problem_size.C,
363
+ problem_size.N * problem_size.Z * problem_size.P * problem_size.Q
364
+ );
365
+ default:
366
+ break;
367
+ }
368
+ return gemm::GemmCoord();
369
+ }
370
+
371
+ // Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm
372
+ CUTLASS_HOST_DEVICE
373
+ int implicit_gemm_k_iterations(
374
+ Operator conv_operator,
375
+ int threadblock_K,
376
+ Conv3dProblemSize const &problem_size,
377
+ IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic,
378
+ GroupMode group_mode = GroupMode::kNone,
379
+ int threadblock_N = 0) {
380
+
381
+ int iterations = 0;
382
+ int elements_per_split_k_slice = 0;
383
+ if (group_mode == GroupMode::kNone) {
384
+ switch (conv_operator) {
385
+ case Operator::kFprop:
386
+ elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
387
+ iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
388
+ break;
389
+
390
+ case Operator::kDeconv:
391
+ case Operator::kDgrad:
392
+ elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
393
+ iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
394
+ break;
395
+
396
+ case Operator::kWgrad:
397
+ elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
398
+ iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
399
+ break;
400
+
401
+ default:
402
+ break;
403
+ }
404
+ } else if (group_mode == GroupMode::kDepthwise) {
405
+ int channels_per_cta = threadblock_N;
406
+
407
+ if (algorithm == IteratorAlgorithm::kAnalytic) {
408
+ switch (conv_operator) {
409
+ case Operator::kFprop:
410
+ iterations = problem_size.T * problem_size.R * problem_size.S *
411
+ ((channels_per_cta + threadblock_K - 1) / threadblock_K);
412
+ break;
413
+
414
+ default:
415
+ break;
416
+ }
417
+ }
418
+ }
419
+
420
+ return iterations;
421
+ }
422
+
423
+ ////////////////////////////////////////////////////////////////////////////////
424
+ // Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output)
425
+ ////////////////////////////////////////////////////////////////////////////////
426
+ /// Returns ImplicitGemm tensor A extent as Tensor5DCoord
427
+ CUTLASS_HOST_DEVICE
428
+ cutlass::Tensor5DCoord implicit_gemm_tensor_a_extent(
429
+ Operator conv_operator,
430
+ Conv3dProblemSize const &problem_size) {
431
+ switch (conv_operator) {
432
+ case cutlass::conv::Operator::kFprop: return problem_size.activation_extent();
433
+ case cutlass::conv::Operator::kDeconv:
434
+ case cutlass::conv::Operator::kDgrad: return problem_size.output_extent();
435
+ case cutlass::conv::Operator::kWgrad: return problem_size.output_extent();
436
+ default : break;
437
+ }
438
+ return cutlass::Tensor5DCoord();
439
+ }
440
+
441
+ /// Returns ImplicitGemm tensor B extent as Tensor5DCoord
442
+ CUTLASS_HOST_DEVICE
443
+ cutlass::Tensor5DCoord implicit_gemm_tensor_b_extent(
444
+ Operator conv_operator,
445
+ Conv3dProblemSize const &problem_size) {
446
+ switch (conv_operator) {
447
+ case cutlass::conv::Operator::kFprop: return problem_size.filter_extent();
448
+ case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true);
449
+ case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent();
450
+ case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent();
451
+ default : break;
452
+ }
453
+ return cutlass::Tensor5DCoord();
454
+ }
455
+
456
+ /// Returns ImplicitGemm tensor C extent as Tensor5DCoord
457
+ CUTLASS_HOST_DEVICE
458
+ cutlass::Tensor5DCoord implicit_gemm_tensor_c_extent(
459
+ Operator conv_operator,
460
+ Conv3dProblemSize const &problem_size) {
461
+ switch (conv_operator) {
462
+ case cutlass::conv::Operator::kFprop: return problem_size.output_extent();
463
+ case cutlass::conv::Operator::kDeconv:
464
+ case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent();
465
+ case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent();
466
+ default : break;
467
+ }
468
+ return cutlass::Tensor5DCoord();
469
+ }
470
+
471
+ /// Returns ImplicitGemm tensor A size in number of elements
472
+ CUTLASS_HOST_DEVICE
473
+ int64_t implicit_gemm_tensor_a_size(
474
+ Operator conv_operator,
475
+ Conv3dProblemSize const &problem_size) {
476
+ switch (conv_operator) {
477
+ case cutlass::conv::Operator::kFprop: return problem_size.activation_size();
478
+ case cutlass::conv::Operator::kDeconv:
479
+ case cutlass::conv::Operator::kDgrad: return problem_size.output_size();
480
+ case cutlass::conv::Operator::kWgrad: return problem_size.output_size();
481
+ default : break;
482
+ }
483
+ return 0;
484
+ }
485
+
486
+ /// Returns ImplicitGemm tensor B size in number of elements
487
+ CUTLASS_HOST_DEVICE
488
+ int64_t implicit_gemm_tensor_b_size(
489
+ Operator conv_operator,
490
+ Conv3dProblemSize const &problem_size) {
491
+ switch (conv_operator) {
492
+ case cutlass::conv::Operator::kFprop: return problem_size.filter_size();
493
+ case cutlass::conv::Operator::kDeconv:
494
+ case cutlass::conv::Operator::kDgrad: return problem_size.filter_size();
495
+ case cutlass::conv::Operator::kWgrad: return problem_size.activation_size();
496
+ default : break;
497
+ }
498
+ return 0;
499
+ }
500
+
501
+ /// Returns ImplicitGemm tensor C size in number of elements
502
+ CUTLASS_HOST_DEVICE
503
+ int64_t implicit_gemm_tensor_c_size(
504
+ Operator conv_operator,
505
+ Conv3dProblemSize const &problem_size) {
506
+ switch (conv_operator) {
507
+ case cutlass::conv::Operator::kFprop: return problem_size.output_size();
508
+ case cutlass::conv::Operator::kDeconv:
509
+ case cutlass::conv::Operator::kDgrad: return problem_size.activation_size();
510
+ case cutlass::conv::Operator::kWgrad: return problem_size.filter_size();
511
+ default : break;
512
+ }
513
+ return 0;
514
+ }
515
+
516
+ } // namespace conv
517
+ } // namespace cutlass
518
+
519
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convnd_problem_shape.hpp ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief This file contains definitions and utility functions for describing convolution problem shapes.
33
+ */
34
+ #pragma once
35
+
36
+ #include "cutlass/cutlass.h"
37
+ #include "cutlass/tensor_coord.h"
38
+ #include "cutlass/conv/convolution.h"
39
+
40
+ #include "cute/container/array.hpp"
41
+
42
+ #if ! defined(__CUDACC_RTC__)
43
+ #include <initializer_list>
44
+ #endif
45
+
46
+
47
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
48
+
49
+ namespace cutlass::conv {
50
+
51
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ // Implements the user facing argument for all CUTLASS 3.x convolutions in a rank agnostic fashion.
54
+ // All tensors are flat and by default treated as layout right (NDHWC, KTRSC, NZPQK)
55
+ // Supports asymmetric padding, traversal strides, dilations, and all conv algorithm types.
56
+ template <
57
+ conv::Operator ConvOp_,
58
+ int NumSpatialDimensions_
59
+ >
60
+ struct ConvProblemShape {
61
+ //
62
+ // Alias types for members
63
+ //
64
+
65
+ static constexpr int RankS = NumSpatialDimensions_;
66
+ static constexpr int RankT = NumSpatialDimensions_ + 2;
67
+ static constexpr conv::Operator ConvOp = ConvOp_;
68
+ static constexpr int NumSpatialDimensions = NumSpatialDimensions_;
69
+ using SpatialExtent = cute::array<int, RankS>;
70
+ using TensorExtent = cute::array<int, RankT>;
71
+ using TensorStride = cute::array<int64_t, RankT>;
72
+ using ShapePadding = SpatialExtent;
73
+ using TraversalStride = SpatialExtent;
74
+ using ShapeDilation = SpatialExtent;
75
+ using Corner = SpatialExtent;
76
+
77
+ //
78
+ // Members
79
+ //
80
+ cutlass::conv::Mode mode{};
81
+ TensorExtent shape_A{};
82
+ TensorStride stride_A{};
83
+ TensorExtent shape_B{};
84
+ TensorStride stride_B{};
85
+ TensorExtent shape_C{};
86
+ TensorStride stride_C{};
87
+
88
+ // asymmetric padding, both upper and lower padding must be >= 0
89
+ ShapePadding lower_padding{};
90
+ ShapePadding upper_padding{};
91
+ TraversalStride traversal_stride{};
92
+ ShapeDilation dilation{};
93
+ int groups = 1;
94
+
95
+ //
96
+ // Methods
97
+ //
98
+
99
+ ConvProblemShape() = default;
100
+
101
+ // Constructor accepts user facing arguments and computes to stores the corners as its internal state
102
+ ConvProblemShape(
103
+ conv::Mode mode, // convolution/cross-correlation
104
+ TensorExtent shape_act, // [n,d,h,w,c]
105
+ TensorStride stride_act, // [n,d,h,w,c]
106
+ TensorExtent shape_flt, // [k,t,r,s,c]
107
+ TensorStride stride_flt, // [k,t,r,s,c]
108
+ ShapePadding lower_padding, // [pad_d, pad_h, pad_w]
109
+ ShapePadding upper_padding, // [pad_d, pad_h, pad_w]
110
+ TraversalStride tstride, // [stride_d, stride_h, stride_w]
111
+ ShapeDilation dilation, // [dilation_d, dilation_h, dilation_w]
112
+ int groups)
113
+ : mode(mode)
114
+ , lower_padding(lower_padding)
115
+ , upper_padding(upper_padding)
116
+ , traversal_stride(tstride)
117
+ , dilation(dilation)
118
+ , groups(groups) {
119
+
120
+ auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
121
+ set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
122
+ }
123
+
124
+ // Allow user input of xformed activation stride to support non-packed strides.
125
+ ConvProblemShape(
126
+ conv::Mode mode, // convolution/cross-correlation
127
+ TensorExtent shape_act, // [n,d,h,w,c]
128
+ TensorStride stride_act, // [n,d,h,w,c]
129
+ TensorExtent shape_flt, // [k,t,r,s,c]
130
+ TensorStride stride_flt, // [k,t,r,s,c]
131
+ TensorStride stride_xformed_act, // [n,z,p,q,k]
132
+ ShapePadding lower_padding, // [pad_d, pad_h, pad_w]
133
+ ShapePadding upper_padding, // [pad_d, pad_h, pad_w]
134
+ TraversalStride tstride, // [stride_d, stride_h, stride_w]
135
+ ShapeDilation dilation, // [dilation_d, dilation_h, dilation_w]
136
+ int groups)
137
+ : mode(mode)
138
+ , lower_padding(lower_padding)
139
+ , upper_padding(upper_padding)
140
+ , traversal_stride(tstride)
141
+ , dilation(dilation)
142
+ , groups(groups) {
143
+
144
+ CUTLASS_ASSERT(stride_act[RankT - 1] == 1);
145
+ CUTLASS_ASSERT(stride_flt[RankT - 1] == 1);
146
+ CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1);
147
+
148
+ auto stride_act_packed = packed_stride_right_major(shape_act);
149
+ auto stride_flt_packed = packed_stride_right_major(shape_flt);
150
+ auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt);
151
+
152
+ CUTLASS_PRAGMA_UNROLL
153
+ for(int i = 0; i < RankT - 1; ++i) {
154
+ CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]);
155
+ CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]);
156
+ CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]);
157
+ }
158
+
159
+ set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
160
+ }
161
+
162
+ // Constructor accepts user facing arguments and presume packed tensor strides in canonical (CWHDN) order.
163
+ ConvProblemShape(
164
+ conv::Mode mode,
165
+ TensorExtent shape_act,
166
+ TensorExtent shape_flt,
167
+ ShapePadding lower_padding,
168
+ ShapePadding upper_padding,
169
+ TraversalStride tstride,
170
+ ShapeDilation dilation,
171
+ int groups)
172
+ : ConvProblemShape(
173
+ mode,
174
+ shape_act,
175
+ packed_stride_right_major(shape_act),
176
+ shape_flt,
177
+ packed_stride_right_major(shape_flt),
178
+ lower_padding,
179
+ upper_padding,
180
+ tstride,
181
+ dilation,
182
+ groups) {
183
+ }
184
+
185
+ #if ! defined(__CUDACC_RTC__)
186
+ // Constructor accepts user facing arguments and computes to stores the corners as its internal state
187
+ ConvProblemShape(
188
+ conv::Mode mode,
189
+ std::initializer_list<int> shape_act_,
190
+ std::initializer_list<int64_t> stride_act_,
191
+ std::initializer_list<int> shape_flt_,
192
+ std::initializer_list<int64_t> stride_flt_,
193
+ std::initializer_list<int> lower_padding_,
194
+ std::initializer_list<int> upper_padding_,
195
+ std::initializer_list<int> traversal_stride_,
196
+ std::initializer_list<int> dilation_,
197
+ int groups)
198
+ : mode(mode)
199
+ , groups(groups) {
200
+
201
+ TensorExtent shape_act{};
202
+ TensorStride stride_act{};
203
+ TensorExtent shape_flt{};
204
+ TensorStride stride_flt{};
205
+
206
+ assert(shape_act_.size() == shape_act.size());
207
+ assert(stride_act_.size() == stride_act.size());
208
+ assert(shape_flt_.size() == shape_flt.size());
209
+ assert(stride_flt_.size() == stride_flt.size());
210
+ assert(lower_padding_.size() == lower_padding.size());
211
+ assert(upper_padding_.size() == upper_padding.size());
212
+ assert(traversal_stride_.size() == traversal_stride.size());
213
+ assert(dilation_.size() == dilation.size());
214
+
215
+ std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
216
+ std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin());
217
+ std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
218
+ std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin());
219
+ std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
220
+ std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
221
+ std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
222
+ std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
223
+
224
+ auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
225
+ set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
226
+ }
227
+
228
+ // Allow user input of xformed activation stride to support non-packed strides.
229
+ ConvProblemShape(
230
+ conv::Mode mode,
231
+ std::initializer_list<int> shape_act_,
232
+ std::initializer_list<int64_t> stride_act_,
233
+ std::initializer_list<int> shape_flt_,
234
+ std::initializer_list<int64_t> stride_flt_,
235
+ std::initializer_list<int64_t> stride_xformed_act_,
236
+ std::initializer_list<int> lower_padding_,
237
+ std::initializer_list<int> upper_padding_,
238
+ std::initializer_list<int> traversal_stride_,
239
+ std::initializer_list<int> dilation_,
240
+ int groups)
241
+ : mode(mode)
242
+ , groups(groups) {
243
+ TensorExtent shape_act{};
244
+ TensorStride stride_act{};
245
+ TensorExtent shape_flt{};
246
+ TensorStride stride_flt{};
247
+ TensorStride stride_xformed_act{};
248
+
249
+ std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
250
+ std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin());
251
+ std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
252
+ std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin());
253
+ std::copy(stride_xformed_act_.begin(), stride_xformed_act_.end(), stride_xformed_act.begin());
254
+ std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
255
+ std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
256
+ std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
257
+ std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
258
+
259
+ CUTLASS_ASSERT(stride_act[RankT - 1] == 1);
260
+ CUTLASS_ASSERT(stride_flt[RankT - 1] == 1);
261
+ CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1);
262
+
263
+ auto stride_act_packed = packed_stride_right_major(shape_act);
264
+ auto stride_flt_packed = packed_stride_right_major(shape_flt);
265
+ auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt);
266
+
267
+ CUTLASS_PRAGMA_UNROLL
268
+ for(int i = 0; i < RankT - 1; ++i) {
269
+ CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]);
270
+ CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]);
271
+ CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]);
272
+ }
273
+
274
+ set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
275
+ }
276
+
277
+ // Constructor accepts user facing arguments and computes to stores the corners as its internal state
278
+ ConvProblemShape(
279
+ conv::Mode mode,
280
+ std::initializer_list<int> shape_act_,
281
+ std::initializer_list<int> shape_flt_,
282
+ std::initializer_list<int> lower_padding_,
283
+ std::initializer_list<int> upper_padding_,
284
+ std::initializer_list<int> traversal_stride_,
285
+ std::initializer_list<int> dilation_,
286
+ int groups)
287
+ : mode(mode)
288
+ , groups(groups) {
289
+ TensorExtent shape_act{};
290
+ TensorStride stride_act{};
291
+ TensorExtent shape_flt{};
292
+ TensorStride stride_flt{};
293
+
294
+ assert(shape_act_.size() == shape_act.size());
295
+ assert(shape_flt_.size() == shape_flt.size());
296
+ assert(lower_padding_.size() == lower_padding.size());
297
+ assert(upper_padding_.size() == upper_padding.size());
298
+ assert(traversal_stride_.size() == traversal_stride.size());
299
+ assert(dilation_.size() == dilation.size());
300
+
301
+ std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin());
302
+ std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin());
303
+ std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin());
304
+ std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin());
305
+ std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin());
306
+ std::copy(dilation_.begin(), dilation_.end(), dilation.begin());
307
+ stride_act = packed_stride_right_major(shape_act);
308
+ stride_flt = packed_stride_right_major(shape_flt);
309
+
310
+ auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt);
311
+ set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act);
312
+ }
313
+ #endif // not defined(__CUDACC_RTC__)
314
+
315
+ // Set shape and stride of tensor A/B/C according to following table:
316
+ // | | Fprop | Dgrad | Wgrad |
317
+ // | ------ | ------ | ------ | ------|
318
+ // | ShapeA | NDHWC | NZPQK | NZPQK |
319
+ // | ShapeB | KTRSC | KTRSC | NDHWC |
320
+ // | ShapeC | NZPQK | NDHWC | KTRSC |
321
+ //
322
+ // Input comes from calculate_xformed_act, which does NOT depend on ConvOp.
323
+ CUTLASS_HOST_DEVICE
324
+ constexpr void
325
+ set_shape_stride_ABC(
326
+ TensorExtent shape_act,
327
+ TensorStride stride_act,
328
+ TensorExtent shape_flt,
329
+ TensorStride stride_flt,
330
+ TensorExtent shape_xformed_act,
331
+ TensorStride stride_xformed_act) {
332
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
333
+ printf("*** set_shape_stride_ABC ***");
334
+ printf("\n shape_act: ");
335
+ print(shape_act);
336
+ printf("\n stride_act: ");
337
+ print(stride_act);
338
+ printf("\n shape_flt: ");
339
+ print(shape_flt);
340
+ printf("\n stride_flt: ");
341
+ print(stride_flt);
342
+ printf("\n shape_xformed_act: ");
343
+ print(shape_xformed_act);
344
+ printf("\n stride_xformed_act: ");
345
+ print(stride_xformed_act);
346
+ if constexpr (ConvOp == cutlass::conv::Operator::kFprop) {
347
+ printf("\n ConvOp: Fprop");
348
+ }
349
+ if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) {
350
+ printf("\n ConvOp: Dgrad");
351
+ }
352
+ if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
353
+ printf("\n ConvOp: Wgrad");
354
+ }
355
+ printf("\n");
356
+ #endif
357
+
358
+ if constexpr (ConvOp == cutlass::conv::Operator::kFprop) {
359
+ shape_A = shape_act;
360
+ stride_A = stride_act;
361
+ shape_B = shape_flt;
362
+ stride_B = stride_flt;
363
+ shape_C = shape_xformed_act;
364
+ stride_C = stride_xformed_act;
365
+ }
366
+ else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) {
367
+ shape_A = shape_xformed_act;
368
+ stride_A = stride_xformed_act;
369
+ shape_B = shape_flt;
370
+ stride_B = stride_flt;
371
+ shape_C = shape_act;
372
+ stride_C = stride_act;
373
+ }
374
+ else if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
375
+ shape_A = shape_xformed_act;
376
+ stride_A = stride_xformed_act;
377
+ shape_B = shape_act;
378
+ stride_B = stride_act;
379
+ shape_C = shape_flt;
380
+ stride_C = stride_flt;
381
+ }
382
+ #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
383
+ printf("\n shape_A: ");
384
+ print(shape_A);
385
+ printf("\n stride_A: ");
386
+ print(stride_A);
387
+ printf("\n shape_B: ");
388
+ print(shape_B);
389
+ printf("\n stride_B: ");
390
+ print(stride_B);
391
+ printf("\n shape_C: ");
392
+ print(shape_C);
393
+ printf("\n stride_C: ");
394
+ print(stride_C);
395
+ #endif
396
+ }
397
+
398
+ // Get A extents.
399
+ // fprop: A extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C))
400
+ // dgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K))
401
+ // wgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((K), (Q,P,Z,N))
402
+ CUTLASS_HOST_DEVICE
403
+ constexpr auto
404
+ get_shape_A() const {
405
+ using cute::make_shape;
406
+ using cute::take;
407
+
408
+ if constexpr (ConvOp == conv::Operator::kFprop ||
409
+ ConvOp == conv::Operator::kDgrad) {
410
+ return make_shape(
411
+ cute::reverse(take<0, RankT - 1>(shape_A)),
412
+ shape_A[RankT - 1]);
413
+ }
414
+ // For wgrad kernel, we need to linearize NZPQ for tensor A
415
+ else if constexpr (ConvOp == conv::Operator::kWgrad) {
416
+ return make_shape(
417
+ shape_A[RankT - 1],
418
+ cute::product(take<0, RankT - 1>(shape_A)));
419
+ }
420
+ }
421
+
422
+ // Get B extents.
423
+ // fprop: B extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T))
424
+ // dgrad: B extents array contains [K,T,R,S,C]. Turn that into ((C), (K,S,R,T))
425
+ // wgrad: B extents array contains [N,D,H,W,C]. Turn that into ((C), (W,H,D,N))
426
+ CUTLASS_HOST_DEVICE
427
+ constexpr auto
428
+ get_shape_B() const {
429
+ using cute::make_shape;
430
+ using cute::reverse;
431
+ using cute::take;
432
+
433
+ if constexpr (ConvOp == conv::Operator::kFprop) {
434
+ return make_shape(
435
+ shape_B[0],
436
+ reverse(take<1, RankT>(shape_B)));
437
+ }
438
+ else if constexpr (ConvOp == conv::Operator::kWgrad) {
439
+ return make_shape(
440
+ shape_B[RankT - 1],
441
+ reverse(take<0, RankT - 1>(shape_B)));
442
+ }
443
+ else if constexpr (ConvOp == conv::Operator::kDgrad) {
444
+ // shape_B: [K,T,R,S,C], return: [(C),(K,S,R,T)]
445
+ return make_shape(
446
+ shape_B[RankT - 1],
447
+ cute::insert<0>(
448
+ reverse(take<1, RankT - 1>(shape_B)),
449
+ shape_B[0]));
450
+ }
451
+ }
452
+
453
+ // Get C extents.
454
+ // fprop: C extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K))
455
+ // dgrad: C extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C))
456
+ // wgrad: C extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T))
457
+ CUTLASS_HOST_DEVICE
458
+ constexpr auto
459
+ get_shape_C() const {
460
+ using cute::make_shape;
461
+ using cute::reverse;
462
+ using cute::take;
463
+
464
+ if constexpr (ConvOp == conv::Operator::kFprop ||
465
+ ConvOp == conv::Operator::kDgrad) {
466
+ return make_shape(
467
+ reverse(take<0, RankT - 1>(shape_C)),
468
+ shape_C[RankT - 1]);
469
+ }
470
+ else if constexpr (ConvOp == conv::Operator::kWgrad) {
471
+ return make_shape(
472
+ shape_C[0],
473
+ reverse(take<1, RankT>(shape_C)));
474
+ }
475
+ }
476
+
477
+ // Static method that returns the canonical strides of tensors (layouts are right major and compact)
478
+ CUTLASS_HOST_DEVICE
479
+ static constexpr TensorStride
480
+ packed_stride_right_major(TensorExtent const& extents) {
481
+ TensorStride strides{};
482
+ strides[RankT-1] = 1;
483
+ cute::for_each(cute::make_rseq<RankT-1>{}, [&](auto i) {
484
+ strides[i] = extents[i+1] * strides[i+1];
485
+ });
486
+ return strides;
487
+ }
488
+
489
+ // Static method that returns the packed logical size of any TensorExtent
490
+ CUTLASS_HOST_DEVICE
491
+ static constexpr size_t
492
+ size(TensorExtent const& extents) {
493
+ size_t size = 1;
494
+ cute::for_each(cute::make_seq<RankT>{}, [&](auto i) {
495
+ size *= extents[i];
496
+ });
497
+ return size;
498
+ }
499
+
500
+ CUTLASS_HOST_DEVICE
501
+ constexpr size_t
502
+ size_A() const {
503
+ return shape_A[0] * stride_A[0];
504
+ }
505
+
506
+ CUTLASS_HOST_DEVICE
507
+ constexpr size_t
508
+ size_B() const {
509
+ return shape_B[0] * stride_B[0];
510
+ }
511
+
512
+ CUTLASS_HOST_DEVICE
513
+ constexpr size_t
514
+ size_C() const {
515
+ return shape_C[0] * stride_C[0];
516
+ }
517
+
518
+ // Equality operator
519
+ CUTLASS_HOST_DEVICE
520
+ bool operator==(ConvProblemShape<ConvOp, NumSpatialDimensions> const& rhs) const {
521
+ using cute::for_each;
522
+ using cute::make_seq;
523
+
524
+ bool is_equal = true;
525
+
526
+ // Compare all tensor extents
527
+ for_each(make_seq<RankT>{}, [&](auto i) {
528
+ is_equal = is_equal
529
+ && (shape_A[i] == rhs.shape_A[i])
530
+ && (shape_B[i] == rhs.shape_B[i]);
531
+ });
532
+
533
+ // Compare all spatial extents
534
+ for_each(make_seq<RankS>{}, [&](auto i) {
535
+ is_equal = is_equal
536
+ && (lower_padding[i] == rhs.lower_padding[i])
537
+ && (upper_padding[i] == rhs.upper_padding[i])
538
+ && (traversal_stride[i] == rhs.traversal_stride[i])
539
+ && (dilation[i] == rhs.dilation[i]);
540
+ });
541
+
542
+ return is_equal;
543
+ }
544
+
545
+ /// Inequality operator
546
+ CUTLASS_HOST_DEVICE
547
+ bool operator!=(ConvProblemShape<ConvOp, NumSpatialDimensions> const &rhs) const {
548
+ return !(*this == rhs);
549
+ }
550
+
551
+ private:
552
+ CUTLASS_HOST_DEVICE
553
+ constexpr auto
554
+ calculate_xformed_act(TensorExtent shape_act, TensorExtent shape_flt) {
555
+ TensorExtent shape_xformed_act{};
556
+ // calculate n,z,p,q,k.
557
+ // a helper lambda to compute a single spatial extent of the nzpqk tensor
558
+ auto nzpqk_extent = [](int act_ext, int filter_ext, int pad_total, int dilation, int tstride) {
559
+ return 1 + (act_ext + pad_total - ((filter_ext -1) * dilation + 1)) / tstride;
560
+ };
561
+
562
+ shape_xformed_act[0] = shape_act[0]; // Activation N extent
563
+ cute::for_each(cute::make_seq<RankS>{}, [&](auto i) {
564
+ shape_xformed_act[i+1] = nzpqk_extent(
565
+ shape_act[i+1], shape_flt[i+1], upper_padding[i] + lower_padding[i], dilation[i], traversal_stride[i]);
566
+ });
567
+ shape_xformed_act[RankT-1] = shape_flt[0]; // Filter K extent
568
+
569
+ TensorStride stride_xformed_act = packed_stride_right_major(shape_xformed_act);
570
+
571
+ return cute::make_tuple(shape_xformed_act, stride_xformed_act);
572
+ }
573
+ };
574
+
575
+ template<
576
+ conv::Operator ConvOp,
577
+ int SpatialDim
578
+ >
579
+ void print(ConvProblemShape<ConvOp, SpatialDim> const& problem) {
580
+ printf("ConvProblemShape with %d spatial dimensions implementing cutlass::conv::Operator::%d\n",
581
+ SpatialDim, int(ConvOp));
582
+ printf("\tTensorA: ");
583
+ cute::print(problem.shape_A); printf(":");
584
+ cute::print(problem.stride_A); printf("\n");
585
+ printf("\tTensorB: ");
586
+ cute::print(problem.shape_B); printf(":");
587
+ cute::print(problem.stride_B); printf("\n");
588
+ printf("\tTensorC: ");
589
+ cute::print(problem.shape_C); printf(":");
590
+ cute::print(problem.stride_C); printf("\n");
591
+ printf("\tLower padding: "); print(problem.lower_padding); printf("\n");
592
+ printf("\tUpper padding: "); print(problem.upper_padding); printf("\n");
593
+ printf("\tTraversal strides: "); print(problem.traversal_stride); printf("\n");
594
+ printf("\tDilation: "); print(problem.dilation); printf("\n");
595
+ }
596
+
597
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
598
+
599
+ } // namespace cutlass::conv
600
+
601
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convolution.h ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief
33
+
34
+ This file contains definitions and utility functions for describing convolution problem sizes in terms of
35
+ activation (NHWC), filter (KRSC), output (NPQK), padding (pad_h, pad_w), stride (stride_h, stride_w), and
36
+ dilation (dilation_h, dilation_w). Furthermore, it defines helper functions to map CUTLASS's implicit gemm
37
+ tensor extents, sizes, and data types to that of the convolution's extents, sizes, and data types.
38
+
39
+ * Mapping convolutions to Gemm computation *
40
+
41
+ Cutlass implements convolutions with the Implicit Gemm algorithm. This algorithm performs a gemm
42
+ (general matrix-matrix multiply) on the convolution tensors Activation, Filter, and Output.
43
+ The underlying gemm operation follows the standard gemm definition:
44
+
45
+ C = A * B + C
46
+
47
+ A and B are input matrices
48
+ C is source and output matrix
49
+
50
+
51
+ For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped
52
+ to convolution tensors Activation, Filter and Output as described in the table below.
53
+
54
+ ___________________________________________________________________________
55
+ ConvolutionalOperator | A | B | C
56
+ ___________________________________________________________________________
57
+ | | | | |
58
+ | Fprop | Activation | Filter | Output |
59
+ | Dgrad | Output | Filter | Activation |
60
+ | Wgrad | Output | Activation | Filter |
61
+ ___________________________________________________________________________
62
+
63
+ In convolution codebase, DO NOT mix using (A, B, C) with (Activation, Filter, Output).
64
+
65
+ For example, it's confusing and error prone to document a convolution class or function
66
+ as operating on "A, B, Output." Instead, use the mapping functions below,
67
+ and adhere to using either A, B, C or Activation, Filter, Output.
68
+
69
+ Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap
70
+ Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap
71
+ */
72
+
73
+ #pragma once
74
+
75
+ #include "cutlass/cutlass.h"
76
+ #include "cutlass/layout/tensor.h"
77
+ #include "cutlass/tensor_coord.h"
78
+ #include "cutlass/fast_math.h"
79
+ #include "cutlass/gemm/gemm_enumerated_types.h"
80
+ #include "cutlass/matrix_coord.h"
81
+
82
+ namespace cutlass {
83
+ namespace conv {
84
+
85
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
86
+
87
/// Convolutional operator.
/// Operand mapping to the implicit gemm (see file header table):
///   Fprop: A = Activation, B = Filter,     C = Output
///   Dgrad: A = Output,     B = Filter,     C = Activation
///   Wgrad: A = Output,     B = Activation, C = Filter
enum class Operator {
  kFprop,   ///< forward propagation
  kDgrad,   ///< data (activation) gradient
  kWgrad,   ///< weight (filter) gradient
  kDeconv   ///< deconvolution (transposed convolution)
};

/// Distinguishes convolution from cross correlation
/// (i.e. whether the filter is logically reversed before being applied).
enum class Mode {
  kCrossCorrelation,
  kConvolution
};

/// Selects among several implementation variants trading off performance with simplicity
enum class IteratorAlgorithm {
  kAnalytic,           ///< functionally correct in all cases but lower performance
  kOptimized,          ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
  kFixedChannels,      ///< Analytic algorithm optimized for fixed channel count (C == AccessSize)
  kFewChannels,        ///< Analytic algorithm optimized for few channels (C divisible by AccessSize)
  kFixedStrideDilation ///< Optimized for fixed stride and dilation
};

/// Distinguishes among partial specializations that accelerate certain problems where convolution
/// stride is unit.
enum class StrideSupport {
  kStrided, ///< arbitrary convolution stride
  kUnity,   ///< unit convolution stride
  kFixed    ///< fixed convolution stride
};

/// Identifies split-K mode
enum class SplitKMode {
  kNone,     ///< no split-K decomposition
  kSerial,   ///< split-K partials reduced serially
  kParallel  ///< split-K partials reduced in parallel
};

/// Identifies group mode
enum class GroupMode {
  kNone,
  kSingleGroup,   ///< One CTA calculates one group or less
  kMultipleGroup, ///< One CTA calculates multiple groups
  kDepthwise      ///< One CTA calculates cta_n groups (problem_size.C == problem_size.K == problem_size.groups)
};
132
+
133
+ /////////////////////////////////////////////////////////////////////////////////////////////////
134
+
135
+ /// Shape of a tensor
136
+ template <
137
+ int N = 1,
138
+ int H = 1,
139
+ int W = 1,
140
+ int C = 1
141
+ >
142
+ struct TensorNHWCShape {
143
+ static int const kN = N;
144
+ static int const kH = H;
145
+ static int const kW = W;
146
+ static int const kC = C;
147
+
148
+ static int const kHW = H * W;
149
+ static int const kNHW = N * kHW;
150
+ static int const kNHWC = N * H * W * C;
151
+
152
+ static int const kCount = kNHWC;
153
+
154
+ //
155
+ // Static member functions
156
+ //
157
+
158
+ /// Returns a Coord object
159
+ CUTLASS_HOST_DEVICE
160
+ static Coord<4> toCoord() {
161
+ return make_Coord(kN, kH, kW, kC);
162
+ }
163
+ };
164
+
165
+ /////////////////////////////////////////////////////////////////////////////////////////////////
166
+
167
+ /// Shape of a conv2d stride, which controls how the filter convolves around the input volume
168
+ template <
169
+ /// Stride in horizontal direction
170
+ int u = 1,
171
+ /// Stride in vertical direction
172
+ int v = 1
173
+ >
174
+ struct Stride2D {
175
+ static int const kU = u;
176
+ static int const kV = v;
177
+
178
+ //
179
+ // Static member functions
180
+ //
181
+
182
+ /// Returns a Coord object
183
+ CUTLASS_HOST_DEVICE
184
+ static Coord<2> toCoord() {
185
+ return make_Coord(kU, kV);
186
+ }
187
+ };
188
+
189
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
190
+
191
+ } // namespace conv
192
+ } // namespace cutlass
193
+
194
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/detail.hpp ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /***************************************************************************************************
3
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ *
6
+ * Redistribution and use in source and binary forms, with or without
7
+ * modification, are permitted provided that the following conditions are met:
8
+ *
9
+ * 1. Redistributions of source code must retain the above copyright notice, this
10
+ * list of conditions and the following disclaimer.
11
+ *
12
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ * this list of conditions and the following disclaimer in the documentation
14
+ * and/or other materials provided with the distribution.
15
+ *
16
+ * 3. Neither the name of the copyright holder nor the names of its
17
+ * contributors may be used to endorse or promote products derived from
18
+ * this software without specific prior written permission.
19
+ *
20
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+ *
31
+ **************************************************************************************************/
32
+ #pragma once
33
+
34
+ #include "cutlass/conv/convnd_problem_shape.hpp"
35
+
36
+ /////////////////////////////////////////////////////////////////////////////////////////////////
37
+
38
+ namespace cutlass::conv::detail {
39
+
40
+ /////////////////////////////////////////////////////////////////////////////////////////////////
41
+
42
+ // Helper function to get the problem shape
43
+ template <typename T, class ProblemShape>
44
+ auto get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::true_type) {
45
+ return T::get_problem_shape_MNKL(problem_shape);
46
+ }
47
+
48
+ template <typename T, class ProblemShape>
49
+ ProblemShape get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::false_type) {
50
+ return problem_shape;
51
+ }
52
+
53
+ // Get problem shape MNKL according to following table:
54
+ // | | Fprop | Dgrad | Wgrad |
55
+ // | ---- | --------- | -------- | -------- |
56
+ // | Shape_M | (Q,P,Z,N) | (W/V,H/U,D/O,N) | (K) |
57
+ // | Shape_N | (K) | (C) | (C,S,R,T) |
58
+ // | Shape_K | (C,S,R,T) | (K,S,R,T) | (Q,P,Z,N) |
59
+ // | Shape_L | _1 | (V,U,O) | _1 |
60
+
61
/// Generic fallback: shapes that are not a ConvProblemShape are assumed to
/// already be in MNKL form and are returned unchanged.
template <class ProblemShape>
CUTLASS_HOST_DEVICE
constexpr auto
get_transformed_problem_shape_MNKL(ProblemShape const& problem_shape) {
  return problem_shape;
}
67
+
68
+
69
+ template <conv::Operator ConvOp, int SpatialDim>
70
+ CUTLASS_HOST_DEVICE
71
+ constexpr auto
72
+ get_transformed_problem_shape_MNKL(ConvProblemShape<ConvOp, SpatialDim> const& problem_shape) {
73
+ using cute::insert;
74
+ using cute::make_shape;
75
+ using cute::reverse;
76
+ using cute::take;
77
+
78
+ constexpr int RankT = SpatialDim + 2;
79
+
80
+ if constexpr (ConvOp == conv::Operator::kWgrad) {
81
+ auto M_xformed = problem_shape.shape_C[0];
82
+ auto N_xformed = reverse(take<1, RankT>(problem_shape.shape_C));
83
+ auto K_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_A));
84
+ auto L_xformed = cute::Int<1>{};
85
+
86
+ return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
87
+ }
88
+ else if constexpr (ConvOp == conv::Operator::kFprop){
89
+ auto M_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_C));
90
+ auto N_xformed = problem_shape.shape_C[RankT - 1];
91
+ auto K_xformed = reverse(take<1, RankT>(problem_shape.shape_B));
92
+ auto L_xformed = cute::Int<1>{};
93
+
94
+ return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
95
+ }
96
+ else if constexpr (ConvOp == conv::Operator::kDgrad) {
97
+ auto L_xformed = reverse(problem_shape.traversal_stride); // (V,U,O)
98
+ auto M_xformed = ceil_div(reverse(take<0,RankT - 1>(problem_shape.shape_C)), L_xformed);
99
+ auto N_xformed = problem_shape.shape_C[RankT - 1];
100
+ // shape_B: [K,T,R,S,C], K_xformed: [K,S,R,T]
101
+ auto K_xformed = insert<0>(
102
+ (reverse(take<1,RankT - 1>(problem_shape.shape_B))),
103
+ problem_shape.shape_B[0]);
104
+
105
+ return make_shape(M_xformed, N_xformed, K_xformed, L_xformed);
106
+ }
107
+ }
108
+
109
// Assuming im2col linearization
// Get problem shape MNKL according to following table:
// |         | Fprop     | Dgrad                 | Wgrad     |
// | ----    | --------- | --------------------- | --------- |
// | Shape_M | (Q*P*Z*N) | ([W/V]*[H/U]*[D/O]*N) | (K)       |
// | Shape_N | (K)       | (C)                   | (C,S,R,T) |
// | Shape_K | (C,S,R,T) | (K,S,R,T)             | (Q*P*Z*N) |
// | Shape_L | _1        | (V*U*O)               | _1        |
template <conv::Operator ConvOp, int SpatialDim>
CUTLASS_HOST_DEVICE
constexpr auto
get_linearized_problem_shape_MNKL(ConvProblemShape<ConvOp, SpatialDim> const& problem_shape) {

  // Start from the hierarchical MNKL view, then collapse the modes the table
  // above calls for as products; the remaining modes stay hierarchical.
  auto [M, N, K, L] = get_transformed_problem_shape_MNKL(problem_shape);

  if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) {
    return cute::make_shape(cute::product(M), N, K, cute::product(L));
  }
  else if constexpr (ConvOp == conv::Operator::kWgrad) {
    return cute::make_shape(M, N, cute::product(K), L);
  }

  // NOTE(review): operators not covered above (e.g. kDeconv) would fall off the
  // end and deduce a void return — confirm callers never instantiate this for
  // operators outside {kFprop, kDgrad, kWgrad}.
}
132
+
133
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
134
+
135
+ } // namespace cutlass::conv::detail
136
+
137
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ // common
34
+ #include "cutlass/arch/mma.h"
35
+ #include "cutlass/cutlass.h"
36
+ #include "cutlass/arch/mma.h"
37
+ #include "cutlass/trace.h"
38
+ #include "cutlass/cluster_launch.hpp"
39
+ #include "cutlass/device_kernel.h"
40
+
41
+ #include "cutlass/conv/kernel/conv_universal.hpp"
42
+ #include "cutlass/gemm/gemm.h"
43
+ #include "cutlass/detail/layout.hpp"
44
+ #include "cutlass/cuda_host_adapter.hpp"
45
+
46
+ ////////////////////////////////////////////////////////////////////////////////
47
+
48
+ namespace cutlass::conv::device {
49
+
50
+ ////////////////////////////////////////////////////////////////////////////////
51
+
52
/*!
  ConvUniversalAdapter is a stateful, reusable handle built around a kernel
  of type cutlass::conv::kernel::ConvUniversal.

  It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs
  to create it from the host facing arguments. For power users, static methods
  are exposed that bypass the stateful methods or args->params lowering.
*/
template <class ConvKernel_>
class ConvUniversalAdapter
{
public:
  using ConvKernel = GetUnderlyingKernel_t<ConvKernel_>;
  using TileShape = typename ConvKernel::TileShape;
  using ElementA = typename ConvKernel::ElementA;
  using ElementB = typename ConvKernel::ElementB;
  using ElementC = typename ConvKernel::ElementC;
  using ElementD = typename ConvKernel::ElementD;
  using ElementAccumulator = typename ConvKernel::TiledMma::ValTypeC;
  using DispatchPolicy = typename ConvKernel::DispatchPolicy;
  using CollectiveMainloop = typename ConvKernel::CollectiveMainloop;
  using CollectiveEpilogue = typename ConvKernel::CollectiveEpilogue;

  // Compile-time switch: when set, every launch must go through a caller-provided
  // CudaHostAdapter (see initialize()/run() below, which assert on it).
  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;

  // Tease out meta-information about the conv algorithm
  static constexpr conv::Operator kConvolutionalOperator = DispatchPolicy::ConvOp;
  static constexpr int NumSpatialDimensions = CollectiveMainloop::NumSpatialDimensions;

  // If our TiledMMA's instruction thread layout size is larger than 1, we know its a tensorop!
  using OperatorClass = cute::conditional_t<
    (cute::size(typename ConvKernel::TiledMma::AtomThrID{}) > 1),
    cutlass::arch::OpClassTensorOp, cutlass::arch::OpClassSimt>;

  using ArchTag = typename ConvKernel::ArchTag;

  // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape
  using ThreadblockShape = cutlass::gemm::GemmShape<
    cute::size<0>(TileShape{}),
    cute::size<1>(TileShape{}),
    cute::size<2>(TileShape{})>;

  using ClusterShape = cutlass::gemm::GemmShape<
    cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
    cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
    cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{})>;

  // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape
  using InstructionShape = cutlass::gemm::GemmShape<
    cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
    cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}),
    cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>;

  // Legacy: provide a correct warp count, but no reliable warp shape
  static int const kThreadCount = ConvKernel::MaxThreadsPerBlock;

  // Warp shape is not a primary API type in 3.x
  // But we can best approximate it by inspecting the TiledMma
  // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K
  // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads
  static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename ConvKernel::TiledMma{})) / 32);
  static constexpr int WarpsInMmaM = 4;
  static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM);
  using WarpCount = cutlass::gemm::GemmShape<WarpsInMmaM, WarpsInMmaN, 1>;
  using WarpShape = cutlass::gemm::GemmShape<
    CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM,
    CUTE_STATIC_V(cute::tile_size<1>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaN,
    CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>;

  static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages;

  // Inspect TiledCopy for A and B to compute the alignment size
  static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
    typename CollectiveMainloop::GmemTiledCopyA, ElementA>();
  static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
    typename CollectiveMainloop::GmemTiledCopyB, ElementB>();
  static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
    typename CollectiveEpilogue::GmemTiledCopyC, ElementC>();
  static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy<
    typename CollectiveEpilogue::GmemTiledCopyD, ElementD>();

  using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp;

  /// Argument structure: User API
  using Arguments = typename ConvKernel::Arguments;
  /// Argument structure: Kernel API
  using Params = typename ConvKernel::Params;

private:

  /// Kernel API parameters object (lowered from Arguments by initialize()/update())
  Params params_;

public:

  /// Access the Params structure
  Params const& params() const {
    return params_;
  }

  /// Determines whether the conv can execute the given problem.
  /// Returns kSuccess if the kernel accepts the arguments, kInvalid otherwise.
  static Status
  can_implement(Arguments const& args) {
    if (ConvKernel::can_implement(args)) {
      return Status::kSuccess;
    }
    else {
      return Status::kInvalid;
    }
  }

  /// Gets the workspace size, in bytes, required for these arguments.
  static size_t
  get_workspace_size(Arguments const& args) {
    size_t workspace_bytes = 0;
    CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes);

    workspace_bytes += ConvKernel::get_workspace_size(args);
    return workspace_bytes;
  }

  /// Computes the grid shape from user-facing arguments.
  /// Note: lowers args to Params internally, so this is not free.
  static dim3
  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
    auto tmp_params = ConvKernel::to_underlying_arguments(args, workspace);
    return ConvKernel::get_grid_shape(tmp_params);
  }

  /// Computes the grid shape from an already-lowered Params struct.
  static dim3
  get_grid_shape(Params const& params) {
    return ConvKernel::get_grid_shape(params);
  }

  /// Computes the maximum number of active blocks per multiprocessor.
  /// Returns -1 on any CUDA API failure (the error state is cleared first).
  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
    CUTLASS_TRACE_HOST("ConvUniversal::maximum_active_blocks()");
    int max_active_blocks = -1;
    int smem_size = ConvKernel::SharedStorageSize;

    // first, account for dynamic smem capacity if needed
    cudaError_t result;
    if (smem_size >= (48 << 10)) {
      CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size);
      result = cudaFuncSetAttribute(
        device_kernel<ConvKernel>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        smem_size);
      if (cudaSuccess != result) {
        result = cudaGetLastError(); // to clear the error bit
        CUTLASS_TRACE_HOST(
          " cudaFuncSetAttribute() returned error: "
          << cudaGetErrorString(result));
        return -1;
      }
    }

    // query occupancy after setting smem size
    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks,
      device_kernel<ConvKernel>,
      ConvKernel::MaxThreadsPerBlock,
      smem_size);

    if (cudaSuccess != result) {
      result = cudaGetLastError(); // to clear the error bit
      CUTLASS_TRACE_HOST(
        " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
        << cudaGetErrorString(result));
      return -1;
    }

    CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks);
    return max_active_blocks;
  }

  /// Initializes conv state from arguments: sets up the kernel workspace,
  /// lowers Arguments to the internal Params, and (unless a CudaHostAdapter
  /// is mandated at compile time) configures dynamic shared memory.
  Status
  initialize(
    Arguments const& args,
    void* workspace = nullptr,
    cudaStream_t stream = nullptr,
    CudaHostAdapter *cuda_adapter = nullptr) {

    CUTLASS_TRACE_HOST("ConvUniversal::initialize() - workspace "
      << workspace << ", stream: " << (stream ? "non-null" : "null"));

    // Initialize the workspace
    Status status = ConvKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
    if (status != Status::kSuccess) {
      return status;
    }

    // Initialize the Params structure
    params_ = ConvKernel::to_underlying_arguments(args, workspace);

    // Don't set the function attributes - require the CudaHostAdapter to set it.
    if constexpr (kEnableCudaHostAdapter) {
      CUTLASS_ASSERT(cuda_adapter);
      return Status::kSuccess;
    }
    else {
      // account for dynamic smem capacity if needed
      int smem_size = ConvKernel::SharedStorageSize;
      if (smem_size >= (48 << 10)) {
        CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size);
        cudaError_t result = cudaFuncSetAttribute(
          device_kernel<ConvKernel>,
          cudaFuncAttributeMaxDynamicSharedMemorySize,
          smem_size);
        if (cudaSuccess != result) {
          result = cudaGetLastError(); // to clear the error bit
          CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
          return Status::kErrorInternal;
        }
      }
    }
    return Status::kSuccess;
  }

  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
  /// (It re-runs the full args->params lowering.)
  Status
  update(Arguments const& args, void* workspace = nullptr) {
    CUTLASS_TRACE_HOST("ConvUniversal()::update() - workspace: " << workspace);

    size_t workspace_bytes = get_workspace_size(args);
    if (workspace_bytes > 0 && nullptr == workspace) {
      return Status::kErrorWorkspaceNull;
    }

    params_ = ConvKernel::to_underlying_arguments(args, workspace);
    return Status::kSuccess;
  }

  /// Primary run() entry point API that is static allowing users to create and manage their own params.
  /// Supplied params struct must be constructed by calling ConvKernel::to_underlying_arguments()
  static Status
  run(Params& params, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
    CUTLASS_TRACE_HOST("ConvUniversal::run()");
    dim3 const block = ConvKernel::get_block_shape();
    dim3 const grid = get_grid_shape(params);

    // configure smem size and carveout
    int smem_size = ConvKernel::SharedStorageSize;

    // NOTE(review): launch_result is only assigned on the paths below; on an
    // ArchTag outside the handled compute capabilities it is read
    // uninitialized at the end of this function — confirm all supported
    // ArchTags reach an assignment.
    Status launch_result;
    // Use extended launch API only for mainloops that use it
    if constexpr (ConvKernel::ArchTag::kMinComputeCapability >= 90) {
      [[maybe_unused]] constexpr bool is_static_1x1x1 =
        cute::is_static_v<typename ConvKernel::DispatchPolicy::ClusterShape> and
        cute::size(typename ConvKernel::DispatchPolicy::ClusterShape{}) == 1;
      dim3 cluster(cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
                   cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}),
                   cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{}));
      // Dynamic cluster support
      [[maybe_unused]] dim3 fallback_cluster = dim3{0,0,0};
      if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 100 ||
                    ConvKernel::ArchTag::kMinComputeCapability == 101) {
        if constexpr (!cute::is_static_v<typename ConvKernel::DispatchPolicy::ClusterShape>) {
          // Runtime-chosen cluster shape (with fallback) overrides the static one.
          fallback_cluster = params.hw_info.cluster_shape_fallback;
          cluster = params.hw_info.cluster_shape;
        }
      }

      void* kernel_params[] = {&params};
      if constexpr (kEnableCudaHostAdapter) {
        //
        // Use the cuda host adapter
        //
        CUTLASS_ASSERT(cuda_adapter);
        if (cuda_adapter) {

          launch_result = cuda_adapter->launch(grid,
                                               cluster,
                                               fallback_cluster,
                                               block,
                                               smem_size,
                                               stream,
                                               kernel_params,
                                               kernel_index);
        }
        else {
          return Status::kErrorInternal;
        }
      }
      else {
        CUTLASS_ASSERT(cuda_adapter == nullptr);
        void const* kernel = (void const*) device_kernel<ConvKernel>;
        if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 90
          || ConvKernel::ArchTag::kMinComputeCapability == 100
        ) {
          if constexpr (is_static_1x1x1) {
            // Trivial 1x1x1 cluster: a plain triple-chevron launch suffices.
            device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params);
            launch_result = Status::kSuccess;
          }
          else {
            launch_result = ClusterLauncher::launch(
              grid, cluster, block, smem_size, stream, kernel, kernel_params);
          }
        }
        else {
          if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 100 ||
                        ConvKernel::ArchTag::kMinComputeCapability == 101) {
            launch_result = ClusterLauncher::launch_with_fallback_cluster(
              grid,
              cluster,
              fallback_cluster,
              block,
              smem_size,
              stream,
              kernel,
              kernel_params);
          }
        }
      }
    }
    else {
      launch_result = Status::kSuccess;

      if constexpr (kEnableCudaHostAdapter) {
        CUTLASS_ASSERT(cuda_adapter);
        if (cuda_adapter) {
          void* kernel_params[] = {&params};

          launch_result = cuda_adapter->launch(
            grid, block, smem_size, stream, kernel_params, 0
          );

        }
        else {
          return Status::kErrorInternal;
        }
      }
      else {
        CUTLASS_ASSERT(cuda_adapter == nullptr);
        device_kernel<ConvKernel><<<grid, block, smem_size, stream>>>(params);
      }
    }

    // cudaGetLastError both reports and clears any launch error raised above.
    cudaError_t result = cudaGetLastError();
    if (cudaSuccess == result && Status::kSuccess == launch_result) {
      return Status::kSuccess;
    }
    else {
      CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result);
      return Status::kErrorInternal;
    }
  }

  //
  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
  //

  /// Launches the kernel after first constructing Params internal state from supplied arguments.
  Status
  run(
    Arguments const& args,
    void* workspace = nullptr,
    cudaStream_t stream = nullptr,
    CudaHostAdapter *cuda_adapter = nullptr,
    int32_t kernel_index = 0
  ) {
    Status status = initialize(args, workspace, stream, cuda_adapter);
    if (Status::kSuccess == status) {
      status = run(params_, stream, cuda_adapter, kernel_index);
    }
    return status;
  }

  /// Launches the kernel after first constructing Params internal state from supplied arguments.
  Status
  operator()(
    Arguments const& args,
    void* workspace = nullptr,
    cudaStream_t stream = nullptr,
    CudaHostAdapter *cuda_adapter = nullptr) {
    return run(args, workspace, stream, cuda_adapter);
  }

  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
  Status
  run(cudaStream_t stream = nullptr) {
    return run(params_, stream);
  }

  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
  Status
  operator()(cudaStream_t stream = nullptr) {
    return run(params_, stream);
  }
};
443
+
444
+ ////////////////////////////////////////////////////////////////////////////////
445
+
446
+ } // namespace cutlass::conv::device
447
+
448
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/direct_convolution.h ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Template for device-level Depthwise Convolution
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <limits>
38
+
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/device_kernel.h"
41
+ #include "cutlass/conv/convolution.h"
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ namespace cutlass {
46
+ namespace conv {
47
+ namespace device {
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
/// Device-level handle for launching a depthwise (direct) convolution kernel.
///
/// Wraps a kernel-level template (DirectConvolutionKernel_) with the usual
/// CUTLASS device-API surface: can_implement() / get_workspace_size() /
/// initialize() / update() / run(). Unlike the implicit-GEMM handles, run()
/// may launch TWO kernels: an optional filter-reorder kernel followed by the
/// main convolution kernel.
template<typename DirectConvolutionKernel_>
class DirectConvolution {
public:

  using UnderlyingKernel = DirectConvolutionKernel_;

  // Re-exported kernel traits so callers can introspect the configuration
  // without naming the kernel type directly.
  using ElementA = typename UnderlyingKernel::ElementA;
  using LayoutA = typename UnderlyingKernel::LayoutA;
  using ElementB = typename UnderlyingKernel::ElementB;
  using LayoutB = typename UnderlyingKernel::LayoutB;
  using ElementC = typename UnderlyingKernel::ElementC;
  using LayoutC = typename UnderlyingKernel::LayoutC;
  using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator;
  using ElementCompute = typename UnderlyingKernel::ElementCompute;
  using OperatorClass = typename UnderlyingKernel::OperatorClass;
  using ArchTag = typename UnderlyingKernel::ArchTag;
  using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape;
  using WarpShape = typename UnderlyingKernel::WarpShape;
  using InstructionShape = typename UnderlyingKernel::InstructionShape;
  using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle;
  using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp;
  static int const kStages = UnderlyingKernel::kStages;
  static int const kConvDim = UnderlyingKernel::kConvDim;
  using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator;
  using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator;
  using MathOperator = typename UnderlyingKernel::MathOperator;

  static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator;
  static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm;
  static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport;
  static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode;

  // Warps per threadblock; run() launches blocks of (32 * kWarpCount) threads.
  static int const kWarpCount =
    (ThreadblockShape::kM / WarpShape::kM) *
    (ThreadblockShape::kN / WarpShape::kN) *
    (ThreadblockShape::kK / WarpShape::kK);

  /// Argument structure
  using Arguments = typename UnderlyingKernel::Arguments;

  // Auxiliary kernel that reorders the filter tensor before the main launch.
  using ReorderKernel = typename UnderlyingKernel::ReorderKernel;

private:

  /// Kernel parameters object
  typename UnderlyingKernel::Params params_;

public:

  /// Constructs Implicit GEMM
  DirectConvolution() { }

  /// Determines whether the Implicit GEMM can execute the given problem.
  ///
  /// Checks, in order: iterator support, depthwise group mode, group-count
  /// divisibility, epilogue alignment, and CUDA grid-dimension limits.
  /// Returns kSuccess only if every check passes.
  static Status can_implement(Arguments const &args) {

    // dispatch to iterators
    Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size);
    if (Status::kSuccess != status) {
      return status;
    }

    status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size);
    if (Status::kSuccess != status) {
      return status;
    }

    // This handle only supports depthwise convolution.
    if (kGroupMode != conv::GroupMode::kDepthwise) {
      return Status::kErrorInvalidProblem;
    }

    // C and K should be multiple of groups
    if (args.problem_size.K != args.problem_size.groups &&
        args.problem_size.C != args.problem_size.groups) {
      return Status::kErrorInvalidProblem;
    }


    // Output channel dimension must be aligned to the epilogue's vector width.
    static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess;
    if (kConvolutionalOperator == conv::Operator::kFprop) {
      if (args.problem_size.K % kAlignmentC)
        return Status::kErrorMisalignedOperand;
    } else if (kConvolutionalOperator == conv::Operator::kDgrad) {
      if (args.problem_size.C % kAlignmentC)
        return Status::kErrorMisalignedOperand;
    } else if (kConvolutionalOperator == conv::Operator::kWgrad) {
      if (args.problem_size.C % kAlignmentC)
        return Status::kErrorMisalignedOperand;
    }

    // Determine grid shape
    ThreadblockSwizzle threadblock_swizzle;

    dim3 grid = threadblock_swizzle.get_grid_shape(
      threadblock_swizzle.get_tiled_shape(
        kConvolutionalOperator,
        args.problem_size,
        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
        args.problem_size.split_k_slices));

    // CUDA limits gridDim.y and gridDim.z to 65535; reject larger launches.
    if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
          grid.z <= std::numeric_limits<uint16_t>::max())) {

      return Status::kErrorInvalidProblem;
    }

    return Status::kSuccess;
  }

  /// Gets the workspace size
  ///
  /// This kernel requires no device workspace, so the size is always zero.
  static size_t get_workspace_size(Arguments const &args) {
    return 0;
  }

  /// Initializes GEMM state from arguments.
  ///
  /// Builds the internal Params from `args` and, if the kernel's static shared
  /// storage exceeds the default 48 KiB opt-in limit, raises the kernel's
  /// dynamic shared-memory attribute. `stream` is accepted for API symmetry
  /// but not used here.
  Status initialize(
    Arguments const &args,
    void *workspace = nullptr,
    cudaStream_t stream = nullptr) {

    // initialize the params structure from the arguments
    params_ = typename UnderlyingKernel::Params(
      args,
      static_cast<int *>(workspace)
    );

    int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));

    // Shared memory above 48 KiB requires an explicit opt-in per kernel.
    if (smem_size >= (48 << 10)) {
      cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>,
                                                cudaFuncAttributeMaxDynamicSharedMemorySize,
                                                smem_size);

      if (result != cudaSuccess) {
        return Status::kErrorInternal;
      }
    }

    return Status::kSuccess;
  }

  /// Initializes GEMM state from arguments.
  ///
  /// Cheap re-initialization path: only swaps tensor pointers, the epilogue
  /// functor, and the semaphore workspace, leaving the rest of Params intact.
  Status update(Arguments const &args, void *workspace = nullptr) {

    // update the params structure from the arguments
    params_.ptr_A = args.ref_A.data();
    params_.ptr_B = args.ref_B.data();
    params_.ptr_C = args.ref_C.data();
    params_.ptr_D = args.ref_D.data();
    params_.output_op = args.output_op;
    params_.ptr_reordered_B = args.ref_reordered_B.data();
    params_.semaphore = static_cast<int *>(workspace);

    return Status::kSuccess;
  }

  /// Runs the kernel using initialized state.
  ///
  /// Optionally launches the filter-reorder kernel first (when a reordered-B
  /// buffer is provided), then launches the main convolution kernel with a
  /// dynamic shared-memory size computed from the params. Both launches go to
  /// the same stream, so they are implicitly ordered.
  Status run(cudaStream_t stream = nullptr) {

    // Launch reorder kernel
    if (params_.ptr_reordered_B != nullptr) {
      dim3 grid = ReorderKernel::get_grid_shape(params_);
      dim3 block = ReorderKernel::get_block_shape();

      cutlass::arch::synclog_setup();
      cutlass::Kernel<ReorderKernel><<<grid, block, 0, stream>>>(params_);
    }

    // Launch main kernel
    ThreadblockSwizzle threadblock_swizzle;

    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
    dim3 block(32 * kWarpCount, 1, 1);

    // Dynamic SMEM size based on input params.
    int smem_size = int(params_.get_smem_size());

    // Make sure we can use that much shared memory.
    cudaError_t status =
        cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
    if (status != cudaSuccess)
      return Status::kErrorInternal;

    cutlass::arch::synclog_setup();
    cutlass::Kernel<UnderlyingKernel><<<grid, block, smem_size, stream>>>(params_);

    // cudaGetLastError reports failures from either launch above.
    cudaError_t result = cudaGetLastError();

    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
  }

  /// Runs the kernel using initialized state.
  Status operator()(cudaStream_t stream = nullptr) {
    return run(stream);
  }

  /// Runs the kernel using initialized state.
  ///
  /// Convenience overload: initialize() followed by run().
  Status operator()(
    Arguments const &args,
    void *workspace = nullptr,
    cudaStream_t stream = nullptr) {

    Status status = initialize(args, workspace, stream);

    if (status == Status::kSuccess) {
      status = run(stream);
    }

    return status;
  }

  /// Returns the dynamic shared-memory size (bytes) the main kernel will use.
  int get_smem_size() { return int(params_.get_smem_size()); }
};
263
+
264
+ /////////////////////////////////////////////////////////////////////////////////////////////////
265
+
266
+ }
267
+ }
268
+ }
269
+
270
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Template for device-level Implicit GEMM Convolution
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <limits>
38
+
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/device_kernel.h"
41
+ #include "cutlass/conv/convolution.h"
42
+ #include "cutlass/cuda_host_adapter.hpp"
43
+
44
+ /////////////////////////////////////////////////////////////////////////////////////////////////
45
+
46
+ namespace cutlass {
47
+ namespace conv {
48
+ namespace device {
49
+
50
+ /////////////////////////////////////////////////////////////////////////////////////////////////
51
+
52
+ template<typename ImplicitGemmKernel_>
53
+ class ImplicitGemmConvolution {
54
+ public:
55
+
56
+ using UnderlyingKernel = GetUnderlyingKernel_t<ImplicitGemmKernel_>;
57
+
58
+ using ElementA = typename UnderlyingKernel::ElementA;
59
+ using LayoutA = typename UnderlyingKernel::LayoutA;
60
+ using ElementB = typename UnderlyingKernel::ElementB;
61
+ using LayoutB = typename UnderlyingKernel::LayoutB;
62
+ using ElementC = typename UnderlyingKernel::ElementC;
63
+ using LayoutC = typename UnderlyingKernel::LayoutC;
64
+ using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator;
65
+ using ElementCompute = typename UnderlyingKernel::ElementCompute;
66
+ using OperatorClass = typename UnderlyingKernel::OperatorClass;
67
+ using ArchTag = typename UnderlyingKernel::ArchTag;
68
+ using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape;
69
+ using WarpShape = typename UnderlyingKernel::WarpShape;
70
+ using InstructionShape = typename UnderlyingKernel::InstructionShape;
71
+ using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle;
72
+ using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp;
73
+ static int const kStages = UnderlyingKernel::kStages;
74
+ static int const kConvDim = UnderlyingKernel::kConvDim;
75
+ using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator;
76
+ using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator;
77
+ using MathOperator = typename UnderlyingKernel::MathOperator;
78
+
79
+ static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator;
80
+ static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm;
81
+ static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport;
82
+ static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode;
83
+
84
+ static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
85
+
86
+ static int const kWarpCount =
87
+ (ThreadblockShape::kM / WarpShape::kM) *
88
+ (ThreadblockShape::kN / WarpShape::kN) *
89
+ (ThreadblockShape::kK / WarpShape::kK);
90
+
91
+ /// Argument structure
92
+ using Arguments = typename UnderlyingKernel::Arguments;
93
+
94
+ private:
95
+
96
+ /// Kernel parameters object
97
+ typename UnderlyingKernel::Params params_;
98
+
99
+ public:
100
+
101
+ /// Constructs Implicit GEMM
102
+ ImplicitGemmConvolution() { }
103
+
104
+ /// Determines whether the Implicit GEMM can execute the given problem.
105
+ static Status can_implement(Arguments const &args) {
106
+ // dispatch to iterators
107
+ Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size);
108
+ if (Status::kSuccess != status) {
109
+ return status;
110
+ }
111
+
112
+ status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size);
113
+ if (Status::kSuccess != status) {
114
+ return status;
115
+ }
116
+
117
+ // Check that tensor sizes don't exceed maximum supported size
118
+ if (kConvolutionalOperator == conv::Operator::kFprop) {
119
+ if (args.problem_size.activation_size() * sizeof(ElementA) >=
120
+ (1ull << 31) ||
121
+ args.problem_size.filter_size() * sizeof(ElementB) >= (1ull << 31) ||
122
+ args.problem_size.output_size() * sizeof(ElementC) >= (1ull << 31)) {
123
+ return Status::kErrorInvalidProblem;
124
+ }
125
+ }
126
+ else if (kConvolutionalOperator == conv::Operator::kDgrad ||
127
+ kConvolutionalOperator == conv::Operator::kDeconv) {
128
+ if (args.problem_size.activation_size() * sizeof(ElementC) >=
129
+ (1ull << 31) ||
130
+ args.problem_size.filter_size() * sizeof(ElementB) >= (1ull << 31) ||
131
+ args.problem_size.output_size() * sizeof(ElementA) >= (1ull << 31)) {
132
+ return Status::kErrorInvalidProblem;
133
+ }
134
+ }
135
+ else if (kConvolutionalOperator == conv::Operator::kWgrad) {
136
+ if (args.problem_size.activation_size() * sizeof(ElementB) >=
137
+ (1ull << 31) ||
138
+ args.problem_size.filter_size() * sizeof(ElementC) >= (1ull << 31) ||
139
+ args.problem_size.output_size() * sizeof(ElementA) >= (1ull << 31)) {
140
+ return Status::kErrorInvalidProblem;
141
+ }
142
+ }
143
+
144
+ // check group conv constraint
145
+ if (args.problem_size.groups != 1) {
146
+ if (kGroupMode == conv::GroupMode::kNone) {
147
+ return Status::kErrorInvalidProblem;
148
+ }
149
+
150
+ // C and K should be multiple of groups
151
+ if (args.problem_size.K % args.problem_size.groups ||
152
+ args.problem_size.C % args.problem_size.groups) {
153
+ return Status::kErrorInvalidProblem;
154
+ }
155
+
156
+ // split-k is not supported
157
+ if (args.problem_size.split_k_slices != 1) {
158
+ return Status::kErrorInvalidProblem;
159
+ }
160
+
161
+ int k_per_group = args.problem_size.K / args.problem_size.groups;
162
+ // k_per_group should be multiple of ThreadblockShape N, one CTA calculate one group
163
+ if (kGroupMode == conv::GroupMode::kSingleGroup && k_per_group % ThreadblockShape::kN) {
164
+ return Status::kErrorInvalidProblem;
165
+ }
166
+ // ThreadblockShape::kN should be divisible by k_per_group, one CTA calculate multiple groups
167
+ if (kGroupMode == conv::GroupMode::kMultipleGroup && ThreadblockShape::kN % k_per_group) {
168
+ return Status::kErrorInvalidProblem;
169
+ }
170
+
171
+ // current optimized iterator algo only supports SingleGroup mode
172
+ if (kIteratorAlgorithm == IteratorAlgorithm::kOptimized &&
173
+ kGroupMode != conv::GroupMode::kSingleGroup) {
174
+ return Status::kErrorInvalidProblem;
175
+ }
176
+ }
177
+
178
+ static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess;
179
+ if (kConvolutionalOperator == conv::Operator::kFprop) {
180
+ if (args.problem_size.K % kAlignmentC)
181
+ return Status::kErrorMisalignedOperand;
182
+ } else if (kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) {
183
+ if (args.problem_size.C % kAlignmentC)
184
+ return Status::kErrorMisalignedOperand;
185
+ } else if (kConvolutionalOperator == conv::Operator::kWgrad) {
186
+ if (args.problem_size.C % kAlignmentC)
187
+ return Status::kErrorMisalignedOperand;
188
+ }
189
+
190
+ // check for unsupported problem sizes for strided dgrad / deconv implementation
191
+ if ((kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) &&
192
+ kStrideSupport == conv::StrideSupport::kStrided) {
193
+ // split-k (serial or parallel) is not supported for strided dgrad / deconv
194
+ if(args.problem_size.split_k_slices > 1 && (args.problem_size.stride().at(args.problem_size.stride().max_dim_index()) > 1)) {
195
+ return Status::kErrorNotSupported;
196
+ }
197
+
198
+ // dilation > {1x1} is not supported for strided dgrad / deconv
199
+ if(args.problem_size.dilation_h > 1 || args.problem_size.dilation_w > 1) {
200
+ return Status::kErrorNotSupported;
201
+ }
202
+ }
203
+
204
+ // Determine grid shape
205
+ ThreadblockSwizzle threadblock_swizzle;
206
+
207
+ dim3 grid = threadblock_swizzle.get_grid_shape(
208
+ threadblock_swizzle.get_tiled_shape(
209
+ kConvolutionalOperator,
210
+ args.problem_size,
211
+ {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
212
+ args.problem_size.split_k_slices));
213
+
214
+ if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
215
+ grid.z <= std::numeric_limits<uint16_t>::max())) {
216
+
217
+ return Status::kErrorInvalidProblem;
218
+ }
219
+
220
+ return Status::kSuccess;
221
+ }
222
+
223
+ /// Gets the workspace size
224
+ static size_t get_workspace_size(Arguments const &args) {
225
+
226
+ size_t workspace_bytes = 0;
227
+
228
+ // Determine grid shape
229
+ ThreadblockSwizzle threadblock_swizzle;
230
+
231
+ cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
232
+ kConvolutionalOperator,
233
+ args.problem_size,
234
+ {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
235
+ args.problem_size.split_k_slices);
236
+
237
+ if(args.split_k_mode == SplitKMode::kParallel) {
238
+
239
+ // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
240
+ // The user needs to call a reduction operator to optain the final output tensor
241
+ workspace_bytes =
242
+ sizeof(ElementAccumulator) *
243
+ size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) *
244
+ size_t(grid_tiled_shape.k());
245
+ }
246
+
247
+ else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) {
248
+
249
+ // Split-K serial: The user workspace is used to store semaphore and serialize writing the
250
+ // final reduced output to user's output tensor
251
+ workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
252
+ }
253
+
254
+ return workspace_bytes;
255
+ }
256
+
257
+ /// Initializes GEMM state from arguments.
258
+ Status initialize(
259
+ Arguments const &args,
260
+ void *workspace = nullptr,
261
+ cudaStream_t stream = nullptr,
262
+ CudaHostAdapter *cuda_adapter = nullptr) {
263
+
264
+ if (args.problem_size.split_k_slices > 1) {
265
+
266
+ if (!workspace) {
267
+ return Status::kErrorWorkspaceNull;
268
+ }
269
+
270
+ cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
271
+
272
+ if (status != cudaSuccess) {
273
+ return Status::kErrorInternal;
274
+ }
275
+ }
276
+
277
+ // initialize the params structure from the arguments
278
+ params_ = typename UnderlyingKernel::Params(
279
+ args,
280
+ static_cast<int *>(workspace)
281
+ );
282
+
283
+ if constexpr (kEnableCudaHostAdapter) {
284
+ CUTLASS_ASSERT(cuda_adapter);
285
+ return Status::kSuccess;
286
+ }
287
+ else {
288
+ int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));
289
+
290
+ if (smem_size >= (48 << 10)) {
291
+ cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<UnderlyingKernel>,
292
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
293
+ smem_size);
294
+
295
+ if (result != cudaSuccess) {
296
+ return Status::kErrorInternal;
297
+ }
298
+ }
299
+ }
300
+
301
+ return Status::kSuccess;
302
+ }
303
+
304
+ /// Initializes GEMM state from arguments.
305
+ Status update(Arguments const &args, void *workspace = nullptr) {
306
+
307
+ // update the params structure from the arguments
308
+ params_.ptr_A = args.ref_A.data();
309
+ params_.ptr_B = args.ref_B.data();
310
+ params_.ptr_C = args.ref_C.data();
311
+ params_.ptr_D = args.ref_D.data();
312
+ params_.output_op = args.output_op;
313
+ params_.semaphore = static_cast<int *>(workspace);
314
+
315
+ return Status::kSuccess;
316
+ }
317
+
318
+ /// Runs the kernel using initialized state.
319
+ Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
320
+
321
+
322
+ ThreadblockSwizzle threadblock_swizzle;
323
+
324
+ dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
325
+ dim3 block(32 * kWarpCount, 1, 1);
326
+
327
+ int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage));
328
+ cutlass::Status launch_result = cutlass::Status::kSuccess ;
329
+
330
+ if constexpr (kEnableCudaHostAdapter) {
331
+ //
332
+ // Use the cuda host adapter
333
+ //
334
+ CUTLASS_ASSERT(cuda_adapter);
335
+ if (cuda_adapter) {
336
+
337
+ void* kernel_params[] = {&params_};
338
+ launch_result = cuda_adapter->launch(
339
+ grid, dim3(1,1,1), block, smem_size, stream, kernel_params, kernel_index
340
+ );
341
+ }
342
+ else {
343
+ launch_result = Status::kErrorInternal;
344
+ }
345
+ }
346
+ else {
347
+ cutlass::arch::synclog_setup();
348
+ cutlass::Kernel<UnderlyingKernel><<<grid, block, smem_size, stream>>>(params_);
349
+ }
350
+
351
+ cudaError_t result = cudaGetLastError();
352
+ if (cudaSuccess == result && Status::kSuccess == launch_result) {
353
+ return Status::kSuccess;
354
+ }
355
+ else {
356
+ CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result);
357
+ return Status::kErrorInternal;
358
+ }
359
+ }
360
+
361
+ /// Runs the kernel using initialized state.
362
+ Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
363
+ return run(stream, cuda_adapter, kernel_index);
364
+ }
365
+
366
+ /// Runs the kernel using initialized state.
367
+ Status operator()(
368
+ Arguments const &args,
369
+ void *workspace = nullptr,
370
+ cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
371
+
372
+ Status status = initialize(args, workspace, stream, cuda_adapter);
373
+
374
+ if (status == Status::kSuccess) {
375
+ status = run(stream, cuda_adapter, kernel_index);
376
+ }
377
+
378
+ return status;
379
+ }
380
+ };
381
+
382
+ /////////////////////////////////////////////////////////////////////////////////////////////////
383
+
384
+ }
385
+ }
386
+ }
387
+
388
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /* \file
32
+ \brief Template for device-level fused activation's scale+bias+relu and Implicit GEMM Convolution
33
+ */
34
+
35
+ #pragma once
36
+
37
+ #include <limits>
38
+
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/device_kernel.h"
41
+ #include "cutlass/conv/convolution.h"
42
+
43
+ /////////////////////////////////////////////////////////////////////////////////////////////////
44
+
45
+ namespace cutlass {
46
+ namespace conv {
47
+ namespace device {
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
+ template<typename ImplicitGemmFusionKernel_>
52
+ class ImplicitGemmConvolutionFusion {
53
+ public:
54
+
55
+ using ImplicitGemmFusionKernel = ImplicitGemmFusionKernel_;
56
+
57
+ using ElementA = typename ImplicitGemmFusionKernel::ElementA;
58
+ using LayoutA = typename ImplicitGemmFusionKernel::LayoutA;
59
+ using ElementB = typename ImplicitGemmFusionKernel::ElementB;
60
+ using LayoutB = typename ImplicitGemmFusionKernel::LayoutB;
61
+
62
+ // using ElementScaleBias = typename ImplicitGemmFusionKernel::ElementScaleBias;
63
+ // using LayoutScaleBias = typename ImplicitGemmFusionKernel::LayoutScaleBias;
64
+
65
+ using ElementC = typename ImplicitGemmFusionKernel::ElementC;
66
+ using LayoutC = typename ImplicitGemmFusionKernel::LayoutC;
67
+ using ElementAccumulator = typename ImplicitGemmFusionKernel::ElementAccumulator;
68
+ using ElementCompute = typename ImplicitGemmFusionKernel::ElementCompute;
69
+ using OperatorClass = typename ImplicitGemmFusionKernel::OperatorClass;
70
+ using ArchTag = typename ImplicitGemmFusionKernel::ArchTag;
71
+ using ThreadblockShape = typename ImplicitGemmFusionKernel::ThreadblockShape;
72
+ using WarpShape = typename ImplicitGemmFusionKernel::WarpShape;
73
+ using InstructionShape = typename ImplicitGemmFusionKernel::InstructionShape;
74
+ using ThreadblockSwizzle = typename ImplicitGemmFusionKernel::ThreadblockSwizzle;
75
+ using EpilogueOutputOp = typename ImplicitGemmFusionKernel::EpilogueOutputOp;
76
+ static int const kStages = ImplicitGemmFusionKernel::kStages;
77
+ static int const kConvDim = ImplicitGemmFusionKernel::kConvDim;
78
+ using WarpMmaOperator = typename ImplicitGemmFusionKernel::WarpMmaOperator;
79
+ using ArchMmaOperator = typename ImplicitGemmFusionKernel::ArchMmaOperator;
80
+ using MathOperator = typename ImplicitGemmFusionKernel::MathOperator;
81
+
82
+ static cutlass::conv::Operator const kConvolutionalOperator = ImplicitGemmFusionKernel::kConvolutionalOperator;
83
+ static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = ImplicitGemmFusionKernel::kIteratorAlgorithm;
84
+
85
+ static int const kWarpCount =
86
+ (ThreadblockShape::kM / WarpShape::kM) *
87
+ (ThreadblockShape::kN / WarpShape::kN) *
88
+ (ThreadblockShape::kK / WarpShape::kK);
89
+
90
+ /// Argument structure
91
+ using Arguments = typename ImplicitGemmFusionKernel::Arguments;
92
+
93
+ private:
94
+
95
+ /// Kernel parameters object
96
+ typename ImplicitGemmFusionKernel::Params params_;
97
+
98
+ public:
99
+
100
+ /// Constructs Implicit GEMM
101
+ ImplicitGemmConvolutionFusion() { }
102
+
103
+ /// Determines whether the Implicit GEMM can execute the given problem.
104
+ static Status can_implement(Arguments const &args) {
105
+
106
+ // dispatch to iterators
107
+ Status status = ImplicitGemmFusionKernel::Mma::IteratorA::can_implement(args.problem_size);
108
+ if (Status::kSuccess != status) {
109
+ return status;
110
+ }
111
+
112
+ status = ImplicitGemmFusionKernel::Mma::IteratorB::can_implement(args.problem_size);
113
+ if (Status::kSuccess != status) {
114
+ return status;
115
+ }
116
+
117
+ // Determine grid shape
118
+ ThreadblockSwizzle threadblock_swizzle;
119
+
120
+ dim3 grid = threadblock_swizzle.get_grid_shape(
121
+ threadblock_swizzle.get_tiled_shape(
122
+ cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size),
123
+ {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
124
+ args.problem_size.split_k_slices));
125
+
126
+ if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
127
+ grid.z <= std::numeric_limits<uint16_t>::max())) {
128
+
129
+ return Status::kErrorInvalidProblem;
130
+ }
131
+
132
+ return Status::kSuccess;
133
+ }
134
+
135
+ /// Gets the workspace size
136
+ static size_t get_workspace_size(Arguments const &args) {
137
+
138
+ size_t workspace_bytes = 0;
139
+
140
+ // Determine grid shape
141
+ ThreadblockSwizzle threadblock_swizzle;
142
+
143
+ cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
144
+ cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size),
145
+ {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
146
+ args.problem_size.split_k_slices);
147
+
148
+ if(args.split_k_mode == SplitKMode::kParallel) {
149
+
150
+ // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
151
+ // The user needs to call a reduction operator to optain the final output tensor
152
+ workspace_bytes =
153
+ sizeof(ElementAccumulator) *
154
+ size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) *
155
+ size_t(grid_tiled_shape.k());
156
+ }
157
+
158
+ else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) {
159
+
160
+ // Split-K serial: The user workspace is used to store semaphore and serialize writing the
161
+ // final reduced output to user's output tensor
162
+ workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
163
+ }
164
+
165
+ return workspace_bytes;
166
+ }
167
+
168
+ /// Initializes GEMM state from arguments.
169
+ Status initialize(
170
+ Arguments const &args,
171
+ void *workspace = nullptr,
172
+ cudaStream_t stream = nullptr) {
173
+
174
+ if (args.problem_size.split_k_slices > 1) {
175
+
176
+ if (!workspace) {
177
+ return Status::kErrorWorkspaceNull;
178
+ }
179
+
180
+ cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
181
+
182
+ if (status != cudaSuccess) {
183
+ return Status::kErrorInternal;
184
+ }
185
+ }
186
+
187
+ // initialize the params structure from the arguments
188
+ params_ = typename ImplicitGemmFusionKernel::Params(
189
+ args,
190
+ static_cast<int *>(workspace)
191
+ );
192
+
193
+ int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage));
194
+
195
+ if (smem_size >= (48 << 10)) {
196
+ cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<ImplicitGemmFusionKernel>,
197
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
198
+ smem_size);
199
+
200
+ if (result != cudaSuccess) {
201
+ return Status::kErrorInternal;
202
+ }
203
+ }
204
+
205
+ return Status::kSuccess;
206
+ }
207
+
208
+ /// Initializes Impicit GEMM state from arguments.
209
+ Status update(Arguments const &args, void *workspace = nullptr) {
210
+
211
+ // update the params structure from the arguments
212
+ params_.ptr_A = args.ref_A.data();
213
+ params_.ptr_B = args.ref_B.data();
214
+ params_.ptr_scale = args.ref_A_scale.data();
215
+ params_.ptr_bias = args.ref_A_bias.data();
216
+ params_.ptr_C = args.ref_C.data();
217
+ params_.ptr_D = args.ref_D.data();
218
+ params_.output_op = args.output_op;
219
+ params_.semaphore = static_cast<int *>(workspace);
220
+
221
+ return Status::kSuccess;
222
+ }
223
+
224
+ /// Runs the kernel using initialized state.
225
+ Status run(cudaStream_t stream = nullptr) {
226
+
227
+ ThreadblockSwizzle threadblock_swizzle;
228
+
229
+ dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
230
+ dim3 block(32 * kWarpCount, 1, 1);
231
+
232
+ int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage));
233
+
234
+ cutlass::arch::synclog_setup();
235
+ cutlass::Kernel<ImplicitGemmFusionKernel><<<grid, block, smem_size, stream>>>(params_);
236
+
237
+ cudaError_t result = cudaGetLastError();
238
+
239
+ return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
240
+ }
241
+
242
+ /// Runs the kernel using initialized state.
243
+ Status operator()(cudaStream_t stream = nullptr) {
244
+ return run(stream);
245
+ }
246
+
247
+ /// Runs the kernel using initialized state.
248
+ Status operator()(
249
+ Arguments const &args,
250
+ void *workspace = nullptr,
251
+ cudaStream_t stream = nullptr) {
252
+
253
+ Status status = initialize(args, workspace, stream);
254
+
255
+ if (status == Status::kSuccess) {
256
+ status = run(stream);
257
+ }
258
+
259
+ return status;
260
+ }
261
+ };
262
+
263
+ /////////////////////////////////////////////////////////////////////////////////////////////////
264
+
265
+ }
266
+ }
267
+ }
268
+
269
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/dispatch_policy.hpp ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include "cutlass/conv/convolution.h"
34
+ #include "cutlass/epilogue/thread/activation.h"
35
+ #include "cutlass/arch/arch.h"
36
+
37
+ #include "cute/layout.hpp"
38
+ #include "cute/numeric/integral_constant.hpp"
39
+
40
+ #include "cutlass/gemm/dispatch_policy.hpp"
41
+
42
+ //////////////////////////////////////////////////////////////////////////////
43
+
44
+ //////////////////////////////////////////////////////////////////////////////
45
+
46
+ namespace cutlass::conv {
47
+
48
+ //////////////////////////////////////////////////////////////////////////////
49
+
50
+ //
51
+ // Policies for categorical dispatch of mainloop against kernel grid schedules
52
+ //
53
+ struct KernelImplicitTmaWarpSpecializedSm90 : cutlass::gemm::KernelTmaWarpSpecialized { };
54
+ struct KernelImplicitTmaWarpSpecializedSm90Cooperative { };
55
+ struct KernelImplicitTmaWarpSpecializedSm90Pingpong { };
56
+
57
+ //
58
+ // Collective Mainloop Policies
59
+ //
60
+
61
+ // n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, static schedule between TMA and GMMA
62
+ // for fprop
63
+ template<
64
+ conv::Operator ConvOp_,
65
+ int Stages_,
66
+ int NumSpatialDimensions_,
67
+ class ClusterShape_ = cute::Shape<cute::C<1>,cute::C<1>,cute::C<1>>,
68
+ class KernelSchedule = KernelImplicitTmaWarpSpecializedSm90,
69
+ int PipelineAsyncMmaStages_ = 1
70
+ >
71
+ struct MainloopSm90TmaGmmaWarpSpecializedImplicitGemm {
72
+ static constexpr int Stages = Stages_;
73
+ static constexpr int NumSpatialDimensions = NumSpatialDimensions_;
74
+ static constexpr Operator ConvOp = ConvOp_;
75
+ static constexpr int PipelineAsyncMmaStages = PipelineAsyncMmaStages_;
76
+ using ClusterShape = ClusterShape_;
77
+ using ArchTag = arch::Sm90;
78
+ using Schedule = KernelSchedule;
79
+
80
+ static_assert(NumSpatialDimensions >= 1);
81
+ static_assert(! (cute::is_same_v<KernelSchedule,KernelImplicitTmaWarpSpecializedSm90Cooperative> ||
82
+ cute::is_same_v<KernelSchedule,KernelImplicitTmaWarpSpecializedSm90Pingpong>),
83
+ "Persistent schedules not support for conv yet.");
84
+ };
85
+
86
+
87
+
88
+ // SM100 tensor op kernel schedule
89
+ struct KernelImplicitTmaWarpSpecializedSm100 {
90
+ static constexpr int SchedulerPipelineStageCount = 0;
91
+ static constexpr int AccumulatorPipelineStageCount = 0;
92
+ };
93
+
94
+ // Pseudo-policies for builder auto override that dispatches to the KernelImplicitTmaWarpSpecializedSm100
95
+ // but for opting into 1 or 2 SM atoms
96
+ struct KernelImplicitTmaWarpSpecialized1SmSm100 : KernelImplicitTmaWarpSpecializedSm100 { };
97
+ struct KernelImplicitTmaWarpSpecialized2SmSm100 : KernelImplicitTmaWarpSpecializedSm100 { };
98
+
99
+ struct KernelStridedDgradTmaWs1SmSm100 { };
100
+ struct KernelStridedDgradTmaWs2SmSm100 { };
101
+
102
+ // Policy for implicit gemm kernel
103
+ template<
104
+ int SchedulerPipelineStageCount_,
105
+ int AccumulatorPipelineStageCount_
106
+ >
107
+ struct KernelScheduleImplicitTmaWarpSpecializedSm100 : KernelImplicitTmaWarpSpecializedSm100 {
108
+ static constexpr int SchedulerPipelineStageCount = SchedulerPipelineStageCount_;
109
+ static constexpr int AccumulatorPipelineStageCount = AccumulatorPipelineStageCount_;
110
+ };
111
+
112
+ // n-buffer in smem (Blackwell TMA), pipelined with Blackwell UMMA and TMA, fprop
113
+ template<
114
+ conv::Operator ConvOp_,
115
+ int Stages_,
116
+ int NumSpatialDimensions_,
117
+ int SchedulerPipelineStageCount_,
118
+ int AccumulatorPipelineStageCount_,
119
+ class ClusterShape_ = cute::Shape<cute::C<1>,cute::C<1>,cute::C<1>>
120
+ >
121
+ struct MainloopSm100TmaUmmaWarpSpecializedImplicitGemm {
122
+ static constexpr int Stages = Stages_;
123
+ static constexpr int NumSpatialDimensions = NumSpatialDimensions_;
124
+ static constexpr Operator ConvOp = ConvOp_;
125
+ using ClusterShape = ClusterShape_;
126
+ using ArchTag = arch::Sm100;
127
+ using Schedule = KernelScheduleImplicitTmaWarpSpecializedSm100<SchedulerPipelineStageCount_, AccumulatorPipelineStageCount_>;
128
+
129
+ static_assert(NumSpatialDimensions >= 1);
130
+ };
131
+
132
+ //////////////////////////////////////////////////////////////////////////////
133
+
134
+ } // namespace cutlass::conv
135
+
136
+ //////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/conv_universal.hpp ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ #pragma once
32
+
33
+ #include "cutlass/conv/convnd_problem_shape.hpp"
34
+ #include "cutlass/detail/dependent_false.hpp"
35
+
36
+ ////////////////////////////////////////////////////////////////////////////////
37
+
38
+ namespace cutlass::conv::kernel {
39
+
40
+ ////////////////////////////////////////////////////////////////////////////////
41
+
42
+ /*
43
+ * Stateless universal device CONV kernel type that treats CONV as
44
+ * a composition of a collective mainloop and a collective epilogue.
45
+ **/
46
+ template <
47
+ class ProblemShape_,
48
+ class CollectiveMainloop_,
49
+ class CollectiveEpilogue_,
50
+ class TileSchedulerTag_ = void,
51
+ class Enable = void
52
+ >
53
+ class ConvUniversal {
54
+ static_assert(cutlass::detail::dependent_false<Enable>,
55
+ "Could not find a valid specialization at the kernel layer to dispatch against.");
56
+ };
57
+
58
+ ////////////////////////////////////////////////////////////////////////////////
59
+
60
+ } // namespace cutlass::conv::kernel
61
+
62
+ ////////////////////////////////////////////////////////////////////////////////
63
+ #include "cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp"
64
+ #include "cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp"
65
+ ////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d.h ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief
34
+ Default kernel-level implicit GEMM convolution definitions for threadblock-scoped epilogue.
35
+ */
36
+
37
+ #pragma once
38
+
39
+ #include "cutlass/cutlass.h"
40
+ #include "cutlass/gemm/threadblock/default_mma.h"
41
+ #include "cutlass/gemm/threadblock/threadblock_swizzle.h"
42
+ #include "cutlass/conv/threadblock/threadblock_swizzle.h"
43
+ #include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
44
+ #include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
45
+ #include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
46
+ #include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
47
+ #include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
48
+ #include "cutlass/conv/convolution.h"
49
+ #include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
50
+ #include "cutlass/conv/threadblock/implicit_gemm_pipelined.h"
51
+ #include "cutlass/conv/threadblock/implicit_gemm_multistage.h"
52
+ #include "cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h"
53
+ #include "cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h"
54
+ #include "cutlass/conv/kernel/implicit_gemm_convolution.h"
55
+ #include "cutlass/conv/kernel/implicit_gemm_convolution_fusion.h"
56
+ #include "cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h"
57
+
58
+ /////////////////////////////////////////////////////////////////////////////////////////////////
59
+
60
+ namespace cutlass {
61
+ namespace conv {
62
+ namespace kernel {
63
+
64
+ /////////////////////////////////////////////////////////////////////////////////////////////////
65
+
66
+ namespace detail {
67
+
68
+ template <
69
+ typename ArchTag,
70
+ typename Shape,
71
+ typename WarpMmaTensorOp,
72
+ int PartitionsK,
73
+ typename OutputOp
74
+ >
75
+ struct DefaultConvEpilogue {
76
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
77
+ Shape,
78
+ WarpMmaTensorOp,
79
+ PartitionsK,
80
+ OutputOp,
81
+ OutputOp::kCount
82
+ >::Epilogue;
83
+ };
84
+
85
+ template <
86
+ typename Shape,
87
+ typename WarpMmaTensorOp,
88
+ int PartitionsK,
89
+ typename OutputOp
90
+ >
91
+ struct DefaultConvEpilogue<
92
+ arch::Sm70,
93
+ Shape,
94
+ WarpMmaTensorOp,
95
+ PartitionsK,
96
+ OutputOp
97
+ > {
98
+
99
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
100
+ Shape,
101
+ WarpMmaTensorOp,
102
+ PartitionsK,
103
+ OutputOp,
104
+ OutputOp::kCount
105
+ >::Epilogue;
106
+ };
107
+
108
+ /////////////////////////////////////////////////////////////////////////////////////////////////
109
+ template <
110
+ typename ArchTag,
111
+ typename Shape,
112
+ typename WarpMmaSimt,
113
+ typename ElementOutput,
114
+ typename ElementTensor,
115
+ typename ElementVector,
116
+ typename OutputOp,
117
+ int ElementsPerAccess,
118
+ typename PermuteDLayout = layout::NoPermute,
119
+ conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
120
+ int Rank = 4
121
+ >
122
+ struct DefaultConvEpilogueWithBroadcastSimt {
123
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimt<
124
+ Shape,
125
+ WarpMmaSimt,
126
+ ElementOutput,
127
+ ElementTensor,
128
+ ElementVector,
129
+ OutputOp,
130
+ ElementsPerAccess,
131
+ false,
132
+ PermuteDLayout,
133
+ StrideSupport,
134
+ Rank
135
+ >::Epilogue;
136
+ };
137
+
138
+ template <
139
+ typename ArchTag,
140
+ typename Shape,
141
+ typename WarpMmaSimt,
142
+ typename ElementOutput,
143
+ typename ElementTensor,
144
+ typename ElementVector,
145
+ typename OutputOp,
146
+ int ElementsPerAccess
147
+ >
148
+ struct DefaultConvEpilogueWithBroadcastSimtStridedDgrad {
149
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimtStridedDgrad<
150
+ Shape,
151
+ WarpMmaSimt,
152
+ ElementOutput,
153
+ ElementTensor,
154
+ ElementVector,
155
+ OutputOp,
156
+ ElementsPerAccess
157
+ >::Epilogue;
158
+ };
159
+
160
+ template <
161
+ typename ArchTag,
162
+ typename Shape,
163
+ typename WarpMmaTensorOp,
164
+ int PartitionsK,
165
+ typename ElementOutput,
166
+ typename ElementTensor,
167
+ typename ElementVector,
168
+ typename OutputOp,
169
+ int ElementsPerAccess
170
+ >
171
+ struct DefaultConvEpilogueWithBroadcastTensorOp {
172
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastTensorOp<
173
+ Shape,
174
+ WarpMmaTensorOp,
175
+ PartitionsK,
176
+ ElementOutput,
177
+ ElementTensor,
178
+ ElementVector,
179
+ OutputOp,
180
+ ElementsPerAccess
181
+ >::Epilogue;
182
+ };
183
+
184
+ template <
185
+ typename Shape,
186
+ typename WarpMmaTensorOp,
187
+ int PartitionsK,
188
+ typename ElementOutput,
189
+ typename ElementTensor,
190
+ typename ElementVector,
191
+ typename OutputOp,
192
+ int ElementsPerAccess
193
+ >
194
+ struct DefaultConvEpilogueWithBroadcastTensorOp<
195
+ arch::Sm70,
196
+ Shape,
197
+ WarpMmaTensorOp,
198
+ PartitionsK,
199
+ ElementOutput,
200
+ ElementTensor,
201
+ ElementVector,
202
+ OutputOp,
203
+ ElementsPerAccess
204
+ > {
205
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastVoltaTensorOp<
206
+ Shape,
207
+ WarpMmaTensorOp,
208
+ PartitionsK,
209
+ ElementOutput,
210
+ ElementTensor,
211
+ ElementVector,
212
+ OutputOp,
213
+ ElementsPerAccess
214
+ >::Epilogue;
215
+ };
216
+
217
+ /////////////////////////////////////////////////////////////////////////////////////////////////
218
+
219
+ template <
220
+ typename ArchTag,
221
+ typename Shape,
222
+ typename WarpMmaTensorOp,
223
+ int PartitionsK,
224
+ typename ElementOutput,
225
+ typename OutputOp,
226
+ typename ReductionOp,
227
+ int ElementsPerAccess
228
+ >
229
+ struct DefaultConvEpilogueWithReductionTensorOp {
230
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
231
+ Shape,
232
+ WarpMmaTensorOp,
233
+ PartitionsK,
234
+ ElementOutput,
235
+ OutputOp,
236
+ ReductionOp,
237
+ ElementsPerAccess
238
+ >::Epilogue;
239
+ };
240
+
241
+ template <
242
+ typename Shape,
243
+ typename WarpMmaTensorOp,
244
+ int PartitionsK,
245
+ typename ElementOutput,
246
+ typename OutputOp,
247
+ typename ReductionOp,
248
+ int ElementsPerAccess
249
+ >
250
+ struct DefaultConvEpilogueWithReductionTensorOp<
251
+ arch::Sm70,
252
+ Shape,
253
+ WarpMmaTensorOp,
254
+ PartitionsK,
255
+ ElementOutput,
256
+ OutputOp,
257
+ ReductionOp,
258
+ ElementsPerAccess
259
+ > {
260
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp<
261
+ Shape,
262
+ WarpMmaTensorOp,
263
+ PartitionsK,
264
+ ElementOutput,
265
+ OutputOp,
266
+ ReductionOp,
267
+ ElementsPerAccess
268
+ >::Epilogue;
269
+ };
270
+
271
+ /////////////////////////////////////////////////////////////////////////////////////////////////
272
+
273
+ // Defaults for strided Dgrad
274
+ template <
275
+ typename ArchTag,
276
+ typename Shape,
277
+ typename WarpMmaTensorOp,
278
+ int PartitionsK,
279
+ typename OutputOp
280
+ >
281
+ struct DefaultConvEpilogueStridedDgrad {
282
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
283
+ Shape,
284
+ WarpMmaTensorOp,
285
+ PartitionsK,
286
+ OutputOp,
287
+ OutputOp::kCount
288
+ >::Epilogue;
289
+ };
290
+
291
+ template <
292
+ typename Shape,
293
+ typename WarpMmaTensorOp,
294
+ int PartitionsK,
295
+ typename OutputOp
296
+ >
297
+ struct DefaultConvEpilogueStridedDgrad<
298
+ arch::Sm70,
299
+ Shape,
300
+ WarpMmaTensorOp,
301
+ PartitionsK,
302
+ OutputOp
303
+ > {
304
+
305
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOpStridedDgrad<
306
+ Shape,
307
+ WarpMmaTensorOp,
308
+ PartitionsK,
309
+ OutputOp,
310
+ OutputOp::kCount
311
+ >::Epilogue;
312
+ };
313
+
314
+ } // namespace detail
315
+
316
+ /////////////////////////////////////////////////////////////////////////////////////////////////
317
+
318
+ } // namespace kernel
319
+ } // namespace conv
320
+ } // namespace cutlass
321
+
322
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h ADDED
@@ -0,0 +1,1927 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief
34
+ Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped
35
+ matrix multiply-add with the appropriate threadblock-scoped epilogue.
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/cutlass.h"
41
+ #include "cutlass/conv/kernel/default_conv2d.h"
42
+
43
+ #include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h"
44
+ #include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h"
45
+ #include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h"
46
+ #include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h"
47
+ #include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
48
+
49
+ /////////////////////////////////////////////////////////////////////////////////////////////////
50
+
51
+ namespace cutlass {
52
+ namespace conv {
53
+ namespace kernel {
54
+
55
+ /////////////////////////////////////////////////////////////////////////////////////////////////
56
+ /// Defines a kernel for Conv2dDgrad
57
+ template <
58
+ typename ElementA,
59
+ typename LayoutA,
60
+ typename ElementB,
61
+ typename LayoutB,
62
+ typename ElementC,
63
+ typename LayoutC,
64
+ typename ElementAccumulator,
65
+ typename OperatorClass,
66
+ typename ArchTag,
67
+ typename ThreadblockShape,
68
+ typename WarpShape,
69
+ typename InstructionShape,
70
+ typename EpilogueOutputOp,
71
+ typename ThreadblockSwizzle,
72
+ int Stages,
73
+ typename MathOperatorTag,
74
+ conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
75
+ conv::StrideSupport StrideSupport = StrideSupport::kStrided,
76
+ /// Access granularity of A matrix in units of elements
77
+ int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
78
+ /// Access granularity of B matrix in units of elements
79
+ int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
80
+ > struct DefaultConv2dDgrad;
81
+
82
+ /////////////////////////////////////////////////////////////////////////////////////////////////
83
+ // OpClassTensorOp convolutions
84
+ /////////////////////////////////////////////////////////////////////////////////////////////////
85
+
86
+ /// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided and
87
+ // multistage pipeline.
88
+ template <
89
+ typename ElementA,
90
+ typename LayoutA,
91
+ typename ElementB,
92
+ typename LayoutB,
93
+ typename ElementC,
94
+ typename LayoutC,
95
+ typename ElementAccumulator,
96
+ typename ArchTag,
97
+ typename ThreadblockShape,
98
+ typename WarpShape,
99
+ typename InstructionShape,
100
+ typename EpilogueOutputOp,
101
+ typename ThreadblockSwizzle,
102
+ int Stages,
103
+ typename MathOperatorTag,
104
+ int AlignmentA,
105
+ int AlignmentB
106
+ >
107
+ struct DefaultConv2dDgrad <
108
+ ElementA,
109
+ LayoutA,
110
+ ElementB,
111
+ LayoutB,
112
+ ElementC,
113
+ LayoutC,
114
+ ElementAccumulator,
115
+ arch::OpClassTensorOp,
116
+ ArchTag,
117
+ ThreadblockShape,
118
+ WarpShape,
119
+ InstructionShape,
120
+ EpilogueOutputOp,
121
+ ThreadblockSwizzle,
122
+ Stages,
123
+ MathOperatorTag,
124
+ IteratorAlgorithm::kAnalytic,
125
+ StrideSupport::kStrided,
126
+ AlignmentA,
127
+ AlignmentB
128
+ > {
129
+
130
+ // Define the core components from GEMM
131
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
132
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
133
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
134
+ Stages, MathOperatorTag>;
135
+
136
+ // Define iterators over tiles from the A operand
137
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
138
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
139
+ using IteratorA =
140
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
141
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
142
+ ElementA,
143
+ ThreadMapA,
144
+ StrideSupport::kStrided,
145
+ AccessTypeA
146
+ >;
147
+
148
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
149
+
150
+ // Define iterators over tiles from the B operand
151
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
152
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
153
+ using IteratorB =
154
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
155
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
156
+ ElementB,
157
+ ThreadMapB,
158
+ StrideSupport::kStrided,
159
+ AccessTypeB
160
+ >;
161
+
162
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
163
+
164
+ // Warp-level GEMM components
165
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
166
+ using MmaPolicy = typename MmaCore::MmaPolicy;
167
+
168
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
169
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
170
+ ? cutlass::arch::CacheOperation::Global
171
+ : cutlass::arch::CacheOperation::Always;
172
+
173
+ // Define the Mma
174
+ using Mma = threadblock::ImplicitGemmMultistage<
175
+ ThreadblockShape,
176
+ IteratorA,
177
+ SmemIteratorA,
178
+ arch::CacheOperation::Always,
179
+ IteratorB,
180
+ SmemIteratorB,
181
+ CacheOpB,
182
+ MmaPolicy,
183
+ Stages
184
+ >;
185
+
186
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
187
+
188
+ // Define the epilogue
189
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
190
+ ThreadblockShape,
191
+ WarpMmaTensorOp,
192
+ kPartitionsK,
193
+ EpilogueOutputOp,
194
+ EpilogueOutputOp::kCount
195
+ >::Epilogue;
196
+
197
+ // Define the kernel
198
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
199
+ Mma,
200
+ Epilogue,
201
+ ThreadblockSwizzle,
202
+ conv::Operator::kDgrad
203
+ >;
204
+ };
205
+
206
+ /// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided
207
+ // and 2 stage pipeline.
208
+ template <
209
+ typename ElementA,
210
+ typename LayoutA,
211
+ typename ElementB,
212
+ typename LayoutB,
213
+ typename ElementC,
214
+ typename LayoutC,
215
+ typename ElementAccumulator,
216
+ typename ArchTag,
217
+ typename ThreadblockShape,
218
+ typename WarpShape,
219
+ typename InstructionShape,
220
+ typename EpilogueOutputOp,
221
+ typename ThreadblockSwizzle,
222
+ typename MathOperatorTag,
223
+ int AlignmentA,
224
+ int AlignmentB
225
+ >
226
+ struct DefaultConv2dDgrad <
227
+ ElementA,
228
+ LayoutA,
229
+ ElementB,
230
+ LayoutB,
231
+ ElementC,
232
+ LayoutC,
233
+ ElementAccumulator,
234
+ arch::OpClassTensorOp,
235
+ ArchTag,
236
+ ThreadblockShape,
237
+ WarpShape,
238
+ InstructionShape,
239
+ EpilogueOutputOp,
240
+ ThreadblockSwizzle,
241
+ 2,
242
+ MathOperatorTag,
243
+ IteratorAlgorithm::kAnalytic,
244
+ StrideSupport::kStrided,
245
+ AlignmentA,
246
+ AlignmentB
247
+ > {
248
+
249
+ // Define the core components from GEMM
250
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
251
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
252
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
253
+ 2, MathOperatorTag>;
254
+
255
+ // Define iterators over tiles from the A operand
256
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
257
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
258
+ using IteratorA =
259
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
260
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
261
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
262
+ ElementA,
263
+ ThreadMapA,
264
+ StrideSupport::kStrided,
265
+ AccessTypeA
266
+ >
267
+ >;
268
+
269
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
270
+
271
+ // Define iterators over tiles from the B operand
272
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
273
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
274
+ using IteratorB =
275
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
276
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
277
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
278
+ ElementB,
279
+ ThreadMapB,
280
+ StrideSupport::kStrided,
281
+ AccessTypeB
282
+ >
283
+ >;
284
+
285
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
286
+
287
+ // Warp-level GEMM components
288
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
289
+ using MmaPolicy = typename MmaCore::MmaPolicy;
290
+
291
+ // Define the Mma
292
+ using Mma = threadblock::ImplicitGemmPipelined<
293
+ ThreadblockShape,
294
+ IteratorA,
295
+ SmemIteratorA,
296
+ IteratorB,
297
+ SmemIteratorB,
298
+ ElementC,
299
+ LayoutC,
300
+ MmaPolicy
301
+ >;
302
+
303
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
304
+
305
+ // Define the epilogue
306
+ using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad<
307
+ ArchTag,
308
+ ThreadblockShape,
309
+ WarpMmaTensorOp,
310
+ kPartitionsK,
311
+ EpilogueOutputOp
312
+ >::Epilogue;
313
+
314
+ // Define the kernel
315
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
316
+ Mma,
317
+ Epilogue,
318
+ ThreadblockSwizzle,
319
+ conv::Operator::kDgrad
320
+ >;
321
+ };
322
+
323
+ /////////////////////////////////////////////////////////////////////////////////////////////////
324
+
325
+ /// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity Strided
326
+ // and multistage pipeline.
327
+ template <
328
+ typename ElementA,
329
+ typename LayoutA,
330
+ typename ElementB,
331
+ typename LayoutB,
332
+ typename ElementC,
333
+ typename LayoutC,
334
+ typename ElementAccumulator,
335
+ typename ArchTag,
336
+ typename ThreadblockShape,
337
+ typename WarpShape,
338
+ typename InstructionShape,
339
+ typename EpilogueOutputOp,
340
+ typename ThreadblockSwizzle,
341
+ int Stages,
342
+ typename MathOperatorTag,
343
+ int AlignmentA,
344
+ int AlignmentB
345
+ >
346
+ struct DefaultConv2dDgrad <
347
+ ElementA,
348
+ LayoutA,
349
+ ElementB,
350
+ LayoutB,
351
+ ElementC,
352
+ LayoutC,
353
+ ElementAccumulator,
354
+ arch::OpClassTensorOp,
355
+ ArchTag,
356
+ ThreadblockShape,
357
+ WarpShape,
358
+ InstructionShape,
359
+ EpilogueOutputOp,
360
+ ThreadblockSwizzle,
361
+ Stages,
362
+ MathOperatorTag,
363
+ IteratorAlgorithm::kAnalytic,
364
+ StrideSupport::kUnity,
365
+ AlignmentA,
366
+ AlignmentB
367
+ > {
368
+
369
+ // Define the core components from GEMM
370
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
371
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
372
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
373
+ Stages, MathOperatorTag>;
374
+
375
+ // Define iterators over tiles from the A operand
376
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
377
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
378
+ using IteratorA =
379
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
380
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
381
+ ElementA,
382
+ ThreadMapA,
383
+ StrideSupport::kUnity,
384
+ AccessTypeA
385
+ >;
386
+
387
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
388
+
389
+ // Define iterators over tiles from the B operand
390
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
391
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
392
+ using IteratorB =
393
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
394
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
395
+ ElementB,
396
+ ThreadMapB,
397
+ StrideSupport::kUnity,
398
+ AccessTypeB
399
+ >;
400
+
401
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
402
+
403
+ // Warp-level GEMM components
404
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
405
+ using MmaPolicy = typename MmaCore::MmaPolicy;
406
+
407
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
408
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
409
+ ? cutlass::arch::CacheOperation::Global
410
+ : cutlass::arch::CacheOperation::Always;
411
+
412
+ // Define the Mma
413
+ using Mma = threadblock::ImplicitGemmMultistage<
414
+ ThreadblockShape,
415
+ IteratorA,
416
+ SmemIteratorA,
417
+ arch::CacheOperation::Always,
418
+ IteratorB,
419
+ SmemIteratorB,
420
+ CacheOpB,
421
+ MmaPolicy,
422
+ Stages
423
+ >;
424
+
425
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
426
+
427
+ // Define the epilogue
428
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
429
+ ThreadblockShape,
430
+ WarpMmaTensorOp,
431
+ kPartitionsK,
432
+ EpilogueOutputOp,
433
+ EpilogueOutputOp::kCount
434
+ >::Epilogue;
435
+
436
+ // Define the kernel
437
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
438
+ Mma,
439
+ Epilogue,
440
+ ThreadblockSwizzle,
441
+ conv::Operator::kDgrad
442
+ >;
443
+ };
444
+
445
+ /// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity
446
+ // 2 stage pipeline.
447
+ template <
448
+ typename ElementA,
449
+ typename LayoutA,
450
+ typename ElementB,
451
+ typename LayoutB,
452
+ typename ElementC,
453
+ typename LayoutC,
454
+ typename ElementAccumulator,
455
+ typename ArchTag,
456
+ typename ThreadblockShape,
457
+ typename WarpShape,
458
+ typename InstructionShape,
459
+ typename EpilogueOutputOp,
460
+ typename ThreadblockSwizzle,
461
+ typename MathOperatorTag,
462
+ int AlignmentA,
463
+ int AlignmentB
464
+ >
465
+ struct DefaultConv2dDgrad <
466
+ ElementA,
467
+ LayoutA,
468
+ ElementB,
469
+ LayoutB,
470
+ ElementC,
471
+ LayoutC,
472
+ ElementAccumulator,
473
+ arch::OpClassTensorOp,
474
+ ArchTag,
475
+ ThreadblockShape,
476
+ WarpShape,
477
+ InstructionShape,
478
+ EpilogueOutputOp,
479
+ ThreadblockSwizzle,
480
+ 2,
481
+ MathOperatorTag,
482
+ IteratorAlgorithm::kAnalytic,
483
+ StrideSupport::kUnity,
484
+ AlignmentA,
485
+ AlignmentB
486
+ > {
487
+
488
+ // Define the core components from GEMM
489
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
490
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
491
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
492
+ 2, MathOperatorTag>;
493
+
494
+ // Define iterators over tiles from the A operand
495
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
496
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
497
+ using IteratorA =
498
+ cutlass::conv::threadblock::TileIterator<
499
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
500
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
501
+ ElementA,
502
+ ThreadMapA,
503
+ StrideSupport::kUnity,
504
+ AccessTypeA
505
+ >
506
+ >;
507
+
508
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
509
+
510
+ // Define iterators over tiles from the B operand
511
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
512
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
513
+ using IteratorB =
514
+ cutlass::conv::threadblock::TileIterator<
515
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
516
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
517
+ ElementB,
518
+ ThreadMapB,
519
+ StrideSupport::kUnity,
520
+ AccessTypeB
521
+ >
522
+ >;
523
+
524
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
525
+
526
+ // Warp-level GEMM components
527
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
528
+ using MmaPolicy = typename MmaCore::MmaPolicy;
529
+
530
+ // Define the Mma
531
+ using Mma = threadblock::ImplicitGemmPipelined<
532
+ ThreadblockShape,
533
+ IteratorA,
534
+ SmemIteratorA,
535
+ IteratorB,
536
+ SmemIteratorB,
537
+ ElementC,
538
+ LayoutC,
539
+ MmaPolicy
540
+ >;
541
+
542
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
543
+
544
+ // Define the epilogue
545
+ using Epilogue = typename detail::DefaultConvEpilogue<
546
+ ArchTag,
547
+ ThreadblockShape,
548
+ WarpMmaTensorOp,
549
+ kPartitionsK,
550
+ EpilogueOutputOp
551
+ >::Epilogue;
552
+
553
+ // Define the kernel
554
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
555
+ Mma,
556
+ Epilogue,
557
+ ThreadblockSwizzle,
558
+ conv::Operator::kDgrad
559
+ >;
560
+ };
561
+
562
+ /////////////////////////////////////////////////////////////////////////////////////////////////
563
+
564
+ /// Defines a kernel for Conv2dDgrad specialization for optimized IteratorAlgorithm Dgrad Unity Strided
565
+ // and multistage pipeline.
566
+ template <
567
+ typename ElementA,
568
+ typename LayoutA,
569
+ typename ElementB,
570
+ typename LayoutB,
571
+ typename ElementC,
572
+ typename LayoutC,
573
+ typename ElementAccumulator,
574
+ typename ArchTag,
575
+ typename ThreadblockShape,
576
+ typename WarpShape,
577
+ typename InstructionShape,
578
+ typename EpilogueOutputOp,
579
+ typename ThreadblockSwizzle,
580
+ int Stages,
581
+ typename MathOperatorTag,
582
+ int AlignmentA,
583
+ int AlignmentB
584
+ >
585
+ struct DefaultConv2dDgrad <
586
+ ElementA,
587
+ LayoutA,
588
+ ElementB,
589
+ LayoutB,
590
+ ElementC,
591
+ LayoutC,
592
+ ElementAccumulator,
593
+ arch::OpClassTensorOp,
594
+ ArchTag,
595
+ ThreadblockShape,
596
+ WarpShape,
597
+ InstructionShape,
598
+ EpilogueOutputOp,
599
+ ThreadblockSwizzle,
600
+ Stages,
601
+ MathOperatorTag,
602
+ IteratorAlgorithm::kOptimized,
603
+ StrideSupport::kUnity,
604
+ AlignmentA,
605
+ AlignmentB
606
+ > {
607
+
608
+ // Define the core components from GEMM
609
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
610
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
611
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
612
+ Stages, MathOperatorTag>;
613
+
614
+ // Define iterators over tiles from the A operand
615
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
616
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
617
+ using IteratorA =
618
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
619
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
620
+ ElementA,
621
+ ThreadMapA,
622
+ StrideSupport::kUnity,
623
+ AccessTypeA
624
+ >;
625
+
626
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
627
+
628
+ // Define iterators over tiles from the B operand
629
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
630
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
631
+ using IteratorB =
632
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
633
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
634
+ ElementB,
635
+ ThreadMapB,
636
+ StrideSupport::kUnity,
637
+ AccessTypeB
638
+ >;
639
+
640
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
641
+
642
+ // Warp-level GEMM components
643
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
644
+ using MmaPolicy = typename MmaCore::MmaPolicy;
645
+
646
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
647
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
648
+ ? cutlass::arch::CacheOperation::Global
649
+ : cutlass::arch::CacheOperation::Always;
650
+
651
+ // Define the Mma
652
+ using Mma = threadblock::ImplicitGemmMultistage<
653
+ ThreadblockShape,
654
+ IteratorA,
655
+ SmemIteratorA,
656
+ arch::CacheOperation::Always,
657
+ IteratorB,
658
+ SmemIteratorB,
659
+ CacheOpB,
660
+ MmaPolicy,
661
+ Stages
662
+ >;
663
+
664
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
665
+
666
+ // Define the epilogue
667
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
668
+ ThreadblockShape,
669
+ WarpMmaTensorOp,
670
+ kPartitionsK,
671
+ EpilogueOutputOp,
672
+ EpilogueOutputOp::kCount
673
+ >::Epilogue;
674
+
675
+ // Define the kernel
676
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
677
+ Mma,
678
+ Epilogue,
679
+ ThreadblockSwizzle,
680
+ conv::Operator::kDgrad
681
+ >;
682
+ };
683
+
684
+ /// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided and
685
+ // multistage pipeline.
686
+ template <
687
+ typename ElementA,
688
+ typename LayoutA,
689
+ typename ElementB,
690
+ typename LayoutB,
691
+ typename ElementC,
692
+ typename LayoutC,
693
+ typename ElementAccumulator,
694
+ typename ArchTag,
695
+ typename ThreadblockShape,
696
+ typename WarpShape,
697
+ typename InstructionShape,
698
+ typename EpilogueOutputOp,
699
+ typename ThreadblockSwizzle,
700
+ int Stages,
701
+ typename MathOperatorTag,
702
+ int AlignmentA,
703
+ int AlignmentB
704
+ >
705
+ struct DefaultConv2dDgrad <
706
+ ElementA,
707
+ LayoutA,
708
+ ElementB,
709
+ LayoutB,
710
+ ElementC,
711
+ LayoutC,
712
+ ElementAccumulator,
713
+ arch::OpClassTensorOp,
714
+ ArchTag,
715
+ ThreadblockShape,
716
+ WarpShape,
717
+ InstructionShape,
718
+ EpilogueOutputOp,
719
+ ThreadblockSwizzle,
720
+ Stages,
721
+ MathOperatorTag,
722
+ IteratorAlgorithm::kOptimized,
723
+ StrideSupport::kStrided,
724
+ AlignmentA,
725
+ AlignmentB
726
+ > {
727
+
728
+ // Define the core components from GEMM
729
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
730
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
731
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
732
+ Stages, MathOperatorTag>;
733
+
734
+ // Define iterators over tiles from the A operand
735
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
736
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
737
+ using IteratorA =
738
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
739
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
740
+ ElementA,
741
+ ThreadMapA,
742
+ StrideSupport::kStrided,
743
+ AccessTypeA
744
+ >;
745
+
746
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
747
+
748
+ // Define iterators over tiles from the B operand
749
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
750
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
751
+ using IteratorB =
752
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
753
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
754
+ ElementB,
755
+ ThreadMapB,
756
+ StrideSupport::kStrided,
757
+ AccessTypeB
758
+ >;
759
+
760
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
761
+
762
+ // Warp-level GEMM components
763
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
764
+ using MmaPolicy = typename MmaCore::MmaPolicy;
765
+
766
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
767
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
768
+ ? cutlass::arch::CacheOperation::Global
769
+ : cutlass::arch::CacheOperation::Always;
770
+
771
+ // Define the Mma
772
+ using Mma = threadblock::ImplicitGemmMultistage<
773
+ ThreadblockShape,
774
+ IteratorA,
775
+ SmemIteratorA,
776
+ arch::CacheOperation::Always,
777
+ IteratorB,
778
+ SmemIteratorB,
779
+ CacheOpB,
780
+ MmaPolicy,
781
+ Stages
782
+ >;
783
+
784
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
785
+
786
+ // Define the epilogue
787
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
788
+ ThreadblockShape,
789
+ WarpMmaTensorOp,
790
+ kPartitionsK,
791
+ EpilogueOutputOp,
792
+ EpilogueOutputOp::kCount
793
+ >::Epilogue;
794
+
795
+ // Define the kernel
796
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
797
+ Mma,
798
+ Epilogue,
799
+ ThreadblockSwizzle,
800
+ conv::Operator::kDgrad
801
+ >;
802
+ };
803
+
804
+ /// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided
805
+ // and 2 stage pipeline.
806
+ template <
807
+ typename ElementA,
808
+ typename LayoutA,
809
+ typename ElementB,
810
+ typename LayoutB,
811
+ typename ElementC,
812
+ typename LayoutC,
813
+ typename ElementAccumulator,
814
+ typename ArchTag,
815
+ typename ThreadblockShape,
816
+ typename WarpShape,
817
+ typename InstructionShape,
818
+ typename EpilogueOutputOp,
819
+ typename ThreadblockSwizzle,
820
+ typename MathOperatorTag,
821
+ int AlignmentA,
822
+ int AlignmentB
823
+ >
824
+ struct DefaultConv2dDgrad <
825
+ ElementA,
826
+ LayoutA,
827
+ ElementB,
828
+ LayoutB,
829
+ ElementC,
830
+ LayoutC,
831
+ ElementAccumulator,
832
+ arch::OpClassTensorOp,
833
+ ArchTag,
834
+ ThreadblockShape,
835
+ WarpShape,
836
+ InstructionShape,
837
+ EpilogueOutputOp,
838
+ ThreadblockSwizzle,
839
+ 2,
840
+ MathOperatorTag,
841
+ IteratorAlgorithm::kOptimized,
842
+ StrideSupport::kStrided,
843
+ AlignmentA,
844
+ AlignmentB
845
+ > {
846
+
847
+ // Define the core components from GEMM
848
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
849
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
850
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
851
+ 2, MathOperatorTag>;
852
+
853
+ // Define iterators over tiles from the A operand
854
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
855
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
856
+ using IteratorA =
857
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
858
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
859
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
860
+ ElementA,
861
+ ThreadMapA,
862
+ StrideSupport::kStrided,
863
+ AccessTypeA
864
+ >
865
+ >;
866
+
867
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
868
+
869
+ // Define iterators over tiles from the B operand
870
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
871
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
872
+ using IteratorB =
873
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
874
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
875
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
876
+ ElementB,
877
+ ThreadMapB,
878
+ StrideSupport::kStrided,
879
+ AccessTypeB
880
+ >
881
+ >;
882
+
883
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
884
+
885
+ // Warp-level GEMM components
886
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
887
+ using MmaPolicy = typename MmaCore::MmaPolicy;
888
+
889
+ // Define the Mma
890
+ using Mma = threadblock::ImplicitGemmPipelined<
891
+ ThreadblockShape,
892
+ IteratorA,
893
+ SmemIteratorA,
894
+ IteratorB,
895
+ SmemIteratorB,
896
+ ElementC,
897
+ LayoutC,
898
+ MmaPolicy
899
+ >;
900
+
901
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
902
+
903
+ // Define the epilogue
904
+ using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad<
905
+ ArchTag,
906
+ ThreadblockShape,
907
+ WarpMmaTensorOp,
908
+ kPartitionsK,
909
+ EpilogueOutputOp
910
+ >::Epilogue;
911
+
912
+ // Define the kernel
913
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
914
+ Mma,
915
+ Epilogue,
916
+ ThreadblockSwizzle,
917
+ conv::Operator::kDgrad
918
+ >;
919
+ };
920
+
921
+ /// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Unity
922
+ // 2 stage pipeline
923
+ template <
924
+ typename ElementA,
925
+ typename LayoutA,
926
+ typename ElementB,
927
+ typename LayoutB,
928
+ typename ElementC,
929
+ typename LayoutC,
930
+ typename ElementAccumulator,
931
+ typename ArchTag,
932
+ typename ThreadblockShape,
933
+ typename WarpShape,
934
+ typename InstructionShape,
935
+ typename EpilogueOutputOp,
936
+ typename ThreadblockSwizzle,
937
+ typename MathOperatorTag,
938
+ int AlignmentA,
939
+ int AlignmentB
940
+ >
941
+ struct DefaultConv2dDgrad <
942
+ ElementA,
943
+ LayoutA,
944
+ ElementB,
945
+ LayoutB,
946
+ ElementC,
947
+ LayoutC,
948
+ ElementAccumulator,
949
+ arch::OpClassTensorOp,
950
+ ArchTag,
951
+ ThreadblockShape,
952
+ WarpShape,
953
+ InstructionShape,
954
+ EpilogueOutputOp,
955
+ ThreadblockSwizzle,
956
+ 2,
957
+ MathOperatorTag,
958
+ IteratorAlgorithm::kOptimized,
959
+ StrideSupport::kUnity,
960
+ AlignmentA,
961
+ AlignmentB
962
+ > {
963
+
964
+ // Define the core components from GEMM
965
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
966
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
967
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
968
+ 2, MathOperatorTag>;
969
+
970
+ // Define iterators over tiles from the A operand
971
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
972
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
973
+ using IteratorA =
974
+ cutlass::conv::threadblock::TileIterator<
975
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
976
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
977
+ ElementA,
978
+ ThreadMapA,
979
+ StrideSupport::kUnity,
980
+ AccessTypeA
981
+ >
982
+ >;
983
+
984
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
985
+
986
+ // Define iterators over tiles from the B operand
987
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
988
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
989
+ using IteratorB =
990
+ cutlass::conv::threadblock::TileIterator<
991
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
992
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
993
+ ElementB,
994
+ ThreadMapB,
995
+ StrideSupport::kUnity,
996
+ AccessTypeB
997
+ >
998
+ >;
999
+
1000
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1001
+
1002
+ // Warp-level GEMM components
1003
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
1004
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1005
+
1006
+ // Define the Mma
1007
+ using Mma = threadblock::ImplicitGemmPipelined<
1008
+ ThreadblockShape,
1009
+ IteratorA,
1010
+ SmemIteratorA,
1011
+ IteratorB,
1012
+ SmemIteratorB,
1013
+ ElementC,
1014
+ LayoutC,
1015
+ MmaPolicy
1016
+ >;
1017
+
1018
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
1019
+
1020
+ // Define the epilogue
1021
+ using Epilogue = typename detail::DefaultConvEpilogue<
1022
+ ArchTag,
1023
+ ThreadblockShape,
1024
+ WarpMmaTensorOp,
1025
+ kPartitionsK,
1026
+ EpilogueOutputOp
1027
+ >::Epilogue;
1028
+
1029
+ // Define the kernel
1030
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1031
+ Mma,
1032
+ Epilogue,
1033
+ ThreadblockSwizzle,
1034
+ conv::Operator::kDgrad
1035
+ >;
1036
+ };
1037
+
1038
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1039
+ // OpClassSimt convolutions
1040
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1041
+ /// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm,
1042
+ /// multi-stage pipeline, and FFMA-based mainloop for SM80
1043
+
1044
+ template <
1045
+ typename ElementA,
1046
+ typename LayoutA,
1047
+ typename ElementB,
1048
+ typename LayoutB,
1049
+ typename ElementC,
1050
+ typename LayoutC,
1051
+ typename ElementAccumulator,
1052
+ typename ArchTag,
1053
+ typename ThreadblockShape,
1054
+ typename WarpShape,
1055
+ typename InstructionShape,
1056
+ typename EpilogueOutputOp,
1057
+ typename ThreadblockSwizzle,
1058
+ int Stages,
1059
+ typename MathOperatorTag,
1060
+ int AlignmentA,
1061
+ int AlignmentB
1062
+ >
1063
+ struct DefaultConv2dDgrad <
1064
+ ElementA,
1065
+ LayoutA,
1066
+ ElementB,
1067
+ LayoutB,
1068
+ ElementC,
1069
+ LayoutC,
1070
+ ElementAccumulator,
1071
+ arch::OpClassSimt,
1072
+ ArchTag,
1073
+ ThreadblockShape,
1074
+ WarpShape,
1075
+ InstructionShape,
1076
+ EpilogueOutputOp,
1077
+ ThreadblockSwizzle,
1078
+ Stages,
1079
+ MathOperatorTag,
1080
+ IteratorAlgorithm::kAnalytic,
1081
+ conv::StrideSupport::kUnity,
1082
+ AlignmentA,
1083
+ AlignmentB
1084
+ > {
1085
+
1086
+ // Define the core components from GEMM
1087
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1088
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1089
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1090
+ Stages, MathOperatorTag>;
1091
+
1092
+ // Define iterators over tiles from the A operand
1093
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1094
+ using IteratorA =
1095
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
1096
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1097
+ ElementA,
1098
+ ThreadMapA,
1099
+ conv::StrideSupport::kUnity
1100
+ >;
1101
+
1102
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1103
+
1104
+ // Define iterators over tiles from the B operand
1105
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1106
+ using IteratorB =
1107
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
1108
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1109
+ ElementB,
1110
+ ThreadMapB,
1111
+ conv::StrideSupport::kUnity
1112
+ >;
1113
+
1114
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1115
+
1116
+ // Warp-level GEMM components
1117
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1118
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1119
+
1120
+ // Define the Mma
1121
+ using Mma = threadblock::ImplicitGemmMultistage<
1122
+ ThreadblockShape,
1123
+ IteratorA,
1124
+ SmemIteratorA,
1125
+ arch::CacheOperation::Always,
1126
+ IteratorB,
1127
+ SmemIteratorB,
1128
+ arch::CacheOperation::Always,
1129
+ MmaPolicy,
1130
+ Stages
1131
+ >;
1132
+
1133
+ // Define the epilogue
1134
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1135
+ ThreadblockShape,
1136
+ WarpMmaSimtOp,
1137
+ EpilogueOutputOp,
1138
+ EpilogueOutputOp::kCount
1139
+ >::Epilogue;
1140
+
1141
+ // Define the kernel
1142
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1143
+ Mma,
1144
+ Epilogue,
1145
+ ThreadblockSwizzle,
1146
+ conv::Operator::kDgrad
1147
+ >;
1148
+
1149
+ };
1150
+
1151
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1152
+
1153
+ template <
1154
+ typename ElementA,
1155
+ typename LayoutA,
1156
+ typename ElementB,
1157
+ typename LayoutB,
1158
+ typename ElementC,
1159
+ typename LayoutC,
1160
+ typename ElementAccumulator,
1161
+ typename ArchTag,
1162
+ typename ThreadblockShape,
1163
+ typename WarpShape,
1164
+ typename InstructionShape,
1165
+ typename EpilogueOutputOp,
1166
+ typename ThreadblockSwizzle,
1167
+ int Stages,
1168
+ typename MathOperatorTag,
1169
+ int AlignmentA,
1170
+ int AlignmentB
1171
+ >
1172
+ struct DefaultConv2dDgrad <
1173
+ ElementA,
1174
+ LayoutA,
1175
+ ElementB,
1176
+ LayoutB,
1177
+ ElementC,
1178
+ LayoutC,
1179
+ ElementAccumulator,
1180
+ arch::OpClassSimt,
1181
+ ArchTag,
1182
+ ThreadblockShape,
1183
+ WarpShape,
1184
+ InstructionShape,
1185
+ EpilogueOutputOp,
1186
+ ThreadblockSwizzle,
1187
+ Stages,
1188
+ MathOperatorTag,
1189
+ IteratorAlgorithm::kAnalytic,
1190
+ conv::StrideSupport::kStrided,
1191
+ AlignmentA,
1192
+ AlignmentB
1193
+ > {
1194
+
1195
+ // Define the core components from GEMM
1196
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1197
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1198
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1199
+ Stages, MathOperatorTag>;
1200
+
1201
+ // Define iterators over tiles from the A operand
1202
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1203
+ using IteratorA =
1204
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
1205
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1206
+ ElementA,
1207
+ ThreadMapA,
1208
+ conv::StrideSupport::kStrided
1209
+ >;
1210
+
1211
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1212
+
1213
+ // Define iterators over tiles from the B operand
1214
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1215
+ using IteratorB =
1216
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
1217
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1218
+ ElementB,
1219
+ ThreadMapB,
1220
+ conv::StrideSupport::kStrided
1221
+ >;
1222
+
1223
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1224
+
1225
+ // Warp-level GEMM components
1226
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1227
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1228
+
1229
+ // Define the Mma
1230
+ using Mma = threadblock::ImplicitGemmMultistage<
1231
+ ThreadblockShape,
1232
+ IteratorA,
1233
+ SmemIteratorA,
1234
+ arch::CacheOperation::Always,
1235
+ IteratorB,
1236
+ SmemIteratorB,
1237
+ arch::CacheOperation::Always,
1238
+ MmaPolicy,
1239
+ Stages
1240
+ >;
1241
+
1242
+ // Define the epilogue
1243
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
1244
+ ThreadblockShape,
1245
+ WarpMmaSimtOp,
1246
+ EpilogueOutputOp,
1247
+ EpilogueOutputOp::kCount
1248
+ >::Epilogue;
1249
+
1250
+ // Define the kernel
1251
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
1252
+ Mma,
1253
+ Epilogue,
1254
+ ThreadblockSwizzle,
1255
+ conv::Operator::kDgrad
1256
+ >;
1257
+
1258
+ };
1259
+
1260
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1261
+
1262
+ /// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm,
1263
+ /// multi-stage pipeline, and FFMA-based mainloop for SM80
1264
+
1265
+ template <
1266
+ typename ElementA,
1267
+ typename LayoutA,
1268
+ typename ElementB,
1269
+ typename LayoutB,
1270
+ typename ElementC,
1271
+ typename LayoutC,
1272
+ typename ElementAccumulator,
1273
+ typename ArchTag,
1274
+ typename ThreadblockShape,
1275
+ typename WarpShape,
1276
+ typename InstructionShape,
1277
+ typename EpilogueOutputOp,
1278
+ typename ThreadblockSwizzle,
1279
+ int Stages,
1280
+ typename MathOperatorTag,
1281
+ int AlignmentA,
1282
+ int AlignmentB
1283
+ >
1284
+ struct DefaultConv2dDgrad <
1285
+ ElementA,
1286
+ LayoutA,
1287
+ ElementB,
1288
+ LayoutB,
1289
+ ElementC,
1290
+ LayoutC,
1291
+ ElementAccumulator,
1292
+ arch::OpClassSimt,
1293
+ ArchTag,
1294
+ ThreadblockShape,
1295
+ WarpShape,
1296
+ InstructionShape,
1297
+ EpilogueOutputOp,
1298
+ ThreadblockSwizzle,
1299
+ Stages,
1300
+ MathOperatorTag,
1301
+ IteratorAlgorithm::kOptimized,
1302
+ StrideSupport::kUnity,
1303
+ AlignmentA,
1304
+ AlignmentB
1305
+ > {
1306
+
1307
+ // Define the core components from GEMM
1308
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1309
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1310
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1311
+ Stages, MathOperatorTag>;
1312
+
1313
+ // Define iterators over tiles from the A operand
1314
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1315
+ using IteratorA =
1316
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
1317
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1318
+ ElementA,
1319
+ ThreadMapA,
1320
+ StrideSupport::kUnity
1321
+ >;
1322
+
1323
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1324
+
1325
+ // Define iterators over tiles from the B operand
1326
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1327
+ using IteratorB =
1328
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
1329
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1330
+ ElementB,
1331
+ ThreadMapB,
1332
+ StrideSupport::kUnity
1333
+ >;
1334
+
1335
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1336
+
1337
+ // Warp-level GEMM components
1338
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1339
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1340
+
1341
+ // Define the Mma
1342
+ using Mma = threadblock::ImplicitGemmMultistage<
1343
+ ThreadblockShape,
1344
+ IteratorA,
1345
+ SmemIteratorA,
1346
+ arch::CacheOperation::Always,
1347
+ IteratorB,
1348
+ SmemIteratorB,
1349
+ arch::CacheOperation::Always,
1350
+ MmaPolicy,
1351
+ Stages
1352
+ >;
1353
+
1354
+ // Define the epilogue
1355
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1356
+ ThreadblockShape,
1357
+ WarpMmaSimtOp,
1358
+ EpilogueOutputOp,
1359
+ EpilogueOutputOp::kCount
1360
+ >::Epilogue;
1361
+
1362
+ // Define the kernel
1363
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1364
+ Mma,
1365
+ Epilogue,
1366
+ ThreadblockSwizzle,
1367
+ conv::Operator::kDgrad
1368
+ >;
1369
+ };
1370
+
1371
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1372
+ template <
1373
+ typename ElementA,
1374
+ typename LayoutA,
1375
+ typename ElementB,
1376
+ typename LayoutB,
1377
+ typename ElementC,
1378
+ typename LayoutC,
1379
+ typename ElementAccumulator,
1380
+ typename ArchTag,
1381
+ typename ThreadblockShape,
1382
+ typename WarpShape,
1383
+ typename InstructionShape,
1384
+ typename EpilogueOutputOp,
1385
+ typename ThreadblockSwizzle,
1386
+ int Stages,
1387
+ typename MathOperatorTag,
1388
+ int AlignmentA,
1389
+ int AlignmentB
1390
+ >
1391
+ struct DefaultConv2dDgrad <
1392
+ ElementA,
1393
+ LayoutA,
1394
+ ElementB,
1395
+ LayoutB,
1396
+ ElementC,
1397
+ LayoutC,
1398
+ ElementAccumulator,
1399
+ arch::OpClassSimt,
1400
+ ArchTag,
1401
+ ThreadblockShape,
1402
+ WarpShape,
1403
+ InstructionShape,
1404
+ EpilogueOutputOp,
1405
+ ThreadblockSwizzle,
1406
+ Stages,
1407
+ MathOperatorTag,
1408
+ IteratorAlgorithm::kOptimized,
1409
+ conv::StrideSupport::kStrided,
1410
+ AlignmentA,
1411
+ AlignmentB
1412
+ > {
1413
+
1414
+ // Define the core components from GEMM
1415
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1416
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1417
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1418
+ Stages, MathOperatorTag>;
1419
+
1420
+ // Define iterators over tiles from the A operand
1421
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1422
+ using IteratorA =
1423
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
1424
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1425
+ ElementA,
1426
+ ThreadMapA,
1427
+ conv::StrideSupport::kStrided
1428
+ >;
1429
+
1430
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1431
+
1432
+ // Define iterators over tiles from the B operand
1433
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1434
+ using IteratorB =
1435
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
1436
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1437
+ ElementB,
1438
+ ThreadMapB,
1439
+ conv::StrideSupport::kStrided
1440
+ >;
1441
+
1442
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1443
+
1444
+ // Warp-level GEMM components
1445
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1446
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1447
+
1448
+ // Define the Mma
1449
+ using Mma = threadblock::ImplicitGemmMultistage<
1450
+ ThreadblockShape,
1451
+ IteratorA,
1452
+ SmemIteratorA,
1453
+ arch::CacheOperation::Always,
1454
+ IteratorB,
1455
+ SmemIteratorB,
1456
+ arch::CacheOperation::Always,
1457
+ MmaPolicy,
1458
+ Stages
1459
+ >;
1460
+
1461
+ // Define the epilogue
1462
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
1463
+ ThreadblockShape,
1464
+ WarpMmaSimtOp,
1465
+ EpilogueOutputOp,
1466
+ EpilogueOutputOp::kCount
1467
+ >::Epilogue;
1468
+
1469
+ // Define the kernel
1470
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
1471
+ Mma,
1472
+ Epilogue,
1473
+ ThreadblockSwizzle,
1474
+ conv::Operator::kDgrad
1475
+ >;
1476
+
1477
+ };
1478
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1479
+
1480
+ /// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm,
1481
+ /// 2 stage pipeline, and FFMA-based mainloop for SM50
1482
+ template <
1483
+ typename ElementA,
1484
+ typename LayoutA,
1485
+ typename ElementB,
1486
+ typename LayoutB,
1487
+ typename ElementC,
1488
+ typename LayoutC,
1489
+ typename ElementAccumulator,
1490
+ typename ArchTag,
1491
+ typename ThreadblockShape,
1492
+ typename WarpShape,
1493
+ typename InstructionShape,
1494
+ typename EpilogueOutputOp,
1495
+ typename ThreadblockSwizzle,
1496
+ typename MathOperatorTag,
1497
+ int AlignmentA,
1498
+ int AlignmentB
1499
+ >
1500
+ struct DefaultConv2dDgrad <
1501
+ ElementA,
1502
+ LayoutA,
1503
+ ElementB,
1504
+ LayoutB,
1505
+ ElementC,
1506
+ LayoutC,
1507
+ ElementAccumulator,
1508
+ arch::OpClassSimt,
1509
+ ArchTag,
1510
+ ThreadblockShape,
1511
+ WarpShape,
1512
+ InstructionShape,
1513
+ EpilogueOutputOp,
1514
+ ThreadblockSwizzle,
1515
+ 2,
1516
+ MathOperatorTag,
1517
+ IteratorAlgorithm::kAnalytic,
1518
+ conv::StrideSupport::kUnity,
1519
+ AlignmentA,
1520
+ AlignmentB
1521
+ > {
1522
+
1523
+ // Define the core components from GEMM
1524
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1525
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1526
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1527
+ 2, MathOperatorTag>;
1528
+
1529
+ // Define iterators over tiles from the A operand
1530
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1531
+ using IteratorA =
1532
+ cutlass::conv::threadblock::TileIterator<
1533
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
1534
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1535
+ ElementA,
1536
+ ThreadMapA,
1537
+ conv::StrideSupport::kUnity
1538
+ >
1539
+ >;
1540
+
1541
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1542
+
1543
+ // Define iterators over tiles from the B operand
1544
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1545
+ using IteratorB =
1546
+ cutlass::conv::threadblock::TileIterator<
1547
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
1548
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1549
+ ElementB,
1550
+ ThreadMapB,
1551
+ conv::StrideSupport::kUnity
1552
+ >
1553
+ >;
1554
+
1555
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1556
+
1557
+ // Warp-level GEMM components
1558
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1559
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1560
+
1561
+ // Define the Mma
1562
+ using Mma = threadblock::ImplicitGemmPipelined<
1563
+ ThreadblockShape,
1564
+ IteratorA,
1565
+ SmemIteratorA,
1566
+ IteratorB,
1567
+ SmemIteratorB,
1568
+ ElementC,
1569
+ LayoutC,
1570
+ MmaPolicy
1571
+ >;
1572
+
1573
+ // Define the epilogue
1574
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1575
+ ThreadblockShape,
1576
+ WarpMmaSimtOp,
1577
+ EpilogueOutputOp,
1578
+ EpilogueOutputOp::kCount
1579
+ >::Epilogue;
1580
+
1581
+ // Define the kernel
1582
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1583
+ Mma,
1584
+ Epilogue,
1585
+ ThreadblockSwizzle,
1586
+ conv::Operator::kDgrad
1587
+ >;
1588
+
1589
+ };
1590
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1591
+
1592
+ template <
1593
+ typename ElementA,
1594
+ typename LayoutA,
1595
+ typename ElementB,
1596
+ typename LayoutB,
1597
+ typename ElementC,
1598
+ typename LayoutC,
1599
+ typename ElementAccumulator,
1600
+ typename ArchTag,
1601
+ typename ThreadblockShape,
1602
+ typename WarpShape,
1603
+ typename InstructionShape,
1604
+ typename EpilogueOutputOp,
1605
+ typename ThreadblockSwizzle,
1606
+ typename MathOperatorTag,
1607
+ int AlignmentA,
1608
+ int AlignmentB
1609
+ >
1610
+ struct DefaultConv2dDgrad <
1611
+ ElementA,
1612
+ LayoutA,
1613
+ ElementB,
1614
+ LayoutB,
1615
+ ElementC,
1616
+ LayoutC,
1617
+ ElementAccumulator,
1618
+ arch::OpClassSimt,
1619
+ ArchTag,
1620
+ ThreadblockShape,
1621
+ WarpShape,
1622
+ InstructionShape,
1623
+ EpilogueOutputOp,
1624
+ ThreadblockSwizzle,
1625
+ 2,
1626
+ MathOperatorTag,
1627
+ IteratorAlgorithm::kAnalytic,
1628
+ conv::StrideSupport::kStrided,
1629
+ AlignmentA,
1630
+ AlignmentB
1631
+ > {
1632
+
1633
+ // Define the core components from GEMM
1634
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1635
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1636
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1637
+ 2, MathOperatorTag>;
1638
+
1639
+ // Define iterators over tiles from the A operand
1640
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1641
+ using IteratorA =
1642
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
1643
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic<
1644
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1645
+ ElementA,
1646
+ ThreadMapA,
1647
+ conv::StrideSupport::kStrided
1648
+ >
1649
+ >;
1650
+
1651
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1652
+
1653
+ // Define iterators over tiles from the B operand
1654
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1655
+ using IteratorB =
1656
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
1657
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic<
1658
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1659
+ ElementB,
1660
+ ThreadMapB,
1661
+ conv::StrideSupport::kStrided
1662
+ >
1663
+ >;
1664
+
1665
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1666
+
1667
+ // Warp-level GEMM components
1668
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1669
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1670
+
1671
+ // Define the Mma
1672
+ using Mma = threadblock::ImplicitGemmPipelined<
1673
+ ThreadblockShape,
1674
+ IteratorA,
1675
+ SmemIteratorA,
1676
+ IteratorB,
1677
+ SmemIteratorB,
1678
+ ElementC,
1679
+ LayoutC,
1680
+ MmaPolicy
1681
+ >;
1682
+
1683
+ // Define the epilogue
1684
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
1685
+ ThreadblockShape,
1686
+ WarpMmaSimtOp,
1687
+ EpilogueOutputOp,
1688
+ EpilogueOutputOp::kCount
1689
+ >::Epilogue;
1690
+
1691
+ // Define the kernel
1692
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
1693
+ Mma,
1694
+ Epilogue,
1695
+ ThreadblockSwizzle,
1696
+ conv::Operator::kDgrad
1697
+ >;
1698
+ };
1699
+
1700
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1701
+
1702
+ /// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm,
1703
+ /// 2 stage pipeline, and FFMA-based mainloop for SM50
1704
+ template <
1705
+ typename ElementA,
1706
+ typename LayoutA,
1707
+ typename ElementB,
1708
+ typename LayoutB,
1709
+ typename ElementC,
1710
+ typename LayoutC,
1711
+ typename ElementAccumulator,
1712
+ typename ArchTag,
1713
+ typename ThreadblockShape,
1714
+ typename WarpShape,
1715
+ typename InstructionShape,
1716
+ typename EpilogueOutputOp,
1717
+ typename ThreadblockSwizzle,
1718
+ typename MathOperatorTag,
1719
+ int AlignmentA,
1720
+ int AlignmentB
1721
+ >
1722
+ struct DefaultConv2dDgrad <
1723
+ ElementA,
1724
+ LayoutA,
1725
+ ElementB,
1726
+ LayoutB,
1727
+ ElementC,
1728
+ LayoutC,
1729
+ ElementAccumulator,
1730
+ arch::OpClassSimt,
1731
+ ArchTag,
1732
+ ThreadblockShape,
1733
+ WarpShape,
1734
+ InstructionShape,
1735
+ EpilogueOutputOp,
1736
+ ThreadblockSwizzle,
1737
+ 2,
1738
+ MathOperatorTag,
1739
+ IteratorAlgorithm::kOptimized,
1740
+ StrideSupport::kUnity,
1741
+ AlignmentA,
1742
+ AlignmentB
1743
+ > {
1744
+
1745
+ // Define the core components from GEMM
1746
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1747
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1748
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1749
+ 2, MathOperatorTag>;
1750
+
1751
+ // Define iterators over tiles from the A operand
1752
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1753
+ using IteratorA =
1754
+ cutlass::conv::threadblock::TileIterator<
1755
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
1756
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1757
+ ElementA,
1758
+ ThreadMapA,
1759
+ StrideSupport::kUnity
1760
+ >
1761
+ >;
1762
+
1763
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1764
+
1765
+ // Define iterators over tiles from the B operand
1766
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1767
+ using IteratorB =
1768
+ cutlass::conv::threadblock::TileIterator<
1769
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
1770
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1771
+ ElementB,
1772
+ ThreadMapB,
1773
+ StrideSupport::kUnity
1774
+ >
1775
+ >;
1776
+
1777
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1778
+
1779
+ // Warp-level GEMM components
1780
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1781
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1782
+
1783
+ // Define the Mma
1784
+ using Mma = threadblock::ImplicitGemmPipelined<
1785
+ ThreadblockShape,
1786
+ IteratorA,
1787
+ SmemIteratorA,
1788
+ IteratorB,
1789
+ SmemIteratorB,
1790
+ ElementC,
1791
+ LayoutC,
1792
+ MmaPolicy
1793
+ >;
1794
+
1795
+ // Define the epilogue
1796
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1797
+ ThreadblockShape,
1798
+ WarpMmaSimtOp,
1799
+ EpilogueOutputOp,
1800
+ EpilogueOutputOp::kCount
1801
+ >::Epilogue;
1802
+
1803
+ // Define the kernel
1804
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1805
+ Mma,
1806
+ Epilogue,
1807
+ ThreadblockSwizzle,
1808
+ conv::Operator::kDgrad
1809
+ >;
1810
+
1811
+ };
1812
+
1813
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1814
+ template <
1815
+ typename ElementA,
1816
+ typename LayoutA,
1817
+ typename ElementB,
1818
+ typename LayoutB,
1819
+ typename ElementC,
1820
+ typename LayoutC,
1821
+ typename ElementAccumulator,
1822
+ typename ArchTag,
1823
+ typename ThreadblockShape,
1824
+ typename WarpShape,
1825
+ typename InstructionShape,
1826
+ typename EpilogueOutputOp,
1827
+ typename ThreadblockSwizzle,
1828
+ typename MathOperatorTag,
1829
+ int AlignmentA,
1830
+ int AlignmentB
1831
+ >
1832
+ struct DefaultConv2dDgrad <
1833
+ ElementA,
1834
+ LayoutA,
1835
+ ElementB,
1836
+ LayoutB,
1837
+ ElementC,
1838
+ LayoutC,
1839
+ ElementAccumulator,
1840
+ arch::OpClassSimt,
1841
+ ArchTag,
1842
+ ThreadblockShape,
1843
+ WarpShape,
1844
+ InstructionShape,
1845
+ EpilogueOutputOp,
1846
+ ThreadblockSwizzle,
1847
+ 2,
1848
+ MathOperatorTag,
1849
+ IteratorAlgorithm::kOptimized,
1850
+ conv::StrideSupport::kStrided,
1851
+ AlignmentA,
1852
+ AlignmentB
1853
+ > {
1854
+
1855
+ // Define the core components from GEMM
1856
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1857
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1858
+ ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1859
+ 2, MathOperatorTag>;
1860
+
1861
+ // Define iterators over tiles from the A operand
1862
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1863
+ using IteratorA =
1864
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
1865
+ cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized<
1866
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1867
+ ElementA,
1868
+ ThreadMapA,
1869
+ conv::StrideSupport::kStrided
1870
+ >
1871
+ >;
1872
+
1873
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1874
+
1875
+ // Define iterators over tiles from the B operand
1876
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1877
+ using IteratorB =
1878
+ cutlass::conv::threadblock::TileIteratorStridedDgrad<
1879
+ cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized<
1880
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1881
+ ElementB,
1882
+ ThreadMapB,
1883
+ conv::StrideSupport::kStrided
1884
+ >
1885
+ >;
1886
+
1887
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1888
+
1889
+ // Warp-level GEMM components
1890
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1891
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1892
+
1893
+ // Define the Mma
1894
+ using Mma = threadblock::ImplicitGemmPipelined<
1895
+ ThreadblockShape,
1896
+ IteratorA,
1897
+ SmemIteratorA,
1898
+ IteratorB,
1899
+ SmemIteratorB,
1900
+ ElementC,
1901
+ LayoutC,
1902
+ MmaPolicy
1903
+ >;
1904
+
1905
+ // Define the epilogue
1906
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad<
1907
+ ThreadblockShape,
1908
+ WarpMmaSimtOp,
1909
+ EpilogueOutputOp,
1910
+ EpilogueOutputOp::kCount
1911
+ >::Epilogue;
1912
+
1913
+ // Define the kernel
1914
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad<
1915
+ Mma,
1916
+ Epilogue,
1917
+ ThreadblockSwizzle,
1918
+ conv::Operator::kDgrad
1919
+ >;
1920
+
1921
+ };
1922
+
1923
+ } // namespace kernel
1924
+ } // namespace conv
1925
+ } // namespace cutlass
1926
+
1927
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h ADDED
@@ -0,0 +1,2007 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+
32
+ /*! \file
33
+ \brief
34
+ Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped
35
+ matrix multiply-add with the appropriate threadblock-scoped epilogue.
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/cutlass.h"
41
+ #include "cutlass/conv/kernel/default_conv2d.h"
42
+
43
+ #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
44
+ #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
45
+ #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h"
46
+ #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h"
47
+
48
+ #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
49
+ #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
50
+ #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h"
51
+ #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h"
52
+
53
+ /////////////////////////////////////////////////////////////////////////////////////////////////
54
+
55
+ namespace cutlass {
56
+ namespace conv {
57
+ namespace kernel {
58
+
59
+ /////////////////////////////////////////////////////////////////////////////////////////////////
60
+ /// Defines a kernel for Conv2dFprop
61
+ template <
62
+ typename ElementA,
63
+ typename LayoutA,
64
+ typename ElementB,
65
+ typename LayoutB,
66
+ typename ElementC,
67
+ typename LayoutC,
68
+ typename ElementAccumulator,
69
+ typename OperatorClass,
70
+ typename ArchTag,
71
+ typename ThreadblockShape,
72
+ typename WarpShape,
73
+ typename InstructionShape,
74
+ typename EpilogueOutputOp,
75
+ typename ThreadblockSwizzle,
76
+ int Stages,
77
+ typename MathOperatorTag,
78
+ conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
79
+ conv::StrideSupport StrideSupport = StrideSupport::kUnity,
80
+ /// Access granularity of A matrix in units of elements
81
+ int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value,
82
+ /// Access granularity of B matrix in units of elements
83
+ int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value
84
+ > struct DefaultConv2dFprop;
85
+
86
+ /////////////////////////////////////////////////////////////////////////////////////////////////
87
+ // OpClassTensorOp convolutions
88
+ /////////////////////////////////////////////////////////////////////////////////////////////////
89
+
90
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
91
+ /// pipeline.
92
+ template <
93
+ typename ElementA,
94
+ typename LayoutA,
95
+ typename ElementB,
96
+ typename LayoutB,
97
+ typename ElementC,
98
+ typename LayoutC,
99
+ typename ElementAccumulator,
100
+ typename ArchTag,
101
+ typename ThreadblockShape,
102
+ typename WarpShape,
103
+ typename InstructionShape,
104
+ typename EpilogueOutputOp,
105
+ typename ThreadblockSwizzle,
106
+ int Stages,
107
+ typename MathOperatorTag,
108
+ conv::StrideSupport StrideSupport,
109
+ int AlignmentA,
110
+ int AlignmentB
111
+ >
112
+ struct DefaultConv2dFprop <
113
+ ElementA,
114
+ LayoutA,
115
+ ElementB,
116
+ LayoutB,
117
+ ElementC,
118
+ LayoutC,
119
+ ElementAccumulator,
120
+ arch::OpClassTensorOp,
121
+ ArchTag,
122
+ ThreadblockShape,
123
+ WarpShape,
124
+ InstructionShape,
125
+ EpilogueOutputOp,
126
+ ThreadblockSwizzle,
127
+ Stages,
128
+ MathOperatorTag,
129
+ IteratorAlgorithm::kAnalytic,
130
+ StrideSupport,
131
+ AlignmentA,
132
+ AlignmentB
133
+ > {
134
+
135
+ // Define the core components from GEMM
136
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
137
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
138
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
139
+ Stages, MathOperatorTag>;
140
+
141
+ // Define iterators over tiles from the A operand
142
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
143
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
144
+ using IteratorA =
145
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
146
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
147
+ ElementA, LayoutA,
148
+ ThreadMapA,
149
+ AccessTypeA
150
+ >;
151
+
152
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
153
+
154
+ // Define iterators over tiles from the B operand
155
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
156
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
157
+ using IteratorB =
158
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
159
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
160
+ ElementB, LayoutB,
161
+ ThreadMapB,
162
+ AccessTypeB
163
+ >;
164
+
165
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
166
+
167
+ // Warp-level GEMM components
168
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
169
+ using MmaPolicy = typename MmaCore::MmaPolicy;
170
+
171
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
172
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
173
+ ? cutlass::arch::CacheOperation::Global
174
+ : cutlass::arch::CacheOperation::Always;
175
+
176
+ // Define the Mma
177
+ using Mma = threadblock::ImplicitGemmMultistage<
178
+ ThreadblockShape,
179
+ IteratorA,
180
+ SmemIteratorA,
181
+ arch::CacheOperation::Always,
182
+ IteratorB,
183
+ SmemIteratorB,
184
+ CacheOpB,
185
+ MmaPolicy,
186
+ Stages
187
+ >;
188
+
189
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
190
+
191
+ // Define the epilogue
192
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
193
+ ThreadblockShape,
194
+ WarpMmaTensorOp,
195
+ kPartitionsK,
196
+ EpilogueOutputOp,
197
+ EpilogueOutputOp::kCount
198
+ >::Epilogue;
199
+
200
+ // Define the kernel
201
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
202
+ Mma,
203
+ Epilogue,
204
+ ThreadblockSwizzle,
205
+ conv::Operator::kFprop
206
+ >;
207
+ };
208
+
209
+ /////////////////////////////////////////////////////////////////////////////////////////////////
210
+
211
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
212
+ /// pipeline.
213
+ template <
214
+ typename ElementA,
215
+ typename LayoutA,
216
+ typename ElementB,
217
+ typename LayoutB,
218
+ typename ElementC,
219
+ typename LayoutC,
220
+ typename ElementAccumulator,
221
+ typename ArchTag,
222
+ typename ThreadblockShape,
223
+ typename WarpShape,
224
+ typename InstructionShape,
225
+ typename EpilogueOutputOp,
226
+ typename ThreadblockSwizzle,
227
+ int Stages,
228
+ typename MathOperatorTag,
229
+ conv::StrideSupport StrideSupport,
230
+ int AlignmentA,
231
+ int AlignmentB
232
+ >
233
+ struct DefaultConv2dFprop <
234
+ ElementA,
235
+ LayoutA,
236
+ ElementB,
237
+ LayoutB,
238
+ ElementC,
239
+ LayoutC,
240
+ ElementAccumulator,
241
+ arch::OpClassTensorOp,
242
+ ArchTag,
243
+ ThreadblockShape,
244
+ WarpShape,
245
+ InstructionShape,
246
+ EpilogueOutputOp,
247
+ ThreadblockSwizzle,
248
+ Stages,
249
+ MathOperatorTag,
250
+ IteratorAlgorithm::kFixedChannels,
251
+ StrideSupport,
252
+ AlignmentA,
253
+ AlignmentB
254
+ > {
255
+
256
+ // Define the core components from GEMM
257
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
258
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
259
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
260
+ Stages, MathOperatorTag>;
261
+
262
+ // Define iterators over tiles from the A operand
263
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
264
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
265
+ using IteratorA =
266
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
267
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
268
+ ElementA, LayoutA,
269
+ ThreadMapA,
270
+ AccessTypeA
271
+ >;
272
+
273
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
274
+
275
+ // Define iterators over tiles from the B operand
276
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
277
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
278
+ using IteratorB =
279
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
280
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
281
+ ElementB, LayoutB,
282
+ ThreadMapB,
283
+ AccessTypeB
284
+ >;
285
+
286
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
287
+
288
+ // Warp-level GEMM components
289
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
290
+ using MmaPolicy = typename MmaCore::MmaPolicy;
291
+
292
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
293
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
294
+ ? cutlass::arch::CacheOperation::Global
295
+ : cutlass::arch::CacheOperation::Always;
296
+
297
+ // Define the Mma
298
+ using Mma = threadblock::ImplicitGemmMultistage<
299
+ ThreadblockShape,
300
+ IteratorA,
301
+ SmemIteratorA,
302
+ arch::CacheOperation::Always,
303
+ IteratorB,
304
+ SmemIteratorB,
305
+ CacheOpB,
306
+ MmaPolicy,
307
+ Stages
308
+ >;
309
+
310
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
311
+
312
+ // Define the epilogue
313
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
314
+ ThreadblockShape,
315
+ WarpMmaTensorOp,
316
+ kPartitionsK,
317
+ EpilogueOutputOp,
318
+ EpilogueOutputOp::kCount
319
+ >::Epilogue;
320
+
321
+ // Define the kernel
322
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
323
+ Mma,
324
+ Epilogue,
325
+ ThreadblockSwizzle,
326
+ conv::Operator::kFprop
327
+ >;
328
+ };
329
+
330
+ /////////////////////////////////////////////////////////////////////////////////////////////////
331
+
332
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and two stage
333
+ /// pipeline.
334
+ template <
335
+ typename ElementA,
336
+ typename LayoutA,
337
+ typename ElementB,
338
+ typename LayoutB,
339
+ typename ElementC,
340
+ typename LayoutC,
341
+ typename ElementAccumulator,
342
+ typename ArchTag,
343
+ typename ThreadblockShape,
344
+ typename WarpShape,
345
+ typename InstructionShape,
346
+ typename EpilogueOutputOp,
347
+ typename ThreadblockSwizzle,
348
+ typename MathOperatorTag,
349
+ conv::StrideSupport StrideSupport,
350
+ int AlignmentA,
351
+ int AlignmentB
352
+ >
353
+ struct DefaultConv2dFprop <
354
+ ElementA,
355
+ LayoutA,
356
+ ElementB,
357
+ LayoutB,
358
+ ElementC,
359
+ LayoutC,
360
+ ElementAccumulator,
361
+ arch::OpClassTensorOp,
362
+ ArchTag,
363
+ ThreadblockShape,
364
+ WarpShape,
365
+ InstructionShape,
366
+ EpilogueOutputOp,
367
+ ThreadblockSwizzle,
368
+ 2,
369
+ MathOperatorTag,
370
+ IteratorAlgorithm::kFixedChannels,
371
+ StrideSupport,
372
+ AlignmentA,
373
+ AlignmentB
374
+ > {
375
+
376
+ // Define the core components from GEMM
377
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
378
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
379
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
380
+ 2, MathOperatorTag>;
381
+
382
+ // Define iterators over tiles from the A operand
383
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
384
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
385
+ using IteratorA =
386
+ cutlass::conv::threadblock::TileIterator<
387
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
388
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
389
+ ElementA, LayoutA,
390
+ ThreadMapA,
391
+ AccessTypeA
392
+ >
393
+ >;
394
+
395
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
396
+
397
+ // Define iterators over tiles from the B operand
398
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
399
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
400
+ using IteratorB =
401
+ cutlass::conv::threadblock::TileIterator<
402
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
403
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
404
+ ElementB, LayoutB,
405
+ ThreadMapB,
406
+ AccessTypeB
407
+ >
408
+ >;
409
+
410
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
411
+
412
+ // Warp-level GEMM components
413
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
414
+ using MmaPolicy = typename MmaCore::MmaPolicy;
415
+
416
+ // Define the Mma
417
+ using Mma = threadblock::ImplicitGemmPipelined<
418
+ ThreadblockShape,
419
+ IteratorA,
420
+ SmemIteratorA,
421
+ IteratorB,
422
+ SmemIteratorB,
423
+ ElementC,
424
+ LayoutC,
425
+ MmaPolicy
426
+ >;
427
+
428
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
429
+
430
+ // Define the epilogue
431
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
432
+ ThreadblockShape,
433
+ WarpMmaTensorOp,
434
+ kPartitionsK,
435
+ EpilogueOutputOp,
436
+ EpilogueOutputOp::kCount
437
+ >::Epilogue;
438
+
439
+ // Define the kernel
440
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
441
+ Mma,
442
+ Epilogue,
443
+ ThreadblockSwizzle,
444
+ conv::Operator::kFprop
445
+ >;
446
+ };
447
+
448
+ /////////////////////////////////////////////////////////////////////////////////////////////////
449
+
450
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
451
+ /// pipeline.
452
+ template <
453
+ typename ElementA,
454
+ typename LayoutA,
455
+ typename ElementB,
456
+ typename LayoutB,
457
+ typename ElementC,
458
+ typename LayoutC,
459
+ typename ElementAccumulator,
460
+ typename ArchTag,
461
+ typename ThreadblockShape,
462
+ typename WarpShape,
463
+ typename InstructionShape,
464
+ typename EpilogueOutputOp,
465
+ typename ThreadblockSwizzle,
466
+ int Stages,
467
+ typename MathOperatorTag,
468
+ conv::StrideSupport StrideSupport,
469
+ int AlignmentA,
470
+ int AlignmentB
471
+ >
472
+ struct DefaultConv2dFprop <
473
+ ElementA,
474
+ LayoutA,
475
+ ElementB,
476
+ LayoutB,
477
+ ElementC,
478
+ LayoutC,
479
+ ElementAccumulator,
480
+ arch::OpClassTensorOp,
481
+ ArchTag,
482
+ ThreadblockShape,
483
+ WarpShape,
484
+ InstructionShape,
485
+ EpilogueOutputOp,
486
+ ThreadblockSwizzle,
487
+ Stages,
488
+ MathOperatorTag,
489
+ IteratorAlgorithm::kFewChannels,
490
+ StrideSupport,
491
+ AlignmentA,
492
+ AlignmentB
493
+ > {
494
+
495
+ // Define the core components from GEMM
496
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
497
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
498
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
499
+ Stages, MathOperatorTag>;
500
+
501
+ // Define iterators over tiles from the A operand
502
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
503
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
504
+ using IteratorA =
505
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
506
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
507
+ ElementA, LayoutA,
508
+ ThreadMapA,
509
+ AccessTypeA
510
+ >;
511
+
512
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
513
+
514
+ // Define iterators over tiles from the B operand
515
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
516
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
517
+ using IteratorB =
518
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
519
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
520
+ ElementB, LayoutB,
521
+ ThreadMapB,
522
+ AccessTypeB
523
+ >;
524
+
525
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
526
+
527
+ // Warp-level GEMM components
528
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
529
+ using MmaPolicy = typename MmaCore::MmaPolicy;
530
+
531
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
532
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
533
+ ? cutlass::arch::CacheOperation::Global
534
+ : cutlass::arch::CacheOperation::Always;
535
+
536
+ // Define the Mma
537
+ using Mma = threadblock::ImplicitGemmMultistage<
538
+ ThreadblockShape,
539
+ IteratorA,
540
+ SmemIteratorA,
541
+ arch::CacheOperation::Always,
542
+ IteratorB,
543
+ SmemIteratorB,
544
+ CacheOpB,
545
+ MmaPolicy,
546
+ Stages
547
+ >;
548
+
549
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
550
+
551
+ // Define the epilogue
552
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
553
+ ThreadblockShape,
554
+ WarpMmaTensorOp,
555
+ kPartitionsK,
556
+ EpilogueOutputOp,
557
+ EpilogueOutputOp::kCount
558
+ >::Epilogue;
559
+
560
+ // Define the kernel
561
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
562
+ Mma,
563
+ Epilogue,
564
+ ThreadblockSwizzle,
565
+ conv::Operator::kFprop
566
+ >;
567
+ };
568
+
569
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
570
+ /// pipeline.
571
+ template <
572
+ typename ElementA,
573
+ typename LayoutA,
574
+ typename ElementB,
575
+ typename LayoutB,
576
+ typename ElementC,
577
+ typename LayoutC,
578
+ typename ElementAccumulator,
579
+ typename ArchTag,
580
+ typename ThreadblockShape,
581
+ typename WarpShape,
582
+ typename InstructionShape,
583
+ typename EpilogueOutputOp,
584
+ typename ThreadblockSwizzle,
585
+ typename MathOperatorTag,
586
+ conv::StrideSupport StrideSupport,
587
+ int AlignmentA,
588
+ int AlignmentB
589
+ >
590
+ struct DefaultConv2dFprop <
591
+ ElementA,
592
+ LayoutA,
593
+ ElementB,
594
+ LayoutB,
595
+ ElementC,
596
+ LayoutC,
597
+ ElementAccumulator,
598
+ arch::OpClassTensorOp,
599
+ ArchTag,
600
+ ThreadblockShape,
601
+ WarpShape,
602
+ InstructionShape,
603
+ EpilogueOutputOp,
604
+ ThreadblockSwizzle,
605
+ 2,
606
+ MathOperatorTag,
607
+ IteratorAlgorithm::kFewChannels,
608
+ StrideSupport,
609
+ AlignmentA,
610
+ AlignmentB
611
+ > {
612
+
613
+ // Define the core components from GEMM
614
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
615
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
616
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
617
+ 2, MathOperatorTag>;
618
+
619
+ // Define iterators over tiles from the A operand
620
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
621
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
622
+ using IteratorA =
623
+ cutlass::conv::threadblock::TileIterator<
624
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
625
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
626
+ ElementA, LayoutA,
627
+ ThreadMapA,
628
+ AccessTypeA
629
+ >
630
+ >;
631
+
632
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
633
+
634
+ // Define iterators over tiles from the B operand
635
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
636
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
637
+ using IteratorB =
638
+
639
+ cutlass::conv::threadblock::TileIterator<
640
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
641
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
642
+ ElementB, LayoutB,
643
+ ThreadMapB,
644
+ AccessTypeB
645
+ >
646
+ >;
647
+
648
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
649
+
650
+ // Warp-level GEMM components
651
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
652
+ using MmaPolicy = typename MmaCore::MmaPolicy;
653
+
654
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
655
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
656
+ ? cutlass::arch::CacheOperation::Global
657
+ : cutlass::arch::CacheOperation::Always;
658
+
659
+ // Define the Mma
660
+ using Mma = threadblock::ImplicitGemmPipelined<
661
+ ThreadblockShape,
662
+ IteratorA,
663
+ SmemIteratorA,
664
+ IteratorB,
665
+ SmemIteratorB,
666
+ ElementC,
667
+ LayoutC,
668
+ MmaPolicy
669
+ >;
670
+
671
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
672
+
673
+ // Define the epilogue
674
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
675
+ ThreadblockShape,
676
+ WarpMmaTensorOp,
677
+ kPartitionsK,
678
+ EpilogueOutputOp,
679
+ EpilogueOutputOp::kCount
680
+ >::Epilogue;
681
+
682
+ // Define the kernel
683
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
684
+ Mma,
685
+ Epilogue,
686
+ ThreadblockSwizzle,
687
+ conv::Operator::kFprop
688
+ >;
689
+ };
690
+
691
+ /////////////////////////////////////////////////////////////////////////////////////////////////
692
+
693
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
694
+ /// pipeline with interleaved layout.
695
+ template <
696
+ typename ElementA,
697
+ typename ElementB,
698
+ typename ElementC,
699
+ typename LayoutC,
700
+ typename ElementAccumulator,
701
+ typename ArchTag,
702
+ typename ThreadblockShape,
703
+ typename WarpShape,
704
+ typename InstructionShape,
705
+ typename EpilogueOutputOp,
706
+ typename ThreadblockSwizzle,
707
+ int Stages,
708
+ typename MathOperatorTag,
709
+ conv::StrideSupport StrideSupport,
710
+ int AlignmentA,
711
+ int AlignmentB,
712
+ int InterleavedK
713
+ >
714
+ struct DefaultConv2dFprop <
715
+ ElementA,
716
+ layout::TensorNCxHWx<InterleavedK>,
717
+ ElementB,
718
+ layout::TensorCxRSKx<InterleavedK>,
719
+ ElementC,
720
+ LayoutC,
721
+ ElementAccumulator,
722
+ arch::OpClassTensorOp,
723
+ ArchTag,
724
+ ThreadblockShape,
725
+ WarpShape,
726
+ InstructionShape,
727
+ EpilogueOutputOp,
728
+ ThreadblockSwizzle,
729
+ Stages,
730
+ MathOperatorTag,
731
+ IteratorAlgorithm::kAnalytic,
732
+ StrideSupport,
733
+ AlignmentA,
734
+ AlignmentB
735
+ > {
736
+
737
+ // Define the core components from GEMM
738
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
739
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
740
+ ElementB, layout::RowMajorInterleaved<InterleavedK>,
741
+ ElementAccumulator, LayoutC, arch::OpClassTensorOp,
742
+ Stages, MathOperatorTag, true>;
743
+
744
+ // Define iterators over tiles from the A operand
745
+ // Note GEMM shared memory threadmap is used here because conv global memory
746
+ // layout needs to be mapped to fprop which is similar to the crosswise
747
+ // layout which is used by the interleaved GEMM shared memory threadmap.
748
+ // The Interleaved GEMM global memory layout is similar to the congruous
749
+ // layout.
750
+ using ThreadMapA = typename MmaCore::SmemThreadMapA;
751
+ using IteratorA =
752
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
753
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
754
+ ElementA, layout::TensorNCxHWx<InterleavedK>,
755
+ ThreadMapA
756
+ >;
757
+
758
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
759
+
760
+ // Define iterators over tiles from the B operand
761
+ // Note GEMM shared memory threadmap is used here because conv global memory
762
+ // layout needs to be mapped to fprop which is similar to the crosswise
763
+ // layout which is used by the interleaved GEMM shared memory threadmap.
764
+ // The Interleaved GEMM global memory layout is similar to the congruous
765
+ // layout.
766
+ using ThreadMapB = typename MmaCore::SmemThreadMapB;
767
+ using IteratorB =
768
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
769
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
770
+ ElementB, layout::TensorCxRSKx<InterleavedK>,
771
+ ThreadMapB
772
+ >;
773
+
774
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
775
+
776
+ // Warp-level GEMM components
777
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
778
+ using MmaPolicy = typename MmaCore::MmaPolicy;
779
+
780
+ // Define the Mma
781
+ using Mma = threadblock::ImplicitGemmMultistage<
782
+ ThreadblockShape,
783
+ IteratorA,
784
+ SmemIteratorA,
785
+ arch::CacheOperation::Always,
786
+ IteratorB,
787
+ SmemIteratorB,
788
+ arch::CacheOperation::Global,
789
+ MmaPolicy,
790
+ Stages
791
+ >;
792
+
793
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
794
+
795
+ // Define the epilogue
796
+ using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
797
+ ThreadblockShape,
798
+ WarpMmaTensorOp,
799
+ kPartitionsK,
800
+ EpilogueOutputOp,
801
+ EpilogueOutputOp::kCount,
802
+ InterleavedK
803
+ >::Epilogue;
804
+
805
+ // Define the kernel
806
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
807
+ Mma,
808
+ Epilogue,
809
+ ThreadblockSwizzle,
810
+ conv::Operator::kFprop
811
+ >;
812
+ };
813
+
814
+ /////////////////////////////////////////////////////////////////////////////////////////////////
815
+
816
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
817
+ /// and 2 stage pipeline.
818
+ template <
819
+ typename ElementA,
820
+ typename LayoutA,
821
+ typename ElementB,
822
+ typename LayoutB,
823
+ typename ElementC,
824
+ typename LayoutC,
825
+ typename ElementAccumulator,
826
+ typename ArchTag,
827
+ typename ThreadblockShape,
828
+ typename WarpShape,
829
+ typename InstructionShape,
830
+ typename EpilogueOutputOp,
831
+ typename ThreadblockSwizzle,
832
+ typename MathOperatorTag,
833
+ conv::StrideSupport StrideSupport,
834
+ int AlignmentA,
835
+ int AlignmentB
836
+ >
837
+ struct DefaultConv2dFprop <
838
+ ElementA,
839
+ LayoutA,
840
+ ElementB,
841
+ LayoutB,
842
+ ElementC,
843
+ LayoutC,
844
+ ElementAccumulator,
845
+ arch::OpClassTensorOp,
846
+ ArchTag,
847
+ ThreadblockShape,
848
+ WarpShape,
849
+ InstructionShape,
850
+ EpilogueOutputOp,
851
+ ThreadblockSwizzle,
852
+ 2,
853
+ MathOperatorTag,
854
+ IteratorAlgorithm::kAnalytic,
855
+ StrideSupport,
856
+ AlignmentA,
857
+ AlignmentB
858
+ > {
859
+
860
+ // Define the core components from GEMM
861
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
862
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
863
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
864
+ 2, MathOperatorTag>;
865
+
866
+ // Define iterators over tiles from the A operand
867
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
868
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
869
+ using IteratorA =
870
+ cutlass::conv::threadblock::TileIterator<
871
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
872
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
873
+ ElementA, LayoutA,
874
+ ThreadMapA,
875
+ AccessTypeA
876
+ >
877
+ >;
878
+
879
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
880
+
881
+ // Define iterators over tiles from the B operand
882
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
883
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
884
+ using IteratorB =
885
+ cutlass::conv::threadblock::TileIterator<
886
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
887
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
888
+ ElementB, LayoutB,
889
+ ThreadMapB,
890
+ AccessTypeB
891
+ >
892
+ >;
893
+
894
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
895
+
896
+ // Warp-level GEMM components
897
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
898
+ using MmaPolicy = typename MmaCore::MmaPolicy;
899
+
900
+ // Define the Mma
901
+ using Mma = threadblock::ImplicitGemmPipelined<
902
+ ThreadblockShape,
903
+ IteratorA,
904
+ SmemIteratorA,
905
+ IteratorB,
906
+ SmemIteratorB,
907
+ ElementC,
908
+ LayoutC,
909
+ MmaPolicy
910
+ >;
911
+
912
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
913
+
914
+ // Define the epilogue
915
+ using Epilogue = typename detail::DefaultConvEpilogue<
916
+ ArchTag,
917
+ ThreadblockShape,
918
+ WarpMmaTensorOp,
919
+ kPartitionsK,
920
+ EpilogueOutputOp
921
+ >::Epilogue;
922
+
923
+ // Define the kernel
924
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
925
+ Mma,
926
+ Epilogue,
927
+ ThreadblockSwizzle,
928
+ conv::Operator::kFprop
929
+ >;
930
+ };
931
+
932
+ /////////////////////////////////////////////////////////////////////////////////////////////////
933
+
934
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage
935
+ /// pipeline with interleaved layout.
936
+ template <
937
+ typename ElementA,
938
+ typename ElementB,
939
+ typename ElementC,
940
+ typename LayoutC,
941
+ typename ElementAccumulator,
942
+ typename ArchTag,
943
+ typename ThreadblockShape,
944
+ typename WarpShape,
945
+ typename InstructionShape,
946
+ typename EpilogueOutputOp,
947
+ typename ThreadblockSwizzle,
948
+ typename MathOperatorTag,
949
+ conv::StrideSupport StrideSupport,
950
+ int AlignmentA,
951
+ int AlignmentB,
952
+ int InterleavedK
953
+ >
954
+ struct DefaultConv2dFprop <
955
+ ElementA,
956
+ layout::TensorNCxHWx<InterleavedK>,
957
+ ElementB,
958
+ layout::TensorCxRSKx<InterleavedK>,
959
+ ElementC,
960
+ LayoutC,
961
+ ElementAccumulator,
962
+ arch::OpClassTensorOp,
963
+ ArchTag,
964
+ ThreadblockShape,
965
+ WarpShape,
966
+ InstructionShape,
967
+ EpilogueOutputOp,
968
+ ThreadblockSwizzle,
969
+ 2,
970
+ MathOperatorTag,
971
+ IteratorAlgorithm::kAnalytic,
972
+ StrideSupport,
973
+ AlignmentA,
974
+ AlignmentB
975
+ > {
976
+
977
+ // Define the core components from GEMM
978
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
979
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
980
+ ElementB, layout::RowMajorInterleaved<InterleavedK>,
981
+ ElementAccumulator, LayoutC, arch::OpClassTensorOp,
982
+ 2, MathOperatorTag, true>;
983
+
984
+ // Define iterators over tiles from the A operand
985
+ // Note GEMM shared memory threadmap is used here because conv global memory
986
+ // layout needs to be mapped to fprop which is similar to the crosswise
987
+ // layout which is used by the interleaved GEMM shared memory threadmap.
988
+ // The Interleaved GEMM global memory layout is similar to the congruous
989
+ // layout.
990
+ using ThreadMapA = typename MmaCore::SmemThreadMapA;
991
+ using IteratorA =
992
+ cutlass::conv::threadblock::TileIterator<
993
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
994
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
995
+ ElementA, layout::TensorNCxHWx<InterleavedK>,
996
+ ThreadMapA
997
+ >
998
+ >;
999
+
1000
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1001
+
1002
+ // Define iterators over tiles from the B operand
1003
+ // Note GEMM shared memory threadmap is used here because conv global memory
1004
+ // layout needs to be mapped to fprop which is similar to the crosswise
1005
+ // layout which is used by the interleaved GEMM shared memory threadmap.
1006
+ // The Interleaved GEMM global memory layout is similar to the congruous
1007
+ // layout.
1008
+ using ThreadMapB = typename MmaCore::SmemThreadMapB;
1009
+ using IteratorB =
1010
+ cutlass::conv::threadblock::TileIterator<
1011
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
1012
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1013
+ ElementB, layout::TensorCxRSKx<InterleavedK>,
1014
+ ThreadMapB
1015
+ >
1016
+ >;
1017
+
1018
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1019
+
1020
+ // Warp-level GEMM components
1021
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
1022
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1023
+
1024
+ // Define the Mma
1025
+ using Mma = threadblock::ImplicitGemmPipelined<
1026
+ ThreadblockShape,
1027
+ IteratorA,
1028
+ SmemIteratorA,
1029
+ IteratorB,
1030
+ SmemIteratorB,
1031
+ ElementC,
1032
+ LayoutC,
1033
+ MmaPolicy
1034
+ >;
1035
+
1036
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
1037
+
1038
+ // Define the epilogue
1039
+ using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
1040
+ ThreadblockShape,
1041
+ WarpMmaTensorOp,
1042
+ kPartitionsK,
1043
+ EpilogueOutputOp,
1044
+ EpilogueOutputOp::kCount,
1045
+ InterleavedK
1046
+ >::Epilogue;
1047
+
1048
+ // Define the kernel
1049
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1050
+ Mma,
1051
+ Epilogue,
1052
+ ThreadblockSwizzle,
1053
+ conv::Operator::kFprop
1054
+ >;
1055
+ };
1056
+
1057
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1058
+
1059
+ /// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
1060
+ /// multistage pipeline.
1061
+ template <
1062
+ typename ElementA,
1063
+ typename LayoutA,
1064
+ typename ElementB,
1065
+ typename LayoutB,
1066
+ typename ElementC,
1067
+ typename LayoutC,
1068
+ typename ElementAccumulator,
1069
+ typename ArchTag,
1070
+ typename ThreadblockShape,
1071
+ typename WarpShape,
1072
+ typename InstructionShape,
1073
+ typename EpilogueOutputOp,
1074
+ typename ThreadblockSwizzle,
1075
+ int Stages,
1076
+ typename MathOperatorTag,
1077
+ conv::StrideSupport StrideSupport,
1078
+ int AlignmentA,
1079
+ int AlignmentB
1080
+ >
1081
+ struct DefaultConv2dFprop <
1082
+ ElementA,
1083
+ LayoutA,
1084
+ ElementB,
1085
+ LayoutB,
1086
+ ElementC,
1087
+ LayoutC,
1088
+ ElementAccumulator,
1089
+ arch::OpClassTensorOp,
1090
+ ArchTag,
1091
+ ThreadblockShape,
1092
+ WarpShape,
1093
+ InstructionShape,
1094
+ EpilogueOutputOp,
1095
+ ThreadblockSwizzle,
1096
+ Stages,
1097
+ MathOperatorTag,
1098
+ IteratorAlgorithm::kOptimized,
1099
+ StrideSupport,
1100
+ AlignmentA,
1101
+ AlignmentB
1102
+ > {
1103
+
1104
+ // Define the core components from GEMM
1105
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1106
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1107
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
1108
+ Stages, MathOperatorTag
1109
+ >;
1110
+
1111
+ // Define iterators over tiles from the A operand
1112
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1113
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
1114
+ using IteratorA =
1115
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
1116
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1117
+ ElementA,
1118
+ LayoutA,
1119
+ ThreadMapA,
1120
+ AccessTypeA
1121
+ >;
1122
+
1123
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1124
+
1125
+ // Define iterators over tiles from the B operand
1126
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1127
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
1128
+ using IteratorB =
1129
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
1130
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1131
+ ElementB,
1132
+ LayoutB,
1133
+ ThreadMapB,
1134
+ AccessTypeB
1135
+ >;
1136
+
1137
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1138
+
1139
+ // Warp-level GEMM components
1140
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
1141
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1142
+
1143
+ static cutlass::arch::CacheOperation::Kind const CacheOpB =
1144
+ ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
1145
+ ? cutlass::arch::CacheOperation::Global
1146
+ : cutlass::arch::CacheOperation::Always;
1147
+
1148
+ // Define the Mma
1149
+ using Mma = threadblock::ImplicitGemmMultistage<
1150
+ ThreadblockShape,
1151
+ IteratorA,
1152
+ SmemIteratorA,
1153
+ arch::CacheOperation::Always,
1154
+ IteratorB,
1155
+ SmemIteratorB,
1156
+ CacheOpB,
1157
+ MmaPolicy,
1158
+ Stages
1159
+ >;
1160
+
1161
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
1162
+
1163
+ // Define the epilogue
1164
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
1165
+ ThreadblockShape,
1166
+ WarpMmaTensorOp,
1167
+ kPartitionsK,
1168
+ EpilogueOutputOp,
1169
+ EpilogueOutputOp::kCount,
1170
+ false,
1171
+ layout::NoPermute,
1172
+ StrideSupport,
1173
+ 4
1174
+ >::Epilogue;
1175
+
1176
+ // Define the kernel
1177
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1178
+ Mma,
1179
+ Epilogue,
1180
+ ThreadblockSwizzle,
1181
+ conv::Operator::kFprop
1182
+ >;
1183
+ };
1184
+
1185
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1186
+
1187
+ /// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
1188
+ // multistage pipeline with interleaved layout.
1189
+ template <
1190
+ typename ElementA,
1191
+ typename ElementB,
1192
+ typename ElementC,
1193
+ typename LayoutC,
1194
+ typename ElementAccumulator,
1195
+ typename ArchTag,
1196
+ typename ThreadblockShape,
1197
+ typename WarpShape,
1198
+ typename InstructionShape,
1199
+ typename EpilogueOutputOp,
1200
+ typename ThreadblockSwizzle,
1201
+ int Stages,
1202
+ typename MathOperatorTag,
1203
+ conv::StrideSupport StrideSupport,
1204
+ int AlignmentA,
1205
+ int AlignmentB,
1206
+ int InterleavedK
1207
+ >
1208
+ struct DefaultConv2dFprop <
1209
+ ElementA,
1210
+ layout::TensorNCxHWx<InterleavedK>,
1211
+ ElementB,
1212
+ layout::TensorCxRSKx<InterleavedK>,
1213
+ ElementC,
1214
+ LayoutC,
1215
+ ElementAccumulator,
1216
+ arch::OpClassTensorOp,
1217
+ ArchTag,
1218
+ ThreadblockShape,
1219
+ WarpShape,
1220
+ InstructionShape,
1221
+ EpilogueOutputOp,
1222
+ ThreadblockSwizzle,
1223
+ Stages,
1224
+ MathOperatorTag,
1225
+ IteratorAlgorithm::kOptimized,
1226
+ StrideSupport,
1227
+ AlignmentA,
1228
+ AlignmentB
1229
+ > {
1230
+
1231
+ // Define the core components from GEMM
1232
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1233
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
1234
+ ElementB, layout::RowMajorInterleaved<InterleavedK>, ElementAccumulator, LayoutC, arch::OpClassTensorOp,
1235
+ Stages, MathOperatorTag, true
1236
+ >;
1237
+
1238
+ // Define iterators over tiles from the A operand
1239
+ using ThreadMapA = typename MmaCore::SmemThreadMapA;
1240
+ using IteratorA =
1241
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
1242
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1243
+ ElementA,
1244
+ layout::TensorNCxHWx<InterleavedK>,
1245
+ ThreadMapA
1246
+ >;
1247
+
1248
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1249
+
1250
+ // Define iterators over tiles from the B operand
1251
+ using ThreadMapB = typename MmaCore::SmemThreadMapB;
1252
+ using IteratorB =
1253
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
1254
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1255
+ ElementB,
1256
+ layout::TensorCxRSKx<InterleavedK>,
1257
+ ThreadMapB
1258
+ >;
1259
+
1260
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1261
+
1262
+ // Warp-level GEMM components
1263
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
1264
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1265
+
1266
+ // Define the Mma
1267
+ using Mma = threadblock::ImplicitGemmMultistage<
1268
+ ThreadblockShape,
1269
+ IteratorA,
1270
+ SmemIteratorA,
1271
+ arch::CacheOperation::Always,
1272
+ IteratorB,
1273
+ SmemIteratorB,
1274
+ arch::CacheOperation::Global,
1275
+ MmaPolicy,
1276
+ Stages
1277
+ >;
1278
+
1279
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
1280
+
1281
+ // Define the epilogue
1282
+ using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
1283
+ ThreadblockShape,
1284
+ WarpMmaTensorOp,
1285
+ kPartitionsK,
1286
+ EpilogueOutputOp,
1287
+ EpilogueOutputOp::kCount,
1288
+ InterleavedK
1289
+ >::Epilogue;
1290
+
1291
+ // Define the kernel
1292
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1293
+ Mma,
1294
+ Epilogue,
1295
+ ThreadblockSwizzle,
1296
+ conv::Operator::kFprop
1297
+ >;
1298
+ };
1299
+
1300
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1301
+
1302
+ /// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
1303
+ /// and 2 stage pipeline.
1304
+ template <
1305
+ typename ElementA,
1306
+ typename LayoutA,
1307
+ typename ElementB,
1308
+ typename LayoutB,
1309
+ typename ElementC,
1310
+ typename LayoutC,
1311
+ typename ElementAccumulator,
1312
+ typename ArchTag,
1313
+ typename ThreadblockShape,
1314
+ typename WarpShape,
1315
+ typename InstructionShape,
1316
+ typename EpilogueOutputOp,
1317
+ typename ThreadblockSwizzle,
1318
+ typename MathOperatorTag,
1319
+ conv::StrideSupport StrideSupport,
1320
+ int AlignmentA,
1321
+ int AlignmentB
1322
+ >
1323
+ struct DefaultConv2dFprop <
1324
+ ElementA,
1325
+ LayoutA,
1326
+ ElementB,
1327
+ LayoutB,
1328
+ ElementC,
1329
+ LayoutC,
1330
+ ElementAccumulator,
1331
+ arch::OpClassTensorOp,
1332
+ ArchTag,
1333
+ ThreadblockShape,
1334
+ WarpShape,
1335
+ InstructionShape,
1336
+ EpilogueOutputOp,
1337
+ ThreadblockSwizzle,
1338
+ 2,
1339
+ MathOperatorTag,
1340
+ IteratorAlgorithm::kOptimized,
1341
+ StrideSupport,
1342
+ AlignmentA,
1343
+ AlignmentB
1344
+ > {
1345
+
1346
+ // Define the core components from GEMM
1347
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1348
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1349
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
1350
+ 2, MathOperatorTag>;
1351
+
1352
+ // Define iterators over tiles from the A operand
1353
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1354
+ using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
1355
+ using IteratorA =
1356
+ cutlass::conv::threadblock::TileIterator<
1357
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
1358
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1359
+ ElementA,
1360
+ LayoutA,
1361
+ ThreadMapA,
1362
+ AccessTypeA
1363
+ >
1364
+ >;
1365
+
1366
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1367
+
1368
+ // Define iterators over tiles from the B operand
1369
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1370
+ using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
1371
+ using IteratorB =
1372
+ cutlass::conv::threadblock::TileIterator<
1373
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
1374
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1375
+ ElementB,
1376
+ LayoutB,
1377
+ ThreadMapB,
1378
+ AccessTypeB
1379
+ >
1380
+ >;
1381
+
1382
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1383
+
1384
+ // Warp-level GEMM components
1385
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
1386
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1387
+
1388
+ // Define the Mma
1389
+ using Mma = threadblock::ImplicitGemmPipelined<
1390
+ ThreadblockShape,
1391
+ IteratorA,
1392
+ SmemIteratorA,
1393
+ IteratorB,
1394
+ SmemIteratorB,
1395
+ ElementC,
1396
+ LayoutC,
1397
+ MmaPolicy
1398
+ >;
1399
+
1400
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
1401
+
1402
+ // Define the epilogue
1403
+ using Epilogue = typename detail::DefaultConvEpilogue<
1404
+ ArchTag,
1405
+ ThreadblockShape,
1406
+ WarpMmaTensorOp,
1407
+ kPartitionsK,
1408
+ EpilogueOutputOp
1409
+ >::Epilogue;
1410
+
1411
+ // Define the kernel
1412
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1413
+ Mma,
1414
+ Epilogue,
1415
+ ThreadblockSwizzle,
1416
+ conv::Operator::kFprop
1417
+ >;
1418
+ };
1419
+
1420
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1421
+
1422
+ /// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage
1423
+ /// pipeline with interleaved layout.
1424
+ template <
1425
+ typename ElementA,
1426
+ typename ElementB,
1427
+ typename ElementC,
1428
+ typename LayoutC,
1429
+ typename ElementAccumulator,
1430
+ typename ArchTag,
1431
+ typename ThreadblockShape,
1432
+ typename WarpShape,
1433
+ typename InstructionShape,
1434
+ typename EpilogueOutputOp,
1435
+ typename ThreadblockSwizzle,
1436
+ typename MathOperatorTag,
1437
+ conv::StrideSupport StrideSupport,
1438
+ int AlignmentA,
1439
+ int AlignmentB,
1440
+ int InterleavedK
1441
+ >
1442
+ struct DefaultConv2dFprop <
1443
+ ElementA,
1444
+ layout::TensorNCxHWx<InterleavedK>,
1445
+ ElementB,
1446
+ layout::TensorCxRSKx<InterleavedK>,
1447
+ ElementC,
1448
+ LayoutC,
1449
+ ElementAccumulator,
1450
+ arch::OpClassTensorOp,
1451
+ ArchTag,
1452
+ ThreadblockShape,
1453
+ WarpShape,
1454
+ InstructionShape,
1455
+ EpilogueOutputOp,
1456
+ ThreadblockSwizzle,
1457
+ 2,
1458
+ MathOperatorTag,
1459
+ IteratorAlgorithm::kOptimized,
1460
+ StrideSupport,
1461
+ AlignmentA,
1462
+ AlignmentB
1463
+ > {
1464
+
1465
+ // Define the core components from GEMM
1466
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1467
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
1468
+ ElementB, layout::RowMajorInterleaved<InterleavedK>,
1469
+ ElementAccumulator, LayoutC, arch::OpClassTensorOp,
1470
+ 2, MathOperatorTag, true>;
1471
+
1472
+ // Define iterators over tiles from the A operand
1473
+ using ThreadMapA = typename MmaCore::SmemThreadMapA;
1474
+ using IteratorA =
1475
+ cutlass::conv::threadblock::TileIterator<
1476
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
1477
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1478
+ ElementA, layout::TensorNCxHWx<InterleavedK>,
1479
+ ThreadMapA
1480
+ >
1481
+ >;
1482
+
1483
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1484
+
1485
+ // Define iterators over tiles from the B operand
1486
+ using ThreadMapB = typename MmaCore::SmemThreadMapB;
1487
+ using IteratorB =
1488
+ cutlass::conv::threadblock::TileIterator<
1489
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
1490
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1491
+ ElementB, layout::TensorCxRSKx<InterleavedK>,
1492
+ ThreadMapB
1493
+ >
1494
+ >;
1495
+
1496
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1497
+
1498
+ // Warp-level GEMM components
1499
+ using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
1500
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1501
+
1502
+ // Define the Mma
1503
+ using Mma = threadblock::ImplicitGemmPipelined<
1504
+ ThreadblockShape,
1505
+ IteratorA,
1506
+ SmemIteratorA,
1507
+ IteratorB,
1508
+ SmemIteratorB,
1509
+ ElementC,
1510
+ LayoutC,
1511
+ MmaPolicy
1512
+ >;
1513
+
1514
+ static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
1515
+
1516
+ // Define the epilogue
1517
+ using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
1518
+ ThreadblockShape,
1519
+ WarpMmaTensorOp,
1520
+ kPartitionsK,
1521
+ EpilogueOutputOp,
1522
+ EpilogueOutputOp::kCount,
1523
+ InterleavedK
1524
+ >::Epilogue;
1525
+
1526
+ // Define the kernel
1527
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1528
+ Mma,
1529
+ Epilogue,
1530
+ ThreadblockSwizzle,
1531
+ conv::Operator::kFprop
1532
+ >;
1533
+ };
1534
+
1535
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1536
+ // OpClassSimt convolutions
1537
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1538
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm,
1539
+ /// multi-stage pipeline, and FFMA-based mainloop for SM80
1540
+
1541
+ template <
1542
+ typename ElementA,
1543
+ typename LayoutA,
1544
+ typename ElementB,
1545
+ typename LayoutB,
1546
+ typename ElementC,
1547
+ typename LayoutC,
1548
+ typename ElementAccumulator,
1549
+ typename ArchTag,
1550
+ typename ThreadblockShape,
1551
+ typename WarpShape,
1552
+ typename InstructionShape,
1553
+ typename EpilogueOutputOp,
1554
+ typename ThreadblockSwizzle,
1555
+ int Stages,
1556
+ typename MathOperatorTag,
1557
+ conv::StrideSupport StrideSupport,
1558
+ int AlignmentA,
1559
+ int AlignmentB
1560
+ >
1561
+ struct DefaultConv2dFprop <
1562
+ ElementA,
1563
+ LayoutA,
1564
+ ElementB,
1565
+ LayoutB,
1566
+ ElementC,
1567
+ LayoutC,
1568
+ ElementAccumulator,
1569
+ arch::OpClassSimt,
1570
+ ArchTag,
1571
+ ThreadblockShape,
1572
+ WarpShape,
1573
+ InstructionShape,
1574
+ EpilogueOutputOp,
1575
+ ThreadblockSwizzle,
1576
+ Stages,
1577
+ MathOperatorTag,
1578
+ IteratorAlgorithm::kAnalytic,
1579
+ StrideSupport,
1580
+ AlignmentA,
1581
+ AlignmentB
1582
+ > {
1583
+
1584
+ // Define the core components from GEMM
1585
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1586
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1587
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1588
+ Stages, MathOperatorTag>;
1589
+
1590
+ // Define iterators over tiles from the A operand
1591
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1592
+ using IteratorA =
1593
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
1594
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1595
+ ElementA, LayoutA,
1596
+ ThreadMapA
1597
+ >;
1598
+
1599
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1600
+
1601
+ // Define iterators over tiles from the B operand
1602
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1603
+ using IteratorB =
1604
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
1605
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1606
+ ElementB, LayoutB,
1607
+ ThreadMapB
1608
+ >;
1609
+
1610
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1611
+
1612
+ // Warp-level GEMM components
1613
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1614
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1615
+
1616
+ // Define the Mma
1617
+ using Mma = threadblock::ImplicitGemmMultistage<
1618
+ ThreadblockShape,
1619
+ IteratorA,
1620
+ SmemIteratorA,
1621
+ arch::CacheOperation::Always,
1622
+ IteratorB,
1623
+ SmemIteratorB,
1624
+ arch::CacheOperation::Always,
1625
+ MmaPolicy,
1626
+ Stages
1627
+ >;
1628
+
1629
+ // Define the epilogue
1630
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1631
+ ThreadblockShape,
1632
+ WarpMmaSimtOp,
1633
+ EpilogueOutputOp,
1634
+ EpilogueOutputOp::kCount,
1635
+ false,
1636
+ layout::NoPermute,
1637
+ StrideSupport,
1638
+ 4
1639
+ >::Epilogue;
1640
+
1641
+ // Define the kernel
1642
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1643
+ Mma,
1644
+ Epilogue,
1645
+ ThreadblockSwizzle,
1646
+ conv::Operator::kFprop
1647
+ >;
1648
+
1649
+ };
1650
+
1651
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1652
+
1653
+ /// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm,
1654
+ /// multi-stage pipeline, and FFMA-based mainloop for SM80
1655
+
1656
+ template <
1657
+ typename ElementA,
1658
+ typename LayoutA,
1659
+ typename ElementB,
1660
+ typename LayoutB,
1661
+ typename ElementC,
1662
+ typename LayoutC,
1663
+ typename ElementAccumulator,
1664
+ typename ArchTag,
1665
+ typename ThreadblockShape,
1666
+ typename WarpShape,
1667
+ typename InstructionShape,
1668
+ typename EpilogueOutputOp,
1669
+ typename ThreadblockSwizzle,
1670
+ int Stages,
1671
+ typename MathOperatorTag,
1672
+ conv::StrideSupport StrideSupport,
1673
+ int AlignmentA,
1674
+ int AlignmentB
1675
+ >
1676
+ struct DefaultConv2dFprop <
1677
+ ElementA,
1678
+ LayoutA,
1679
+ ElementB,
1680
+ LayoutB,
1681
+ ElementC,
1682
+ LayoutC,
1683
+ ElementAccumulator,
1684
+ arch::OpClassSimt,
1685
+ ArchTag,
1686
+ ThreadblockShape,
1687
+ WarpShape,
1688
+ InstructionShape,
1689
+ EpilogueOutputOp,
1690
+ ThreadblockSwizzle,
1691
+ Stages,
1692
+ MathOperatorTag,
1693
+ IteratorAlgorithm::kOptimized,
1694
+ StrideSupport,
1695
+ AlignmentA,
1696
+ AlignmentB
1697
+ > {
1698
+
1699
+ // Define the core components from GEMM
1700
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1701
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1702
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1703
+ Stages, MathOperatorTag>;
1704
+
1705
+ // Define iterators over tiles from the A operand
1706
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1707
+ using IteratorA =
1708
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
1709
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1710
+ ElementA,
1711
+ LayoutA,
1712
+ ThreadMapA
1713
+ >;
1714
+
1715
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1716
+
1717
+ // Define iterators over tiles from the B operand
1718
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1719
+ using IteratorB =
1720
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
1721
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1722
+ ElementB,
1723
+ LayoutB,
1724
+ ThreadMapB
1725
+ >;
1726
+
1727
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1728
+
1729
+ // Warp-level GEMM components
1730
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1731
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1732
+
1733
+ // Define the Mma
1734
+ using Mma = threadblock::ImplicitGemmMultistage<
1735
+ ThreadblockShape,
1736
+ IteratorA,
1737
+ SmemIteratorA,
1738
+ arch::CacheOperation::Always,
1739
+ IteratorB,
1740
+ SmemIteratorB,
1741
+ arch::CacheOperation::Always,
1742
+ MmaPolicy,
1743
+ Stages
1744
+ >;
1745
+
1746
+ // Define the epilogue
1747
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1748
+ ThreadblockShape,
1749
+ WarpMmaSimtOp,
1750
+ EpilogueOutputOp,
1751
+ EpilogueOutputOp::kCount,
1752
+ false,
1753
+ layout::NoPermute,
1754
+ StrideSupport,
1755
+ 4
1756
+ >::Epilogue;
1757
+
1758
+ // Define the kernel
1759
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1760
+ Mma,
1761
+ Epilogue,
1762
+ ThreadblockSwizzle,
1763
+ conv::Operator::kFprop
1764
+ >;
1765
+ };
1766
+
1767
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1768
+
1769
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm,
1770
+ /// 2 stage pipeline, and FFMA-based mainloop for SM50
1771
+ template <
1772
+ typename ElementA,
1773
+ typename LayoutA,
1774
+ typename ElementB,
1775
+ typename LayoutB,
1776
+ typename ElementC,
1777
+ typename LayoutC,
1778
+ typename ElementAccumulator,
1779
+ typename ArchTag,
1780
+ typename ThreadblockShape,
1781
+ typename WarpShape,
1782
+ typename InstructionShape,
1783
+ typename EpilogueOutputOp,
1784
+ typename ThreadblockSwizzle,
1785
+ typename MathOperatorTag,
1786
+ conv::StrideSupport StrideSupport,
1787
+ int AlignmentA,
1788
+ int AlignmentB
1789
+ >
1790
+ struct DefaultConv2dFprop <
1791
+ ElementA,
1792
+ LayoutA,
1793
+ ElementB,
1794
+ LayoutB,
1795
+ ElementC,
1796
+ LayoutC,
1797
+ ElementAccumulator,
1798
+ arch::OpClassSimt,
1799
+ ArchTag,
1800
+ ThreadblockShape,
1801
+ WarpShape,
1802
+ InstructionShape,
1803
+ EpilogueOutputOp,
1804
+ ThreadblockSwizzle,
1805
+ 2,
1806
+ MathOperatorTag,
1807
+ IteratorAlgorithm::kAnalytic,
1808
+ StrideSupport,
1809
+ AlignmentA,
1810
+ AlignmentB
1811
+ > {
1812
+
1813
+ // Define the core components from GEMM
1814
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1815
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1816
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1817
+ 2, MathOperatorTag>;
1818
+
1819
+ // Define iterators over tiles from the A operand
1820
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1821
+ using IteratorA =
1822
+ cutlass::conv::threadblock::TileIterator<
1823
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
1824
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1825
+ ElementA, LayoutA,
1826
+ ThreadMapA
1827
+ >
1828
+ >;
1829
+
1830
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1831
+
1832
+ // Define iterators over tiles from the B operand
1833
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1834
+ using IteratorB =
1835
+ cutlass::conv::threadblock::TileIterator<
1836
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
1837
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1838
+ ElementB, LayoutB,
1839
+ ThreadMapB
1840
+ >
1841
+ >;
1842
+
1843
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1844
+
1845
+ // Warp-level GEMM components
1846
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1847
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1848
+
1849
+ // Define the Mma
1850
+ using Mma = threadblock::ImplicitGemmPipelined<
1851
+ ThreadblockShape,
1852
+ IteratorA,
1853
+ SmemIteratorA,
1854
+ IteratorB,
1855
+ SmemIteratorB,
1856
+ ElementC,
1857
+ LayoutC,
1858
+ MmaPolicy
1859
+ >;
1860
+
1861
+ // Define the epilogue
1862
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1863
+ ThreadblockShape,
1864
+ WarpMmaSimtOp,
1865
+ EpilogueOutputOp,
1866
+ EpilogueOutputOp::kCount,
1867
+ false,
1868
+ layout::NoPermute,
1869
+ StrideSupport,
1870
+ 4
1871
+ >::Epilogue;
1872
+
1873
+ // Define the kernel
1874
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1875
+ Mma,
1876
+ Epilogue,
1877
+ ThreadblockSwizzle,
1878
+ conv::Operator::kFprop
1879
+ >;
1880
+
1881
+ };
1882
+
1883
+ /////////////////////////////////////////////////////////////////////////////////////////////////
1884
+
1885
+ /// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm,
1886
+ /// 2 stage pipeline, and FFMA-based mainloop for SM50
1887
+ template <
1888
+ typename ElementA,
1889
+ typename LayoutA,
1890
+ typename ElementB,
1891
+ typename LayoutB,
1892
+ typename ElementC,
1893
+ typename LayoutC,
1894
+ typename ElementAccumulator,
1895
+ typename ArchTag,
1896
+ typename ThreadblockShape,
1897
+ typename WarpShape,
1898
+ typename InstructionShape,
1899
+ typename EpilogueOutputOp,
1900
+ typename ThreadblockSwizzle,
1901
+ typename MathOperatorTag,
1902
+ conv::StrideSupport StrideSupport,
1903
+ int AlignmentA,
1904
+ int AlignmentB
1905
+ >
1906
+ struct DefaultConv2dFprop <
1907
+ ElementA,
1908
+ LayoutA,
1909
+ ElementB,
1910
+ LayoutB,
1911
+ ElementC,
1912
+ LayoutC,
1913
+ ElementAccumulator,
1914
+ arch::OpClassSimt,
1915
+ ArchTag,
1916
+ ThreadblockShape,
1917
+ WarpShape,
1918
+ InstructionShape,
1919
+ EpilogueOutputOp,
1920
+ ThreadblockSwizzle,
1921
+ 2,
1922
+ MathOperatorTag,
1923
+ IteratorAlgorithm::kOptimized,
1924
+ StrideSupport,
1925
+ AlignmentA,
1926
+ AlignmentB
1927
+ > {
1928
+
1929
+ // Define the core components from GEMM
1930
+ using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
1931
+ ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
1932
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
1933
+ 2, MathOperatorTag>;
1934
+
1935
+ // Define iterators over tiles from the A operand
1936
+ using ThreadMapA = typename MmaCore::IteratorThreadMapA;
1937
+ using IteratorA =
1938
+ cutlass::conv::threadblock::TileIterator<
1939
+ cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
1940
+ cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
1941
+ ElementA,
1942
+ LayoutA,
1943
+ ThreadMapA
1944
+ >
1945
+ >;
1946
+
1947
+ using SmemIteratorA = typename MmaCore::SmemIteratorA;
1948
+
1949
+ // Define iterators over tiles from the B operand
1950
+ using ThreadMapB = typename MmaCore::IteratorThreadMapB;
1951
+ using IteratorB =
1952
+ cutlass::conv::threadblock::TileIterator<
1953
+ cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
1954
+ cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
1955
+ ElementB,
1956
+ LayoutB,
1957
+ ThreadMapB
1958
+ >
1959
+ >;
1960
+
1961
+ using SmemIteratorB = typename MmaCore::SmemIteratorB;
1962
+
1963
+ // Warp-level GEMM components
1964
+ using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt;
1965
+ using MmaPolicy = typename MmaCore::MmaPolicy;
1966
+
1967
+ // Define the Mma
1968
+ using Mma = threadblock::ImplicitGemmPipelined<
1969
+ ThreadblockShape,
1970
+ IteratorA,
1971
+ SmemIteratorA,
1972
+ IteratorB,
1973
+ SmemIteratorB,
1974
+ ElementC,
1975
+ LayoutC,
1976
+ MmaPolicy
1977
+ >;
1978
+
1979
+ // Define the epilogue
1980
+ using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt<
1981
+ ThreadblockShape,
1982
+ WarpMmaSimtOp,
1983
+ EpilogueOutputOp,
1984
+ EpilogueOutputOp::kCount,
1985
+ false,
1986
+ layout::NoPermute,
1987
+ StrideSupport,
1988
+ 4
1989
+ >::Epilogue;
1990
+
1991
+ // Define the kernel
1992
+ using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
1993
+ Mma,
1994
+ Epilogue,
1995
+ ThreadblockSwizzle,
1996
+ conv::Operator::kFprop
1997
+ >;
1998
+
1999
+ };
2000
+
2001
+ /////////////////////////////////////////////////////////////////////////////////////////////////
2002
+
2003
+ } // namespace kernel
2004
+ } // namespace conv
2005
+ } // namespace cutlass
2006
+
2007
+ /////////////////////////////////////////////////////////////////////////////////////////////////
build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***************************************************************************************************
2
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ * SPDX-License-Identifier: BSD-3-Clause
4
+ *
5
+ * Redistribution and use in source and binary forms, with or without
6
+ * modification, are permitted provided that the following conditions are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright notice, this
9
+ * list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ * this list of conditions and the following disclaimer in the documentation
13
+ * and/or other materials provided with the distribution.
14
+ *
15
+ * 3. Neither the name of the copyright holder nor the names of its
16
+ * contributors may be used to endorse or promote products derived from
17
+ * this software without specific prior written permission.
18
+ *
19
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ *
30
+ **************************************************************************************************/
31
+ /*! \file
32
+ \brief
33
+ Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution
34
+ definitions that combine threadblock-scoped matrix multiply-add with the
35
+ appropriate threadblock-scoped epilogue.
36
+ */
37
+
38
+ #pragma once
39
+
40
+ #include "cutlass/cutlass.h"
41
+ #include "cutlass/conv/kernel/default_conv2d.h"
42
+
43
+ #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
44
+ #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
45
+ #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
46
+ #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
47
+ #include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h"
48
+ #include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h"
49
+ #include "cutlass/gemm/warp/scale_bias_tile_iterator.h"
50
+
51
+ /////////////////////////////////////////////////////////////////////////////////////////////////
52
+
53
+ namespace cutlass {
54
+ namespace conv {
55
+ namespace kernel {
56
+
57
+ /////////////////////////////////////////////////////////////////////////////////////////////////
58
+ /// Defines a kernel for fused batch norm and Conv2dFprop
59
/// Primary template. Left undefined on purpose: only the partial
/// specializations (selected on OperatorClass / IteratorAlgorithm) provide a
/// definition, so instantiating an unsupported combination fails at compile
/// time. The specializations visible in this header target
/// arch::OpClassTensorOp with the Analytic and Optimized iterator algorithms.
template <
  /// Element type of the activation (A) operand
  typename ElementA,
  /// Layout of the activation operand
  typename LayoutA,
  /// Element type of the filter (B) operand
  typename ElementB,
  /// Layout of the filter operand
  typename LayoutB,
  /// Element type of the fused scale/bias vectors
  typename ElementScaleBias,
  /// Layout of the scale/bias vectors
  typename LayoutScaleBias,
  /// Element type of the output (C) tensor
  typename ElementC,
  /// Layout of the output tensor
  typename LayoutC,
  /// Element type used for internal accumulation
  typename ElementAccumulator,
  /// Operator class (e.g. arch::OpClassTensorOp)
  typename OperatorClass,
  /// SM architecture tag
  typename ArchTag,
  /// Threadblock-level tile shape
  typename ThreadblockShape,
  /// Warp-level tile shape
  typename WarpShape,
  /// Instruction-level tile shape
  typename InstructionShape,
  /// Elementwise epilogue output operator
  typename EpilogueOutputOp,
  /// Threadblock swizzling function
  typename ThreadblockSwizzle,
  /// Number of mainloop pipeline stages
  int Stages,
  /// Math operator tag forwarded to the MMA core
  typename MathOperatorTag,
  /// Global-memory tile iterator algorithm (defaults to the optimized path)
  conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized,
  /// Stride support policy (defaults to unit stride)
  conv::StrideSupport StrideSupport = StrideSupport::kUnity
> struct DefaultConv2dFpropFusion;
81
+
82
+ /////////////////////////////////////////////////////////////////////////////////////////////////
83
+ // OpClassTensorOp convolutions
84
+ /////////////////////////////////////////////////////////////////////////////////////////////////
85
+
86
+ /// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
87
+ /// pipeline.
88
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementScaleBias,
  typename LayoutScaleBias,
  typename ElementC,
  typename LayoutC,
  typename ElementAccumulator,
  typename ArchTag,
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename EpilogueOutputOp,
  typename ThreadblockSwizzle,
  int Stages,
  typename MathOperatorTag
>
struct DefaultConv2dFpropFusion <
  ElementA,
  LayoutA,
  ElementB,
  LayoutB,
  ElementScaleBias,
  LayoutScaleBias,
  ElementC,
  LayoutC,
  ElementAccumulator,
  arch::OpClassTensorOp,
  ArchTag,
  ThreadblockShape,
  WarpShape,
  InstructionShape,
  EpilogueOutputOp,
  ThreadblockSwizzle,
  Stages,
  MathOperatorTag,
  IteratorAlgorithm::kAnalytic
> {

  // Define the core components from GEMM. Implicit GEMM maps Fprop onto a
  // RowMajor(A) x ColumnMajor(B) problem; the convolution tensor layouts
  // (LayoutA/LayoutB) are handled by the conv tile-access iterators below.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
      Stages, MathOperatorTag>;

  // Define iterators over tiles from the A operand (activations),
  // using the Analytic (fully general, coordinate-computing) algorithm
  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
  using IteratorA =
    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
      ElementA, LayoutA,
      ThreadMapA
    >;

  using SmemIteratorA = typename MmaCore::SmemIteratorA;

  // Define iterators over tiles from the B operand (filters)
  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
  using IteratorB =
    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
      ElementB, LayoutB,
      ThreadMapB
    >;

  using SmemIteratorB = typename MmaCore::SmemIteratorB;

  /// Define iterators over tiles from scale/bias vectors: a single row of
  /// length ThreadblockShape::kK, i.e. one scale/bias pair per K-slice
  using IteratorScaleBias =
      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
          LayoutScaleBias>;

  // Shared-memory counterpart of IteratorScaleBias
  using SmemIteratorScaleBias =
      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
          LayoutScaleBias>;

  // Warp-level GEMM components
  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
  using MmaPolicy = typename MmaCore::MmaPolicy;

  // Threads cooperating in the warp-level scale/bias load (one warp)
  static int const kThreadCount = 32;

  // Warp-level iterator to load scale and bias vectors; reuses the policy of
  // the warp MMA's A-operand iterator so fragments line up with the MMA
  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
      MmaCore::WarpCount::kK>;

  // Define the Mma: multistage fused mainloop that stages A, B and the
  // scale/bias vectors through shared memory with per-operand cache policies
  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
    ThreadblockShape,
    IteratorA,
    SmemIteratorA,
    arch::CacheOperation::Always,   // cache policy for A loads
    IteratorB,
    SmemIteratorB,
    arch::CacheOperation::Global,   // cache policy for B loads
    IteratorScaleBias,
    SmemIteratorScaleBias,
    arch::CacheOperation::Always,   // cache policy for scale/bias loads
    MmaPolicy,
    WarpIteratorScaleBias,
    Stages
  >;

  // Define the epilogue
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
    ThreadblockShape,
    WarpMmaTensorOp,
    1,                       // presumably PartitionsK — confirm against DefaultEpilogueTensorOp
    EpilogueOutputOp,
    EpilogueOutputOp::kCount
  >::Epilogue;

  // Define the kernel: fused implicit-GEMM convolution (Fprop direction)
  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
    Mma,
    Epilogue,
    ThreadblockSwizzle,
    conv::Operator::kFprop
  >;
};
215
+
216
+ /////////////////////////////////////////////////////////////////////////////////////////////////
217
+
218
+ /// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and
219
+ /// multistage pipeline.
220
template <
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementScaleBias,
  typename LayoutScaleBias,
  typename ElementC,
  typename LayoutC,
  typename ElementAccumulator,
  typename ArchTag,
  typename ThreadblockShape,
  typename WarpShape,
  typename InstructionShape,
  typename EpilogueOutputOp,
  typename ThreadblockSwizzle,
  int Stages,
  typename MathOperatorTag
>
struct DefaultConv2dFpropFusion <
  ElementA,
  LayoutA,
  ElementB,
  LayoutB,
  ElementScaleBias,
  LayoutScaleBias,
  ElementC,
  LayoutC,
  ElementAccumulator,
  arch::OpClassTensorOp,
  ArchTag,
  ThreadblockShape,
  WarpShape,
  InstructionShape,
  EpilogueOutputOp,
  ThreadblockSwizzle,
  Stages,
  MathOperatorTag,
  IteratorAlgorithm::kOptimized
> {

  // Define the core components from GEMM. Implicit GEMM maps Fprop onto a
  // RowMajor(A) x ColumnMajor(B) problem; the convolution tensor layouts
  // (LayoutA/LayoutB) are handled by the conv tile-access iterators below.
  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
    Stages, MathOperatorTag
  >;

  // Define iterators over tiles from the A operand (activations),
  // using the Optimized (precomputed-delta) iterator algorithm
  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
  using IteratorA =
    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
      ElementA,
      LayoutA,
      ThreadMapA
    >;

  using SmemIteratorA = typename MmaCore::SmemIteratorA;

  // Define iterators over tiles from the B operand (filters)
  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
  using IteratorB =
    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
      ElementB,
      LayoutB,
      ThreadMapB
    >;

  using SmemIteratorB = typename MmaCore::SmemIteratorB;

  /// Define iterators over tiles from scale/bias vectors: a single row of
  /// length ThreadblockShape::kK, i.e. one scale/bias pair per K-slice
  using IteratorScaleBias =
      cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator<
          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
          LayoutScaleBias>;

  // Shared-memory counterpart of IteratorScaleBias
  using SmemIteratorScaleBias =
      cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator<
          cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias,
          LayoutScaleBias>;

  // Warp-level GEMM components
  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
  using MmaPolicy = typename MmaCore::MmaPolicy;

  // Threads cooperating in the warp-level scale/bias load (one warp)
  static int const kThreadCount = 32;

  // Warp-level iterator to load scale and bias vectors; reuses the policy of
  // the warp MMA's A-operand iterator so fragments line up with the MMA
  using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator<
      MatrixShape<WarpShape::kM, WarpShape::kK>, ElementScaleBias,
      LayoutScaleBias, MatrixShape<InstructionShape::kM, InstructionShape::kK>,
      typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount,
      MmaCore::WarpCount::kK>;

  // Define the Mma: multistage fused mainloop that stages A, B and the
  // scale/bias vectors through shared memory with per-operand cache policies
  using Mma = threadblock::ImplicitGemmFpropFusionMultistage<
    ThreadblockShape,
    IteratorA,
    SmemIteratorA,
    arch::CacheOperation::Always,   // cache policy for A loads
    IteratorB,
    SmemIteratorB,
    arch::CacheOperation::Global,   // cache policy for B loads
    IteratorScaleBias,
    SmemIteratorScaleBias,
    arch::CacheOperation::Always,   // cache policy for scale/bias loads
    MmaPolicy,
    WarpIteratorScaleBias,
    Stages
  >;

  // Define the epilogue
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
    ThreadblockShape,
    WarpMmaTensorOp,
    1,                       // presumably PartitionsK — confirm against DefaultEpilogueTensorOp
    EpilogueOutputOp,
    EpilogueOutputOp::kCount
  >::Epilogue;

  // Define the kernel: fused implicit-GEMM convolution (Fprop direction)
  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion<
    Mma,
    Epilogue,
    ThreadblockSwizzle,
    conv::Operator::kFprop
  >;
};
350
+
351
+ /////////////////////////////////////////////////////////////////////////////////////////////////
352
+
353
+ } // namespace kernel
354
+ } // namespace conv
355
+ } // namespace cutlass
356
+
357
+ /////////////////////////////////////////////////////////////////////////////////////////////////